annotate bismark @ 2:82814a8a2395 draft

added samtools 0.1.19 as dependency
author bgruening
date Wed, 21 Aug 2013 05:19:54 -0400
parents 62c6da72dd4a
children 91f07ff056ca
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1 #!/usr/bin/perl --
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2 use strict;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3 use warnings;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
4 use IO::Handle;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5 use Cwd;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6 $|++;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7 use Getopt::Long;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
8
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
9
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
10 ## This program is Copyright (C) 2010-13, Felix Krueger (felix.krueger@babraham.ac.uk)
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
11
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
12 ## This program is free software: you can redistribute it and/or modify
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
13 ## it under the terms of the GNU General Public License as published by
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
14 ## the Free Software Foundation, either version 3 of the License, or
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
15 ## (at your option) any later version.
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
16
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
17 ## This program is distributed in the hope that it will be useful,
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
18 ## but WITHOUT ANY WARRANTY; without even the implied warranty of
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
19 ## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
20 ## GNU General Public License for more details.
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
21
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
22 ## You should have received a copy of the GNU General Public License
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
23 ## along with this program. If not, see <http://www.gnu.org/licenses/>.
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
24
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
25
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
### File-scope state shared by the rest of the script.

# Directory the script was launched from; the main loop changes back to it
# before handling each input file.
my $parent_dir      = getcwd();
my $bismark_version = 'v0.7.12';

# Snapshot of the arguments exactly as the user typed them (taken before the
# option rewrite below).
my $command_line = join ' ', @ARGV;

### '--solexa1.3-quals' contains a '.', which makes Getopt::Long fail, so it is
### swapped for the equivalent '--phred64-quals' before any option processing.
### The loop variable aliases the elements of @ARGV, so the assignment edits
### @ARGV in place.
for my $arg (@ARGV) {
    $arg = '--phred64-quals' if $arg eq '--solexa1.3-quals';
}

my @filenames;    # will be populated by processing the command line

my (
    $genome_folder,        $CT_index_basename, $GA_index_basename,
    $path_to_bowtie,       $sequence_file_format,
    $bowtie_options,       $directional,       $unmapped,
    $ambiguous,            $phred64,           $solexa,
    $output_dir,           $bowtie2,           $vanilla,
    $sam_no_hd,            $skip,              $upto,
    $temp_dir,             $non_bs_mm,
    $insertion_open,       $insertion_extend,
    $deletion_open,        $deletion_extend,
    $gzip,                 $bam,               $samtools_path,
    $pbat,
) = process_command_line();

# Alignment process names, bisulfite index location, bowtie filehandles and
# the number of times sequences produced an alignment
my @fhs;

# Chromosome sequences of the genome
my %chromosomes;

# Counters for various events
my %counting;

# Reset to 0 by the main loop before every input file
my $seqID_contains_tabs;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
45
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
### MAIN LOOP
###
### Handles every entry of @filenames in turn. An entry that contains a comma
### is treated as a pair of mate files (paired-end); anything else is a single
### file (single-end). For each entry the loop
###   1. changes back to the start directory and resets the per-file state,
###   2. calls the biTransform* routine matching the input format and library
###      type and records the file names it returns in @fhs,
###   3. calls the matching *_align_fragments_to_bisulfite_genome_* routine
###      (Bowtie 1 or Bowtie 2 flavour),
###   4. hands over to the methylation call procedure.
###
### Reads the file-scope option variables ($sequence_file_format, $directional,
### $pbat, $bowtie2, $gzip) and writes to @fhs and $seqID_contains_tabs.
foreach my $filename (@filenames) {

    chdir $parent_dir or die "Unable to move to initial working directory $!\n";

    ### resetting the counting hash and fhs
    reset_counters_and_fhs($filename);
    $seqID_contains_tabs = 0;

    ### PAIRED-END ALIGNMENTS
    if ($filename =~ /,/) {
        my ($C_to_T_infile_1, $G_to_A_infile_1);    # to be made from mate1 file

        $fhs[0]->{name} = 'CTread1GAread2CTgenome';
        $fhs[1]->{name} = 'GAread1CTread2GAgenome';
        $fhs[2]->{name} = 'GAread1CTread2CTgenome';
        $fhs[3]->{name} = 'CTread1GAread2GAgenome';

        warn "\nPaired-end alignments will be performed\n", '=' x 39, "\n\n";

        my ($filename_1, $filename_2) = split /,/, $filename;
        warn "The provided filenames for paired-end alignments are $filename_1 and $filename_2\n";

        ### additional variables only for paired-end alignments
        my ($C_to_T_infile_2, $G_to_A_infile_2);    # to be made from mate2 file

        # Stores the read 1 / read 2 input files for the given @fhs slots.
        # Slots 0 and 3 are the 'CTread1GAread2' instances and slots 1 and 2
        # the 'GAread1CTread2' instances (see the names assigned above), so
        # they are always filled pairwise. Passing undef records that a slot
        # has no input file for that read.
        my $set_inputs = sub {
            my ($indices, $read_1_file, $read_2_file) = @_;
            for my $index (@{$indices}) {
                $fhs[$index]->{inputfile_1} = $read_1_file;
                $fhs[$index]->{inputfile_2} = $read_2_file;
            }
            return;
        };

        ### FastA format
        if ($sequence_file_format eq 'FASTA') {
            warn "Input files are in FastA format\n";

            if ($directional) {
                # only the first returned file name is kept for each mate
                ($C_to_T_infile_1) = biTransformFastAFiles_paired_end($filename_1, 1);    # also passing the read number
                ($G_to_A_infile_2) = biTransformFastAFiles_paired_end($filename_2, 2);

                $set_inputs->([0, 3], $C_to_T_infile_1, $G_to_A_infile_2);
                $set_inputs->([1, 2], undef,            undef);
            }
            else {
                ($C_to_T_infile_1, $G_to_A_infile_1) = biTransformFastAFiles_paired_end($filename_1, 1);    # also passing the read number
                ($C_to_T_infile_2, $G_to_A_infile_2) = biTransformFastAFiles_paired_end($filename_2, 2);

                $set_inputs->([0, 3], $C_to_T_infile_1, $G_to_A_infile_2);
                $set_inputs->([1, 2], $G_to_A_infile_1, $C_to_T_infile_2);
            }

            if ($bowtie2) {
                paired_end_align_fragments_to_bisulfite_genome_fastA_bowtie2($C_to_T_infile_1, $G_to_A_infile_1, $C_to_T_infile_2, $G_to_A_infile_2);
            }
            else {
                paired_end_align_fragments_to_bisulfite_genome_fastA($C_to_T_infile_1, $G_to_A_infile_1, $C_to_T_infile_2, $G_to_A_infile_2);
            }
        }

        ### FastQ format
        else {
            warn "Input files are in FastQ format\n";

            if ($directional) {
                if (!$bowtie2 && $gzip) {
                    # Bowtie 1 with gzip: a single call handles both mates
                    ($C_to_T_infile_1) = biTransformFastQFiles_paired_end_bowtie1_gzip($filename_1, $filename_2);    # passing both reads at the same time

                    # this file contains both read 1 and read 2 in tab delimited
                    # format, so a separate read 2 file is no longer needed
                    $set_inputs->([0, 3], $C_to_T_infile_1, undef);
                    $set_inputs->([1, 2], undef,            undef);
                }
                else {
                    # Bowtie 2, or Bowtie 1 without gzip: one file per mate
                    ($C_to_T_infile_1) = biTransformFastQFiles_paired_end($filename_1, 1);    # also passing the read number
                    ($G_to_A_infile_2) = biTransformFastQFiles_paired_end($filename_2, 2);

                    $set_inputs->([0, 3], $C_to_T_infile_1, $G_to_A_infile_2);
                    $set_inputs->([1, 2], undef,            undef);
                }
            }
            elsif ($pbat) {    # PBAT-Seq
                ### At the moment we are only performing uncompressed FastQ alignments with Bowtie1
                ($C_to_T_infile_1, $G_to_A_infile_1) = biTransformFastQFiles_paired_end($filename_1, 1);    # also passing the read number
                ($C_to_T_infile_2, $G_to_A_infile_2) = biTransformFastQFiles_paired_end($filename_2, 2);

                # mirror image of the directional case: only slots 1 and 2 get input files
                $set_inputs->([0, 3], undef,            undef);
                $set_inputs->([1, 2], $G_to_A_infile_1, $C_to_T_infile_2);
            }
            else {
                if (!$bowtie2 && $gzip) {
                    # Bowtie 1 with gzip: a single call handles both mates
                    ($C_to_T_infile_1, $G_to_A_infile_1) = biTransformFastQFiles_paired_end_bowtie1_gzip($filename_1, $filename_2);    # passing both reads at the same time

                    # second input file not needed for compressed temp files
                    $set_inputs->([0, 3], $C_to_T_infile_1, undef);
                    $set_inputs->([1, 2], $G_to_A_infile_1, undef);
                }
                else {
                    # Bowtie 2, or Bowtie 1 with uncompressed temp files
                    ($C_to_T_infile_1, $G_to_A_infile_1) = biTransformFastQFiles_paired_end($filename_1, 1);    # also passing the read number
                    ($C_to_T_infile_2, $G_to_A_infile_2) = biTransformFastQFiles_paired_end($filename_2, 2);

                    $set_inputs->([0, 3], $C_to_T_infile_1, $G_to_A_infile_2);
                    $set_inputs->([1, 2], $G_to_A_infile_1, $C_to_T_infile_2);
                }
            }

            if ($bowtie2) {
                paired_end_align_fragments_to_bisulfite_genome_fastQ_bowtie2($C_to_T_infile_1, $G_to_A_infile_1, $C_to_T_infile_2, $G_to_A_infile_2);
            }
            else {
                paired_end_align_fragments_to_bisulfite_genome_fastQ($C_to_T_infile_1, $G_to_A_infile_1, $C_to_T_infile_2, $G_to_A_infile_2);
            }
        }

        start_methylation_call_procedure_paired_ends($filename_1, $filename_2, $C_to_T_infile_1, $G_to_A_infile_1, $C_to_T_infile_2, $G_to_A_infile_2);
    }

    ### Else we are performing SINGLE-END ALIGNMENTS
    else {
        warn "\nSingle-end alignments will be performed\n", '=' x 39, "\n\n";

        ### Initialising bisulfite conversion filenames
        my ($C_to_T_infile, $G_to_A_infile);

        ### FastA format
        if ($sequence_file_format eq 'FASTA') {
            warn "Input file is in FastA format\n";    # message previously read "Inut file"

            if ($directional) {
                ($C_to_T_infile) = biTransformFastAFiles($filename);
                $fhs[0]->{inputfile} = $fhs[1]->{inputfile} = $C_to_T_infile;
            }
            else {
                ($C_to_T_infile, $G_to_A_infile) = biTransformFastAFiles($filename);
                $fhs[0]->{inputfile} = $fhs[1]->{inputfile} = $C_to_T_infile;
                $fhs[2]->{inputfile} = $fhs[3]->{inputfile} = $G_to_A_infile;
            }

            ### Creating 4 different bowtie filehandles and storing the first entry
            if ($bowtie2) {
                single_end_align_fragments_to_bisulfite_genome_fastA_bowtie2($C_to_T_infile, $G_to_A_infile);
            }
            else {
                single_end_align_fragments_to_bisulfite_genome_fastA($C_to_T_infile, $G_to_A_infile);
            }
        }

        ## FastQ format
        else {
            warn "Input file is in FastQ format\n";

            if ($directional) {
                ($C_to_T_infile) = biTransformFastQFiles($filename);
                $fhs[0]->{inputfile} = $fhs[1]->{inputfile} = $C_to_T_infile;
            }
            elsif ($pbat) {
                ($G_to_A_infile) = biTransformFastQFiles($filename);
                $fhs[0]->{inputfile} = $fhs[1]->{inputfile} = $G_to_A_infile;    # PBAT-Seq only uses the G to A converted files
            }
            else {
                ($C_to_T_infile, $G_to_A_infile) = biTransformFastQFiles($filename);
                $fhs[0]->{inputfile} = $fhs[1]->{inputfile} = $C_to_T_infile;
                $fhs[2]->{inputfile} = $fhs[3]->{inputfile} = $G_to_A_infile;
            }

            ### Creating up to 4 different bowtie filehandles and storing the first entry
            if ($bowtie2) {
                single_end_align_fragments_to_bisulfite_genome_fastQ_bowtie2($C_to_T_infile, $G_to_A_infile);
            }
            elsif ($pbat) {
                # PBAT-Seq: no C to T converted file is passed
                single_end_align_fragments_to_bisulfite_genome_fastQ(undef, $G_to_A_infile);
            }
            else {
                single_end_align_fragments_to_bisulfite_genome_fastQ($C_to_T_infile, $G_to_A_infile);
            }
        }

        start_methylation_call_procedure_single_ends($filename, $C_to_T_infile, $G_to_A_infile);
    }
}
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
282
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
283 sub start_methylation_call_procedure_single_ends {
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
284 my ($sequence_file,$C_to_T_infile,$G_to_A_infile) = @_;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
285 my ($dir,$filename);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
286
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
287 if ($sequence_file =~ /\//){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
288 ($dir,$filename) = $sequence_file =~ m/(.*\/)(.*)$/;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
289 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
290 else{
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
291 $filename = $sequence_file;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
292 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
293
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
294 ### printing all alignments to a results file
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
295 my $outfile = $filename;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
296
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
297 if ($bowtie2){ # SAM format is the default for Bowtie 2
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
298 $outfile =~ s/$/_bt2_bismark.sam/;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
299 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
300 elsif ($vanilla){ # vanilla custom Bismark output single-end output (like Bismark versions 0.5.X)
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
301 $outfile =~ s/$/_bismark.txt/;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
302 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
303 else{ # SAM is the default output
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
304 $outfile =~ s/$/_bismark.sam/;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
305 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
306
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
307 $bam = 0 unless (defined $bam);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
308
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
309 if ($bam == 1){ ### Samtools is installed, writing out BAM directly
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
310 $outfile =~ s/sam/bam/;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
311 open (OUT,"| $samtools_path view -bSh 2>/dev/null - > $output_dir$outfile") or die "Failed to write to $outfile: $!\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
312 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
313 elsif($bam == 2){ ### no Samtools found on system. Using GZIP compression instead
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
314 $outfile .= '.gz';
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
315 open (OUT,"| gzip -c - > $output_dir$outfile") or die "Failed to write to $outfile: $!\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
316 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
317 else{ # uncompressed ouput, default
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
318 open (OUT,'>',"$output_dir$outfile") or die "Failed to write to $outfile: $!\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
319 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
320
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
321 warn "\n>>> Writing bisulfite mapping results to $output_dir$outfile <<<\n\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
322 sleep(1);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
323
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
324 if ($vanilla){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
325 print OUT "Bismark version: $bismark_version\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
326 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
327
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
328 ### printing alignment and methylation call summary to a report file
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
329 my $reportfile = $filename;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
330 if ($bowtie2){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
331 $reportfile =~ s/$/_bt2_bismark_SE_report.txt/;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
332 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
333 else{
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
334 $reportfile =~ s/$/_bismark_SE_report.txt/;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
335 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
336
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
337 open (REPORT,'>',"$output_dir$reportfile") or die "Failed to write to $reportfile: $!\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
338 print REPORT "Bismark report for: $sequence_file (version: $bismark_version)\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
339
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
340 if ($unmapped){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
341 my $unmapped_file = $filename;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
342 $unmapped_file =~ s/$/_unmapped_reads.txt/;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
343 open (UNMAPPED,'>',"$output_dir$unmapped_file") or die "Failed to write to $unmapped_file: $!\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
344 print "Unmapped sequences will be written to $output_dir$unmapped_file\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
345 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
346 if ($ambiguous){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
347 my $ambiguous_file = $filename;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
348 $ambiguous_file =~ s/$/_ambiguous_reads.txt/;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
349 open (AMBIG,'>',"$output_dir$ambiguous_file") or die "Failed to write to $ambiguous_file: $!\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
350 print "Ambiguously mapping sequences will be written to $output_dir$ambiguous_file\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
351 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
352
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
353 if ($directional){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
354 print REPORT "Option '--directional' specified: alignments to complementary strands will be ignored (i.e. not performed!)\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
355 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
356 print REPORT "Bowtie was run against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
357
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
358
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
359 ### if 2 or more files are provided we can hold the genome in memory and don't need to read it in a second time
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
360 unless (%chromosomes){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
361 my $cwd = getcwd; # storing the path of the current working directory
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
362 print "Current working directory is: $cwd\n\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
363 read_genome_into_memory($cwd);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
364 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
365
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
366 unless ($vanilla or $sam_no_hd){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
367 generate_SAM_header();
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
368 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
369
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
370 ### Input file is in FastA format
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
371 if ($sequence_file_format eq 'FASTA'){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
372 process_single_end_fastA_file_for_methylation_call($sequence_file,$C_to_T_infile,$G_to_A_infile);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
373 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
374 ### Input file is in FastQ format
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
375 else{
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
376 process_single_end_fastQ_file_for_methylation_call($sequence_file,$C_to_T_infile,$G_to_A_infile);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
377 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
378 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
379
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
### Prepares all output for a paired-end run and then starts the methylation call procedure.
###
### Arguments (in this order):
###   $sequence_file_1, $sequence_file_2   - the two original read files (may include a leading path)
###   $C_to_T_infile_1, $G_to_A_infile_1   - temporary converted versions of read file 1
###   $C_to_T_infile_2, $G_to_A_infile_2   - temporary converted versions of read file 2
###
### Opens the file-level handles OUT and REPORT (and, if requested via $unmapped / $ambiguous,
### UNMAPPED_1/UNMAPPED_2 and AMBIG_1/AMBIG_2) inside $output_dir, writes the report preamble,
### reads the genome into memory if %chromosomes is still empty, writes the SAM header unless
### $vanilla or $sam_no_hd is set, and finally dispatches to the FastA or FastQ paired-end
### processing routine depending on $sequence_file_format. Dies if any output file cannot be opened.
sub start_methylation_call_procedure_paired_ends {
  my ($sequence_file_1,$sequence_file_2,$C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2) = @_;

  ### output files are named after the input files with any leading directory part removed
  my ($filename_1) = $sequence_file_1 =~ m{([^/]*)\z};
  my ($filename_2) = $sequence_file_2 =~ m{([^/]*)\z};

  ### printing all alignments to a results file
  my $outfile = $filename_1;
  if ($bowtie2){ # SAM format is the default Bowtie 2 output
    $outfile .= '_bismark_bt2_pe.sam';
  }
  elsif ($vanilla){ # vanilla custom Bismark paired-end output (like Bismark versions 0.5.X)
    $outfile .= '_bismark_pe.txt';
  }
  else{ # SAM format is the default Bowtie 1 output
    $outfile .= '_bismark_pe.sam';
  }

  $bam = 0 unless (defined $bam);

  # NOTE(review): the two pipe opens below hand an interpolated command string to the shell, so an
  # output directory or file name containing spaces or shell metacharacters will break the command.
  if ($bam == 1){ ### Samtools is installed, writing out BAM directly
    # only the file extension is changed; an unanchored s/sam/bam/ would instead rewrite the first
    # 'sam' found anywhere in the name (e.g. 'sample_1.fq...' would turn into 'bample_1.fq...')
    $outfile =~ s/\.sam$/.bam/;
    open (OUT,"| $samtools_path view -bSh 2>/dev/null - > $output_dir$outfile") or die "Failed to write to $outfile: $!\n";
  }
  elsif($bam == 2){ ### no Samtools found on system. Using GZIP compression instead
    $outfile .= '.gz';
    open (OUT,"| gzip -c - > $output_dir$outfile") or die "Failed to write to $outfile: $!\n";
  }
  else{ # uncompressed output, default
    open (OUT,'>',"$output_dir$outfile") or die "Failed to write to $outfile: $!\n";
  }

  warn "\n>>> Writing bisulfite mapping results to $outfile <<<\n\n";
  sleep(1);

  if ($vanilla){
    print OUT "Bismark version: $bismark_version\n";
  }

  ### printing alignment and methylation call summary to a report file
  my $reportfile = $filename_1;
  if ($bowtie2){
    $reportfile .= '_bismark_bt2_PE_report.txt';
  }
  else{
    $reportfile .= '_bismark_PE_report.txt';
  }

  open (REPORT,'>',"$output_dir$reportfile") or die "Failed to write to $reportfile: $!\n";
  print REPORT "Bismark report for: $sequence_file_1 and $sequence_file_2 (version: $bismark_version)\n";
  print REPORT "Bowtie was run against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n";

  ### Unmapped read output
  if ($unmapped){
    my $unmapped_1 = $filename_1.'_unmapped_reads_1.txt';
    my $unmapped_2 = $filename_2.'_unmapped_reads_2.txt';
    open (UNMAPPED_1,'>',"$output_dir$unmapped_1") or die "Failed to write to $unmapped_1: $!\n";
    open (UNMAPPED_2,'>',"$output_dir$unmapped_2") or die "Failed to write to $unmapped_2: $!\n";
    print "Unmapped sequences will be written to $unmapped_1 and $unmapped_2\n";
  }

  ### Ambiguous read output
  if ($ambiguous){
    my $amb_1 = $filename_1.'_ambiguous_reads_1.txt';
    my $amb_2 = $filename_2.'_ambiguous_reads_2.txt';
    open (AMBIG_1,'>',"$output_dir$amb_1") or die "Failed to write to $amb_1: $!\n";
    open (AMBIG_2,'>',"$output_dir$amb_2") or die "Failed to write to $amb_2: $!\n";
    print "Ambiguously mapping sequences will be written to $amb_1 and $amb_2\n";
  }

  if ($directional){
    print REPORT "Option '--directional' specified: alignments to complementary strands will be ignored (i.e. not performed)\n";
  }

  ### if 2 or more files are provided we might still hold the genome in memory and don't need to read it in a second time
  unless (%chromosomes){
    my $cwd = getcwd; # storing the path of the current working directory
    print "Current working directory is: $cwd\n\n";
    read_genome_into_memory($cwd);
  }

  unless ($vanilla or $sam_no_hd){
    generate_SAM_header();
  }

  ### Input files are in FastA format
  if ($sequence_file_format eq 'FASTA'){
    process_fastA_files_for_paired_end_methylation_calls($sequence_file_1,$sequence_file_2,$C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2);
  }
  ### Input files are in FastQ format
  else{
    process_fastQ_files_for_paired_end_methylation_calls($sequence_file_1,$sequence_file_2,$C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2);
  }
}
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
493
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
### Finishes a single-end run: removes the temporary converted read files and prints the final
### alignment and cytosine methylation summary.
###
### Arguments:
###   $C_to_T_infile - name of the temporary C->T converted read file (relative to $temp_dir)
###   $G_to_A_infile - name of the temporary G->A converted read file (relative to $temp_dir)
###
### Which temporary file(s) get deleted depends on $directional / $pbat. The summary is built from
### the file-level %counting hash and is written to the already opened REPORT handle as well as to
### STDERR (via warn) or STDOUT (via print), exactly mirroring what goes into the report.
sub print_final_analysis_report_single_end{
  my ($C_to_T_infile,$G_to_A_infile) = @_;

  ### small helpers for lines that go, unchanged, to the report file and to the console
  my $warn_and_report = sub {
    warn @_;
    print REPORT @_;
  };
  my $print_and_report = sub {
    print @_;
    print REPORT @_;
  };

  ### All sequences from the original sequence file have been analysed now
  ### deleting temporary C->T or G->A infiles
  if ($directional){
    my $deletion_successful = unlink "$temp_dir$C_to_T_infile";
    if ($deletion_successful == 1){
      warn "\nSuccessfully deleted the temporary file $temp_dir$C_to_T_infile\n\n";
    }
    else{
      warn "Could not delete temporary file $C_to_T_infile properly $!\n";
    }
  }
  elsif ($pbat){
    my $deletion_successful = unlink "$temp_dir$G_to_A_infile";
    if ($deletion_successful == 1){
      warn "\nSuccessfully deleted the temporary file $temp_dir$G_to_A_infile\n\n";
    }
    else{
      warn "Could not delete temporary file $G_to_A_infile properly $!\n";
    }
  }
  else{
    my $deletion_successful = unlink "$temp_dir$C_to_T_infile","$temp_dir$G_to_A_infile";
    if ($deletion_successful == 2){
      warn "\nSuccessfully deleted the temporary files $temp_dir$C_to_T_infile and $temp_dir$G_to_A_infile\n\n";
    }
    else{
      warn "Could not delete temporary files properly $!\n";
    }
  }

  ### printing a final report for the alignment procedure
  $warn_and_report->("Final Alignment report\n",'='x22,"\n");

  ### printing a final report for the methylation call procedure
  $warn_and_report->("Sequences analysed in total:\t$counting{sequences_count}\n");

  ### mapping efficiency; reported as 0 if no sequences were analysed at all (avoids a division by zero)
  my $percent_alignable_sequences = 0;
  if ($counting{sequences_count} != 0){
    $percent_alignable_sequences = sprintf ("%.1f",$counting{unique_best_alignment_count}*100/$counting{sequences_count});
  }

  # the console version ends with an additional blank line, the report version does not
  my $mapping_summary = "Number of alignments with a unique best hit from the different alignments:\t$counting{unique_best_alignment_count}\nMapping efficiency:\t${percent_alignable_sequences}%\n";
  warn $mapping_summary,"\n";
  print REPORT $mapping_summary;

  $print_and_report->("Sequences with no alignments under any condition:\t$counting{no_single_alignment_found}\n");
  $print_and_report->("Sequences did not map uniquely:\t$counting{unsuitable_sequence_count}\n");
  $print_and_report->("Sequences which were discarded because genomic sequence could not be extracted:\t$counting{genomic_sequence_could_not_be_extracted_count}\n\n");
  $print_and_report->("Number of sequences with unique best (first) alignment came from the bowtie output:\n");
  $print_and_report->(join ("\n","CT/CT:\t$counting{CT_CT_count}\t((converted) top strand)","CT/GA:\t$counting{CT_GA_count}\t((converted) bottom strand)","GA/CT:\t$counting{GA_CT_count}\t(complementary to (converted) top strand)","GA/GA:\t$counting{GA_GA_count}\t(complementary to (converted) bottom strand)"),"\n\n");

  if ($directional){
    $print_and_report->("Number of alignments to (merely theoretical) complementary strands being rejected in total:\t$counting{alignments_rejected_count}\n\n");
  }

  ### detailed information about Cs analysed
  my $total_number_of_C = $counting{total_meCHH_count}+$counting{total_meCHG_count}+$counting{total_meCpG_count}+$counting{total_unmethylated_CHH_count}+$counting{total_unmethylated_CHG_count}+$counting{total_unmethylated_CpG_count};

  $warn_and_report->("Final Cytosine Methylation Report\n",'='x33,"\n");
  $warn_and_report->("Total number of C's analysed:\t$total_number_of_C\n\n");

  # NOTE(review): the report line for the CpG context carries a space after the tab, the console
  # line does not; both are kept exactly as they were in case anything parses the report file
  warn "Total methylated C's in CpG context:\t$counting{total_meCpG_count}\n";
  print REPORT "Total methylated C's in CpG context:\t $counting{total_meCpG_count}\n";

  $warn_and_report->("Total methylated C's in CHG context:\t$counting{total_meCHG_count}\n");
  $warn_and_report->("Total methylated C's in CHH context:\t$counting{total_meCHH_count}\n\n");
  $warn_and_report->("Total C to T conversions in CpG context:\t$counting{total_unmethylated_CpG_count}\n");
  $warn_and_report->("Total C to T conversions in CHG context:\t$counting{total_unmethylated_CHG_count}\n");
  $warn_and_report->("Total C to T conversions in CHH context:\t$counting{total_unmethylated_CHH_count}\n\n");

  ### percentage of methylated Cs per context; stays undefined if no C was seen in that context
  my $percent_methylated = sub {
    my ($methylated,$unmethylated) = @_;
    return undef unless (($methylated+$unmethylated) > 0);
    return sprintf("%.1f",100*$methylated/($methylated+$unmethylated));
  };

  my $percent_meCpG = $percent_methylated->($counting{total_meCpG_count},$counting{total_unmethylated_CpG_count});
  my $percent_meCHG = $percent_methylated->($counting{total_meCHG_count},$counting{total_unmethylated_CHG_count});
  my $percent_meCHH = $percent_methylated->($counting{total_meCHH_count},$counting{total_unmethylated_CHH_count});

  ### printing methylated CpG percentage if applicable
  if (defined $percent_meCpG){
    $warn_and_report->("C methylated in CpG context:\t${percent_meCpG}%\n");
  }
  else{
    $warn_and_report->("Can't determine percentage of methylated Cs in CpG context if value was 0\n");
  }

  ### printing methylated C percentage (CHG context) if applicable
  if (defined $percent_meCHG){
    $warn_and_report->("C methylated in CHG context:\t${percent_meCHG}%\n");
  }
  else{
    $warn_and_report->("Can't determine percentage of methylated Cs in CHG context if value was 0\n");
  }

  ### printing methylated C percentage (CHH context) if applicable
  if (defined $percent_meCHH){
    $warn_and_report->("C methylated in CHH context:\t${percent_meCHH}%\n\n\n");
  }
  else{
    $warn_and_report->("Can't determine percentage of methylated Cs in CHH context if value was 0\n\n\n");
  }

  if ($seqID_contains_tabs){
    $warn_and_report->("The sequence IDs in the provided file contain tab-stops which might prevent sequence alignments. If this happened, please replace all tab characters within the seqID field with spaces before running Bismark.\n\n");
  }
}
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
645
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
sub print_final_analysis_report_paired_ends{
  my ($C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2) = @_;
  ### Removes the temporary C->T / G->A converted infiles of a paired-end run and then prints the final
  ### alignment and cytosine methylation summary.
  ###
  ### Arguments: the names of the temporary infiles (relative to $temp_dir); the *_2 names may be empty.
  ### Uses $directional, $temp_dir, %counting and the REPORT filehandle, all of which are declared outside this sub.
  ### Every summary line goes to STDERR or STDOUT (exactly as before) and is mirrored into REPORT.

  ### output helpers: same text to the screen and to the report file
  my $to_stderr_and_report = sub { warn @_;  print REPORT @_; };
  my $to_stdout_and_report = sub { print @_; print REPORT @_; };

  ### percentage of methylated calls, or undef if no call was made in that context at all
  my $percent_methylated = sub {
    my ($methylated,$unmethylated) = @_;
    my $calls = $methylated + $unmethylated;
    return unless ($calls > 0);
    return sprintf("%.1f",100*$methylated/$calls);
  };

  ### All sequences from the original sequence file have been analysed now, therefore deleting temporary C->T or G->A infiles
  my @temp_files;
  if ($directional){
    @temp_files = $G_to_A_infile_2
      ? ("$temp_dir$C_to_T_infile_1","$temp_dir$G_to_A_infile_2")
      : ("$temp_dir$C_to_T_infile_1"); # for paired-end FastQ infiles with Bowtie1 there is only one file to delete
  }
  else{
    @temp_files = ($G_to_A_infile_2 and $C_to_T_infile_2)
      ? ("$temp_dir$C_to_T_infile_1","$temp_dir$G_to_A_infile_1","$temp_dir$C_to_T_infile_2","$temp_dir$G_to_A_infile_2")
      : ("$temp_dir$C_to_T_infile_1","$temp_dir$G_to_A_infile_1"); # for paired-end FastQ infiles with Bowtie1 there are only two files to delete
  }

  ### "A", "A and B" or "A, B, C and D"
  my $noun      = (@temp_files == 1) ? 'file' : 'files';
  my $file_list = $temp_files[0];
  if (@temp_files > 1){
    $file_list = join(', ',@temp_files[0..$#temp_files-1])." and $temp_files[-1]";
  }
  ### only the directional failure message names the files
  my $failure_detail = $directional ? " $file_list" : '';

  ### unlink returns the number of files it managed to delete
  my $deleted = unlink @temp_files;
  if ($deleted == scalar @temp_files){
    warn "\nSuccessfully deleted the temporary $noun $file_list\n\n";
  }
  else{
    warn "Could not delete temporary $noun$failure_detail properly: $!\n";
  }

  ### printing a final report for the alignment procedure
  $to_stderr_and_report->("Final Alignment report\n",'='x22,"\n");
  #  foreach my $index (0..$#fhs){
  #    print "$fhs[$index]->{name}\n";
  #    print "$fhs[$index]->{seen}\talignments on the correct strand in total\n";
  #    print "$fhs[$index]->{wrong_strand}\talignments were discarded (nonsensical alignments)\n\n";
  #  }

  ### printing a final report for the methylation call procedure
  $to_stderr_and_report->("Sequence pairs analysed in total:\t$counting{sequences_count}\n");

  ### mapping efficiency; guarded against division by zero when nothing was analysed
  my $percent_alignable_sequence_pairs = ($counting{sequences_count} == 0)
    ? 0
    : sprintf ("%.1f",$counting{unique_best_alignment_count}*100/$counting{sequences_count});

  ### the STDOUT and REPORT versions of this line differ in their trailing whitespace, so they are written separately
  print "Number of paired-end alignments with a unique best hit:\t$counting{unique_best_alignment_count}\nMapping efficiency:\t${percent_alignable_sequence_pairs}%\n\n";
  print REPORT "Number of paired-end alignments with a unique best hit:\t$counting{unique_best_alignment_count}\nMapping efficiency:\t${percent_alignable_sequence_pairs}% \n";

  my $strand_summary = join ("\n",
                             "CT/GA/CT:\t$counting{CT_GA_CT_count}\t((converted) top strand)",
                             "GA/CT/CT:\t$counting{GA_CT_CT_count}\t(complementary to (converted) top strand)",
                             "GA/CT/GA:\t$counting{GA_CT_GA_count}\t(complementary to (converted) bottom strand)",
                             "CT/GA/GA:\t$counting{CT_GA_GA_count}\t((converted) bottom strand)");

  $to_stdout_and_report->("Sequence pairs with no alignments under any condition:\t$counting{no_single_alignment_found}\n");
  $to_stdout_and_report->("Sequence pairs did not map uniquely:\t$counting{unsuitable_sequence_count}\n");
  $to_stdout_and_report->("Sequence pairs which were discarded because genomic sequence could not be extracted:\t$counting{genomic_sequence_could_not_be_extracted_count}\n\n");
  $to_stdout_and_report->("Number of sequence pairs with unique best (first) alignment came from the bowtie output:\n");
  $to_stdout_and_report->($strand_summary,"\n\n");

  ### detailed information about Cs analysed
  if ($directional){
    $to_stdout_and_report->("Number of alignments to (merely theoretical) complementary strands being rejected in total:\t$counting{alignments_rejected_count}\n\n");
  }

  $to_stderr_and_report->("Final Cytosine Methylation Report\n",'='x33,"\n");

  my $total_number_of_C = $counting{total_meCHG_count}+ $counting{total_meCHH_count}+$counting{total_meCpG_count}+$counting{total_unmethylated_CHG_count}+$counting{total_unmethylated_CHH_count}+$counting{total_unmethylated_CpG_count};

  $to_stderr_and_report->("Total number of C's analysed:\t$total_number_of_C\n\n");
  $to_stderr_and_report->("Total methylated C's in CpG context:\t$counting{total_meCpG_count}\n");
  $to_stderr_and_report->("Total methylated C's in CHG context:\t$counting{total_meCHG_count}\n");
  $to_stderr_and_report->("Total methylated C's in CHH context:\t$counting{total_meCHH_count}\n\n");
  $to_stderr_and_report->("Total C to T conversions in CpG context:\t$counting{total_unmethylated_CpG_count}\n");
  $to_stderr_and_report->("Total C to T conversions in CHG context:\t$counting{total_unmethylated_CHG_count}\n");
  $to_stderr_and_report->("Total C to T conversions in CHH context:\t$counting{total_unmethylated_CHH_count}\n\n");

  my $percent_meCHG = $percent_methylated->($counting{total_meCHG_count},$counting{total_unmethylated_CHG_count});
  my $percent_meCHH = $percent_methylated->($counting{total_meCHH_count},$counting{total_unmethylated_CHH_count});
  my $percent_meCpG = $percent_methylated->($counting{total_meCpG_count},$counting{total_unmethylated_CpG_count});

  ### printing the methylated C percentage per context (CpG, CHG, CHH) if applicable; the CHH line closes the report with extra blank lines
  foreach my $row (['CpG',$percent_meCpG,"\n"],['CHG',$percent_meCHG,"\n"],['CHH',$percent_meCHH,"\n\n\n"]){
    my ($context,$percent,$line_end) = @{$row};
    if ($percent){
      $to_stderr_and_report->("C methylated in $context context:\t${percent}%${line_end}");
    }
    else{
      $to_stderr_and_report->("Can't determine percentage of methylated Cs in $context context if value was 0${line_end}");
    }
  }

}
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
798
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
sub process_single_end_fastA_file_for_methylation_call{
  my ($sequence_file,$C_to_T_infile,$G_to_A_infile) = @_;
  ### this is a FastA sequence file; we need the actual sequence to compare it against the genomic sequence in order to make a methylation call.
  ### Now reading in the sequence file sequence by sequence and see if the current sequence was mapped to one (or both) of the converted genomes in either
  ### the C->T or G->A version
  ###
  ### Arguments:
  ###   $sequence_file  - FastA file to read (records are read as 2 lines each: header, sequence); may be gzipped (*.gz)
  ###   $C_to_T_infile, $G_to_A_infile - only handed on to print_final_analysis_report_single_end once all reads are processed
  ### Honours the options $skip, $upto, $bowtie2, $ambiguous and $unmapped and updates $counting{sequences_count}.
  ### Dies if the sequence file (or the zcat pipe to it) cannot be opened.

  ### gzipped version of the infile
  if ($sequence_file =~ /\.gz$/){
    ### list-form pipe open: the file name is passed to zcat as a single argument and never seen by a shell,
    ### so spaces or shell metacharacters in the path can neither break nor hijack the command
    open (IN,'-|','zcat',$sequence_file) or die "Failed to open zcat pipe to $sequence_file: $!\n";
  }
  else{
    ### 3-argument open: the file name can not be mistaken for an open mode (leading '>', trailing '|', ...)
    open (IN,'<',$sequence_file) or die "Failed to read from sequence file $sequence_file: $!\n";
  }

  my $count = 0;

  warn "\nReading in the sequence file $sequence_file\n";
  while (1) {
    # last if ($counting{sequences_count} > 100);
    my $identifier = <IN>;
    my $sequence = <IN>;
    last unless ($identifier and $sequence);

    $identifier = fix_IDs($identifier); # this is to avoid problems with truncated read ID when they contain white spaces

    ++$count;

    ### --skip: ignore the first $skip records; --upto: stop after record number $upto
    if ($skip){
      next unless ($count > $skip);
    }
    if ($upto){
      last if ($count > $upto);
    }

    $counting{sequences_count}++;
    if ($counting{sequences_count}%100000==0) {
      warn "Processed $counting{sequences_count} sequences so far\n";
    }
    chomp $sequence;
    chomp $identifier;

    $identifier =~ s/^>//; # deletes the > at the beginning of FastA headers

    ### the alignment check receives the upper-cased read; the read itself is kept as it was for the output files below
    my $return;
    if ($bowtie2){
      $return = check_bowtie_results_single_end_bowtie2 (uc($sequence),$identifier);
    }
    else{
      $return = check_bowtie_results_single_end(uc($sequence),$identifier); # default Bowtie 1
    }

    ### normalise an undefined/empty return value so that the numeric comparisons below do not warn
    $return ||= 0;

    # print the sequence to ambiguous.out if --ambiguous was specified
    if ($ambiguous and $return == 2){
      print AMBIG ">$identifier\n";
      print AMBIG "$sequence\n";
    }

    # print the sequence to <unmapped.out> file if --un was specified
    elsif ($unmapped and $return == 1){
      print UNMAPPED ">$identifier\n";
      print UNMAPPED "$sequence\n";
    }
  }

  ### Release the input handle (and reap zcat). The return value is deliberately not checked: leaving the loop
  ### early (--upto) makes zcat terminate on a broken pipe, which close() would report as a failure.
  close IN;

  print "Processed $counting{sequences_count} sequences in total\n\n";

  print_final_analysis_report_single_end($C_to_T_infile,$G_to_A_infile);

}
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
871
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
sub process_single_end_fastQ_file_for_methylation_call{
  my ($sequence_file,$C_to_T_infile,$G_to_A_infile) = @_;
  ### this is the Illumina sequence file; we need the actual sequence to compare it against the genomic sequence in order to make a methylation call.
  ### Now reading in the sequence file sequence by sequence and see if the current sequence was mapped to one (or both) of the converted genomes in either
  ### the C->T or G->A version
  ###
  ### Arguments:
  ###   $sequence_file  - FastQ file to read (records are read as 4 lines each); may be gzipped (*.gz)
  ###   $C_to_T_infile, $G_to_A_infile - only handed on to print_final_analysis_report_single_end once all reads are processed
  ### Honours the options $skip, $upto, $bowtie2, $ambiguous and $unmapped and updates $counting{sequences_count}.
  ### Dies if the sequence file (or the zcat pipe to it) cannot be opened.

  ### gzipped version of the infile
  if ($sequence_file =~ /\.gz$/){
    ### list-form pipe open: the file name is passed to zcat as a single argument and never seen by a shell,
    ### so spaces or shell metacharacters in the path can neither break nor hijack the command
    open (IN,'-|','zcat',$sequence_file) or die "Failed to open zcat pipe to $sequence_file: $!\n";
  }
  else{
    ### 3-argument open: the file name can not be mistaken for an open mode (leading '>', trailing '|', ...)
    open (IN,'<',$sequence_file) or die "Failed to read from sequence file $sequence_file: $!\n";
  }

  my $count = 0;

  warn "\nReading in the sequence file $sequence_file\n";
  while (1) {
    ### one FastQ record = header, sequence, second header ('+' line) and quality string
    my $identifier = <IN>;
    my $sequence = <IN>;
    my $identifier_2 = <IN>;
    my $quality_value = <IN>;
    last unless ($identifier and $sequence and $identifier_2 and $quality_value);

    $identifier = fix_IDs($identifier); # this is to avoid problems with truncated read ID when they contain white spaces

    ++$count;

    ### --skip: ignore the first $skip records; --upto: stop after record number $upto
    if ($skip){
      next unless ($count > $skip);
    }
    if ($upto){
      last if ($count > $upto);
    }

    $counting{sequences_count}++;

    if ($counting{sequences_count}%1000000==0) {
      warn "Processed $counting{sequences_count} sequences so far\n";
    }
    chomp $sequence;
    chomp $identifier;
    chomp $quality_value;

    $identifier =~ s/^\@//;  # deletes the @ at the beginning of Illumina FastQ headers

    ### the alignment check receives the upper-cased read; the read itself is kept as it was for the output files below
    my $return;
    if ($bowtie2){
      $return = check_bowtie_results_single_end_bowtie2 (uc($sequence),$identifier,$quality_value);
    }
    else{
      $return = check_bowtie_results_single_end(uc($sequence),$identifier,$quality_value); # default Bowtie 1
    }

    ### normalise an undefined/empty return value so that the numeric comparisons below do not warn
    $return ||= 0;

    # print the sequence to ambiguous.out if --ambiguous was specified
    if ($ambiguous and $return == 2){
      print AMBIG "\@$identifier\n";
      print AMBIG "$sequence\n";
      print AMBIG $identifier_2;          # was never chomped, so it still carries its own line end
      print AMBIG "$quality_value\n";
    }

    # print the sequence to <unmapped.out> file if --un was specified
    elsif ($unmapped and $return == 1){
      print UNMAPPED "\@$identifier\n";
      print UNMAPPED "$sequence\n";
      print UNMAPPED $identifier_2;       # was never chomped, so it still carries its own line end
      print UNMAPPED "$quality_value\n";
    }
  }

  ### Release the input handle (and reap zcat). The return value is deliberately not checked: leaving the loop
  ### early (--upto) makes zcat terminate on a broken pipe, which close() would report as a failure.
  close IN;

  print "Processed $counting{sequences_count} sequences in total\n\n";

  print_final_analysis_report_single_end($C_to_T_infile,$G_to_A_infile);

}
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
951
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
sub process_fastA_files_for_paired_end_methylation_calls{
  my ($sequence_file_1,$sequence_file_2,$C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2) = @_;
  ### Processing the two FastA sequence files; we need the actual sequences of both reads to compare them against the genomic sequence in order to
  ### make a methylation call. The sequence identifier per definition needs to be the same for a sequence pair used for paired-end mapping.
  ### Now reading in the sequence files sequence by sequence and see if the current sequences produced an alignment to one (or both) of the
  ### converted genomes (either the C->T or G->A version)
  ###
  ### Arguments: the two FastA read files (plain or gzipped), followed by the four Bowtie infile names which are only handed on to
  ###            print_final_analysis_report_paired_ends() once all sequence pairs have been processed.
  ### Globals:   reads $skip, $upto, $bowtie2, $ambiguous and $unmapped, increments $counting{sequences_count}, writes to the AMBIG_1/2 and
  ###            UNMAPPED_1/2 filehandles and closes OUT.

  ### Each input file is opened on its own merits: a file ending in .gz is streamed through zcat, anything else is read directly. (Requiring
  ### both names to end in .gz meant that a mixed pair had its gzipped member read as raw compressed bytes.) The list form of the pipe open
  ### and the 3-argument open keep file names containing spaces or shell metacharacters from being interpreted by a shell or as an open mode.
  my @input_fhs;
  foreach my $sequence_file ($sequence_file_1,$sequence_file_2){
    my $fh;
    if ($sequence_file =~ /\.gz$/){
      open ($fh,'-|','zcat',$sequence_file) or die "Failed to open zcat pipe to $sequence_file $!\n";
    }
    else{
      open ($fh,'<',$sequence_file) or die "Failed to open $sequence_file: $!\n";
    }
    push @input_fhs,$fh;
  }
  my ($in1,$in2) = @input_fhs;

  warn "\nReading in the sequence files $sequence_file_1 and $sequence_file_2\n";
  ### Both files are required to have the exact same number of sequences, therefore we can process the sequences jointly one by one

  my $count = 0;

  while (1) {
    # reading from the first input file
    my $identifier_1 = <$in1>;
    my $sequence_1 = <$in1>;
    # reading from the second input file
    my $identifier_2 = <$in2>;
    my $sequence_2 = <$in2>;
    # stop as soon as either file runs out of complete records; testing for definedness (not truth) so that a final line consisting
    # of a lone '0' without a trailing newline is not mistaken for the end of the file
    last unless (defined $identifier_1 and defined $sequence_1 and defined $identifier_2 and defined $sequence_2);

    $identifier_1 = fix_IDs($identifier_1); # this is to avoid problems with truncated read ID when they contain white spaces
    $identifier_2 = fix_IDs($identifier_2);

    ++$count;

    # --skip: ignore the first $skip sequence pairs; --upto: stop once more than $upto pairs have been read
    if ($skip){
      next unless ($count > $skip);
    }
    if ($upto){
      last if ($count > $upto);
    }

    $counting{sequences_count}++;
    if ($counting{sequences_count}%100000==0) {
      warn "Processed $counting{sequences_count} sequences so far\n";
    }

    # the unmodified header lines (still carrying '>' and the line ending) are what gets written to the ambiguous/unmapped files
    my $orig_identifier_1 = $identifier_1;
    my $orig_identifier_2 = $identifier_2;

    chomp ($sequence_1,$identifier_1,$sequence_2,$identifier_2);

    $identifier_1 =~ s/^>//; # deletes the > at the beginning of FastA headers

    # no quality values are passed on for FastA input
    my $return;
    if ($bowtie2){
      $return = check_bowtie_results_paired_ends_bowtie2 (uc$sequence_1,uc$sequence_2,$identifier_1);
    }
    else{
      $return = check_bowtie_results_paired_ends (uc$sequence_1,uc$sequence_2,$identifier_1);
    }

    # normalising undef/empty return values so that the numeric comparisons below do not warn
    unless ($return){
      $return = 0;
    }

    # print the sequences to ambiguous_1 and _2 if --ambiguous was specified (return value 2)
    if ($ambiguous and $return == 2){
      print AMBIG_1 $orig_identifier_1;
      print AMBIG_1 "$sequence_1\n";
      print AMBIG_2 $orig_identifier_2;
      print AMBIG_2 "$sequence_2\n";
    }

    # print the sequences to unmapped_1.out and unmapped_2.out if --un was specified (return value 1)
    elsif ($unmapped and $return == 1){
      print UNMAPPED_1 $orig_identifier_1;
      print UNMAPPED_1 "$sequence_1\n";
      print UNMAPPED_2 $orig_identifier_2;
      print UNMAPPED_2 "$sequence_2\n";
    }
  }

  warn "Processed $counting{sequences_count} sequences in total\n\n";

  # Releasing the input handles. The return value is deliberately not checked: when reading stopped early (--upto) a zcat child
  # is terminated by SIGPIPE and close() reports that as a failure even though nothing went wrong on our side.
  close $in1;
  close $in2;

  close OUT or die $!;

  print_final_analysis_report_paired_ends($C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2);

}
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1045
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
sub process_fastQ_files_for_paired_end_methylation_calls{
  my ($sequence_file_1,$sequence_file_2,$C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2) = @_;
  ### Processing the two Illumina sequence files; we need the actual sequence of both reads to compare them against the genomic sequence in order to
  ### make a methylation call. The sequence identifier per definition needs to be same for a sequence pair used for paired-end alignments.
  ### Now reading in the sequence files sequence by sequence and see if the current sequences produced a paired-end alignment to one (or both)
  ### of the converted genomes (either C->T or G->A version)
  ###
  ### Arguments: the two FastQ read files (plain or gzipped), followed by the four Bowtie infile names which are only handed on to
  ###            print_final_analysis_report_paired_ends() once all sequence pairs have been processed.
  ### Globals:   reads $skip, $upto, $bowtie2, $ambiguous and $unmapped, increments $counting{sequences_count}, writes to the AMBIG_1/2 and
  ###            UNMAPPED_1/2 filehandles and closes OUT.

  ### Each input file is opened on its own merits: a file ending in .gz is streamed through zcat, anything else is read directly. (Requiring
  ### both names to end in .gz meant that a mixed pair had its gzipped member read as raw compressed bytes.) The list form of the pipe open
  ### and the 3-argument open keep file names containing spaces or shell metacharacters from being interpreted by a shell or as an open mode.
  my @input_fhs;
  foreach my $sequence_file ($sequence_file_1,$sequence_file_2){
    my $fh;
    if ($sequence_file =~ /\.gz$/){
      open ($fh,'-|','zcat',$sequence_file) or die "Failed to open zcat pipe to $sequence_file $!\n";
    }
    else{
      open ($fh,'<',$sequence_file) or die "Failed to open $sequence_file: $!\n";
    }
    push @input_fhs,$fh;
  }
  my ($in1,$in2) = @input_fhs;

  my $count = 0;

  warn "\nReading in the sequence files $sequence_file_1 and $sequence_file_2\n";
  ### Both files are required to have the exact same number of sequences, therefore we can process the sequences jointly one by one
  while (1) {
    # reading one 4-line FastQ record from the first input file
    my $identifier_1 = <$in1>;
    my $sequence_1 = <$in1>;
    my $ident_1 = <$in1>;         # second header line; only echoed to the ambiguous/unmapped files
    my $quality_value_1 = <$in1>;
    # reading one 4-line FastQ record from the second input file
    my $identifier_2 = <$in2>;
    my $sequence_2 = <$in2>;
    my $ident_2 = <$in2>;         # second header line; only echoed to the ambiguous/unmapped files
    my $quality_value_2 = <$in2>;
    # stop as soon as either file runs out of complete records; testing for definedness (not truth) so that a final quality line
    # consisting of a lone '0' without a trailing newline is not mistaken for the end of the file
    last unless (defined $identifier_1 and defined $sequence_1 and defined $quality_value_1 and defined $identifier_2 and defined $sequence_2 and defined $quality_value_2);

    $identifier_1 = fix_IDs($identifier_1); # this is to avoid problems with truncated read ID when they contain white spaces
    $identifier_2 = fix_IDs($identifier_2);

    ++$count;

    # --skip: ignore the first $skip sequence pairs; --upto: stop once more than $upto pairs have been read
    if ($skip){
      next unless ($count > $skip);
    }
    if ($upto){
      last if ($count > $upto);
    }

    $counting{sequences_count}++;
    if ($counting{sequences_count}%100000==0) {
      warn "Processed $counting{sequences_count} sequences so far\n";
    }

    # the unmodified header lines (still carrying '@' and the line ending) are what gets written to the ambiguous/unmapped files
    my $orig_identifier_1 = $identifier_1;
    my $orig_identifier_2 = $identifier_2;

    chomp ($sequence_1,$identifier_1,$sequence_2,$identifier_2,$quality_value_1,$quality_value_2);

    $identifier_1 =~ s/^\@//; # deletes the @ at the beginning of the FastQ ID

    my $return;
    if ($bowtie2){
      $return = check_bowtie_results_paired_ends_bowtie2 (uc$sequence_1,uc$sequence_2,$identifier_1,$quality_value_1,$quality_value_2);
    }
    else{
      $return = check_bowtie_results_paired_ends (uc$sequence_1,uc$sequence_2,$identifier_1,$quality_value_1,$quality_value_2);
    }

    # normalising undef/empty return values so that the numeric comparisons below do not warn
    unless ($return){
      $return = 0;
    }

    # print the sequences to ambiguous_1 and _2 if --ambiguous was specified (return value 2)
    if ($ambiguous and $return == 2){
      # seq_1
      print AMBIG_1 $orig_identifier_1;
      print AMBIG_1 "$sequence_1\n";
      print AMBIG_1 $ident_1;
      print AMBIG_1 "$quality_value_1\n";
      # seq_2
      print AMBIG_2 $orig_identifier_2;
      print AMBIG_2 "$sequence_2\n";
      print AMBIG_2 $ident_2;
      print AMBIG_2 "$quality_value_2\n";
    }

    # print the sequences to unmapped_1.out and unmapped_2.out if --un was specified (return value 1)
    elsif ($unmapped and $return == 1){
      # seq_1
      print UNMAPPED_1 $orig_identifier_1;
      print UNMAPPED_1 "$sequence_1\n";
      print UNMAPPED_1 $ident_1;
      print UNMAPPED_1 "$quality_value_1\n";
      # seq_2
      print UNMAPPED_2 $orig_identifier_2;
      print UNMAPPED_2 "$sequence_2\n";
      print UNMAPPED_2 $ident_2;
      print UNMAPPED_2 "$quality_value_2\n";
    }
  }

  warn "Processed $counting{sequences_count} sequences in total\n\n";

  # Releasing the input handles. The return value is deliberately not checked: when reading stopped early (--upto) a zcat child
  # is terminated by SIGPIPE and close() reports that as a failure even though nothing went wrong on our side.
  close $in1;
  close $in2;

  close OUT or die $!;

  print_final_analysis_report_paired_ends($C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2);

}
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1157
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1158 sub check_bowtie_results_single_end{
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1159 my ($sequence,$identifier,$quality_value) = @_;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1160
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1161 unless ($quality_value){ # FastA sequences get assigned a quality value of Phred 40 throughout
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1162 $quality_value = 'I'x(length$sequence);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1163 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1164
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1165 my %mismatches = ();
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1166 ### reading from the bowtie output files to see if this sequence aligned to a bisulfite converted genome
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1167 foreach my $index (0..$#fhs){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1168
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1169 ### skipping this index if the last alignment has been set to undefined already (i.e. end of bowtie output)
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1170 next unless ($fhs[$index]->{last_line} and defined $fhs[$index]->{last_seq_id});
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1171 ### if the sequence we are currently looking at produced an alignment we are doing various things with it
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1172 if ($fhs[$index]->{last_seq_id} eq $identifier) {
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1173 ###############################################################
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1174 ### STEP I Now processing the alignment stored in last_line ###
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1175 ###############################################################
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1176 my $valid_alignment_found_1 = decide_whether_single_end_alignment_is_valid($index,$identifier);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1177 ### sequences can fail at this point if there was only 1 seq in the wrong orientation, or if there were 2 seqs, both in the wrong orientation
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1178 ### we only continue to extract useful information about this alignment if 1 was returned
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1179 if ($valid_alignment_found_1 == 1){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1180 ### Bowtie outputs which made it this far are in the correct orientation, so we can continue to analyse the alignment itself
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1181 ### need to extract the chromosome number from the bowtie output (which is either XY_cf (complete forward) or XY_cr (complete reverse)
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1182 my ($id,$strand,$mapped_chromosome,$position,$bowtie_sequence,$mismatch_info) = (split (/\t/,$fhs[$index]->{last_line},-1))[0,1,2,3,4,7];
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1183
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1184 unless($mismatch_info){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1185 $mismatch_info = '';
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1186 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1187
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1188 chomp $mismatch_info;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1189 my $chromosome;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1190 if ($mapped_chromosome =~ s/_(CT|GA)_converted$//){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1191 $chromosome = $mapped_chromosome;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1192 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1193 else{
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1194 die "Chromosome number extraction failed for $mapped_chromosome\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1195 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1196 ### Now extracting the number of mismatches to the converted genome
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1197 my $number_of_mismatches;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1198 if ($mismatch_info eq ''){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1199 $number_of_mismatches = 0;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1200 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1201 elsif ($mismatch_info =~ /^\d/){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1202 my @mismatches = split (/,/,$mismatch_info);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1203 $number_of_mismatches = scalar @mismatches;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1204 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1205 else{
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1206 die "Something weird is going on with the mismatch field:\t>>> $mismatch_info <<<\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1207 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1208 ### creating a composite location variable from $chromosome and $position and storing the alignment information in a temporary hash table
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1209 my $alignment_location = join (":",$chromosome,$position);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1210 ### If a sequence aligns to exactly the same location twice the sequence does either not contain any C or G, or all the Cs (or Gs on the reverse
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1211 ### strand) were methylated and therefore protected. It is not needed to overwrite the same positional entry with a second entry for the same
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1212 ### location (the genomic sequence extraction and methylation would not be affected by this, only the thing which would change is the index
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1213 ### number for the found alignment)
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1214 unless (exists $mismatches{$number_of_mismatches}->{$alignment_location}){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1215 $mismatches{$number_of_mismatches}->{$alignment_location}->{seq_id}=$id;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1216 $mismatches{$number_of_mismatches}->{$alignment_location}->{bowtie_sequence}=$bowtie_sequence;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1217 $mismatches{$number_of_mismatches}->{$alignment_location}->{index}=$index;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1218 $mismatches{$number_of_mismatches}->{$alignment_location}->{chromosome}=$chromosome;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1219 $mismatches{$number_of_mismatches}->{$alignment_location}->{position}=$position;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1220 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1221 $number_of_mismatches = undef;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1222 ##################################################################################################################################################
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1223 ### STEP II Now reading in the next line from the bowtie filehandle. The next alignment can either be a second alignment of the same sequence or a
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1224 ### new sequence. In either case we will store the next line in @fhs ->{last_line}. In case the alignment is already the next entry, a 0 will
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1225 ### be returned as $valid_alignment_found and it will then be processed in the next round only.
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1226 ##################################################################################################################################################
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1227 my $newline = $fhs[$index]->{fh}-> getline();
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1228 if ($newline){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1229 my ($seq_id) = split (/\t/,$newline);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1230 $fhs[$index]->{last_seq_id} = $seq_id;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1231 $fhs[$index]->{last_line} = $newline;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1232 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1233 else {
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1234 # assigning undef to last_seq_id and last_line and jumping to the next index (end of bowtie output)
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1235 $fhs[$index]->{last_seq_id} = undef;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1236 $fhs[$index]->{last_line} = undef;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1237 next;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1238 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1239 my $valid_alignment_found_2 = decide_whether_single_end_alignment_is_valid($index,$identifier);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1240 ### we only continue to extract useful information about this second alignment if 1 was returned
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1241 if ($valid_alignment_found_2 == 1){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1242 ### If the second Bowtie output made it this far it is in the correct orientation, so we can continue to analyse the alignment itself
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1243 ### need to extract the chromosome number from the bowtie output (which is either XY_cf (complete forward) or XY_cr (complete reverse)
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1244 my ($id,$strand,$mapped_chromosome,$position,$bowtie_sequence,$mismatch_info) = (split (/\t/,$fhs[$index]->{last_line},-1))[0,1,2,3,4,7];
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1245 unless($mismatch_info){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1246 $mismatch_info = '';
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1247 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1248 chomp $mismatch_info;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1249
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1250 my $chromosome;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1251 if ($mapped_chromosome =~ s/_(CT|GA)_converted$//){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1252 $chromosome = $mapped_chromosome;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1253 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1254 else{
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1255 die "Chromosome number extraction failed for $mapped_chromosome\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1256 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1257
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1258 ### Now extracting the number of mismatches to the converted genome
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1259 my $number_of_mismatches;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1260 if ($mismatch_info eq ''){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1261 $number_of_mismatches = 0;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1262 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1263 elsif ($mismatch_info =~ /^\d/){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1264 my @mismatches = split (/,/,$mismatch_info);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1265 $number_of_mismatches = scalar @mismatches;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1266 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1267 else{
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1268 die "Something weird is going on with the mismatch field\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1269 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1270 ### creating a composite location variable from $chromosome and $position and storing the alignment information in a temporary hash table
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1271 ### extracting the chromosome number from the bowtie output (see above)
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1272 my $alignment_location = join (":",$chromosome,$position);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1273 ### In the special case that two differently converted sequences align against differently converted genomes, but to the same position
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1274 ### with the same number of mismatches (or perfect matches), the chromosome, position and number of mismatches are the same. In this
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1275 ### case we are not writing the same entry out a second time.
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1276 unless (exists $mismatches{$number_of_mismatches}->{$alignment_location}){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1277 $mismatches{$number_of_mismatches}->{$alignment_location}->{seq_id}=$id;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1278 $mismatches{$number_of_mismatches}->{$alignment_location}->{bowtie_sequence}=$bowtie_sequence;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1279 $mismatches{$number_of_mismatches}->{$alignment_location}->{index}=$index;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1280 $mismatches{$number_of_mismatches}->{$alignment_location}->{chromosome}=$chromosome;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1281 $mismatches{$number_of_mismatches}->{$alignment_location}->{position}=$position;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1282 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1283 ####################################################################################################################################
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1284 #### STEP III Now reading in one more line which has to be the next alignment to be analysed. Adding it to @fhs ->{last_line} ###
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1285 ####################################################################################################################################
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1286 $newline = $fhs[$index]->{fh}-> getline();
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1287 if ($newline){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1288 my ($seq_id) = split (/\t/,$newline);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1289 die "The same seq ID occurred more than twice in a row\n" if ($seq_id eq $identifier);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1290 $fhs[$index]->{last_seq_id} = $seq_id;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1291 $fhs[$index]->{last_line} = $newline;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1292 next;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1293 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1294 else {
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1295 # assigning undef to last_seq_id and last_line and jumping to the next index (end of bowtie output)
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1296 $fhs[$index]->{last_seq_id} = undef;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1297 $fhs[$index]->{last_line} = undef;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1298 next;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1299 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1300 ### still within the 2nd sequence in correct orientation found
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1301 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1302 ### still within the 1st sequence in correct orientation found
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1303 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1304 ### still within the if (last_seq_id eq identifier) condition
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1305 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1306 ### still within foreach index loop
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1307 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1308 ### if there was not a single alignment found for a certain sequence we will continue with the next sequence in the sequence file
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1309 unless(%mismatches){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1310 $counting{no_single_alignment_found}++;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1311 if ($unmapped){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1312 return 1; ### We will print this sequence out as unmapped sequence if --un unmapped.out has been specified
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1313 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1314 else{
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1315 return;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1316 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1317 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1318 #######################################################################################################################################################
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1319 #######################################################################################################################################################
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1320 ### We are now looking if there is a unique best alignment for a certain sequence. This means we are sorting in ascending order and look at the ###
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1321 ### sequence with the lowest amount of mismatches. If there is only one single best position we are going to store the alignment information in the ###
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1322 ### meth_call variables, if there are multiple hits with the same amount of (lowest) mismatches we are discarding the sequence altogether ###
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1323 #######################################################################################################################################################
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1324 #######################################################################################################################################################
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1325 ### Going to use the variable $sequence_fails as a 'memory' if a sequence could not be aligned uniquely (set to 1 then)
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1326 my $sequence_fails = 0;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1327 ### Declaring an empty hash reference which will store all information we need for the methylation call
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1328 my $methylation_call_params; # hash reference!
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1329 ### sorting in ascending order
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1330 foreach my $mismatch_number (sort {$a<=>$b} keys %mismatches){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1331
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1332 ### if there is only 1 entry in the hash with the lowest number of mismatches we accept it as the best alignment
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1333 if (scalar keys %{$mismatches{$mismatch_number}} == 1){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1334 for my $unique_best_alignment (keys %{$mismatches{$mismatch_number}}){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1335 $methylation_call_params->{$identifier}->{bowtie_sequence} = $mismatches{$mismatch_number}->{$unique_best_alignment}->{bowtie_sequence};
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1336 $methylation_call_params->{$identifier}->{chromosome} = $mismatches{$mismatch_number}->{$unique_best_alignment}->{chromosome};
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1337 $methylation_call_params->{$identifier}->{position} = $mismatches{$mismatch_number}->{$unique_best_alignment}->{position};
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1338 $methylation_call_params->{$identifier}->{index} = $mismatches{$mismatch_number}->{$unique_best_alignment}->{index};
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1339 $methylation_call_params->{$identifier}->{number_of_mismatches} = $mismatch_number;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1340 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1341 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1342 elsif (scalar keys %{$mismatches{$mismatch_number}} == 3){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1343 ### If there are 3 sequences with the same number of lowest mismatches we can discriminate 2 cases: (i) all 3 alignments are unique best hits and
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1344 ### come from different alignment processes (== indices) or (ii) one sequence alignment (== index) will give a unique best alignment, whereas a
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1345 ### second one will produce 2 (or potentially many) alignments for the same sequence but in a different conversion state or against a different genome
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1346 ### version (or both). This becomes especially relevant for highly converted sequences in which all Cs have been converted to Ts in the bisulfite
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1347 ### reaction. E.g.
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1348 ### CAGTCACGCGCGCGCG will become
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1349 ### TAGTTATGTGTGTGTG in the CT transformed version, which will ideally still give the correct alignment in the CT->CT alignment condition.
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1350 ### If the same read will then become G->A transformed as well however, the resulting sequence will look differently and potentially behave
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1351 ### differently in a GA->GA alignment and this depends on the methylation state of the original sequence!:
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1352 ### G->A conversion:
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1353 ### highly methylated: CAATCACACACACACA
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1354 ### highly converted : TAATTATATATATATA <== this sequence has a reduced complexity (only 2 bases left and not 3), and it is more likely to produce
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1355 ### an alignment with a low complexity genomic region than the one above. This would normally lead to the entire sequence being kicked out as
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1356 ### there will be 3 alignments with the same number of lowest mismatches!! This in turn means that highly methylated and thereby not converted
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1357 ### sequences are more likely to pass the alignment step, thereby creating a bias for methylated reads compared to their non-methylated counterparts.
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1358 ### We do not want any bias, whatsoever. Therefore if we have 1 sequence producing a unique best alignment and the second and third conditions
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1359 ### producing alignments only after performing an additional (theoretical) conversion we want to keep the best alignment with the lowest number of
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1360 ### additional transliterations performed. Thus we want to have a look at the level of complexity of the sequences producing the alignment.
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1361 ### In the above example the number of transliterations required to transform the actual sequence
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1362 ### to the C->T version would be TAGTTATGTGTGTGTG -> TAGTTATGTGTGTGTG = 0; (assuming this gives the correct alignment)
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1363 ### in the G->A case it would be TAGTTATGTGTGTGTG -> TAATTATATATATATA = 6; (assuming this gives multiple wrong alignments)
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1364 ### if the sequence giving a unique best alignment required a lower number of transliterations than the second best sequence yielding alignments
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1365 ### while requiring a much higher number of transliterations, we are going to accept the unique best alignment with the lowest number of performed
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1366 ### transliterations. As a threshold which does scale we will start with: the number of transliterations of the lowest best match x 2 must still be
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1367 ### smaller than the number of transliterations of the second best sequence. Everything else will be flagged with $sequence_fails = 1 and discarded.
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1368 my @three_candidate_seqs;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1369 foreach my $composite_location (keys (%{$mismatches{$mismatch_number}}) ){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1370 my $transliterations_performed;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1371 if ($mismatches{$mismatch_number}->{$composite_location}->{index} == 0 or $mismatches{$mismatch_number}->{$composite_location}->{index} == 1){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1372 $transliterations_performed = determine_number_of_transliterations_performed($sequence,'CT');
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1373 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1374 elsif ($mismatches{$mismatch_number}->{$composite_location}->{index} == 2 or $mismatches{$mismatch_number}->{$composite_location}->{index} == 3){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1375 $transliterations_performed = determine_number_of_transliterations_performed($sequence,'GA');
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1376 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1377 else{
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1378 die "unexpected index number range $!\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1379 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1380 push @three_candidate_seqs,{
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1381 index =>$mismatches{$mismatch_number}->{$composite_location}->{index},
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1382 bowtie_sequence => $mismatches{$mismatch_number}->{$composite_location}->{bowtie_sequence},
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1383 mismatch_number => $mismatch_number,
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1384 chromosome => $mismatches{$mismatch_number}->{$composite_location}->{chromosome},
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1385 position => $mismatches{$mismatch_number}->{$composite_location}->{position},
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1386 seq_id => $mismatches{$mismatch_number}->{$composite_location}->{seq_id},
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1387 transliterations_performed => $transliterations_performed,
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1388 };
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1389 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1390 ### sorting in ascending order for the lowest number of transliterations performed
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1391 @three_candidate_seqs = sort {$a->{transliterations_performed} <=> $b->{transliterations_performed}} @three_candidate_seqs;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1392 my $first_array_element = $three_candidate_seqs[0]->{transliterations_performed};
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1393 my $second_array_element = $three_candidate_seqs[1]->{transliterations_performed};
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1394 my $third_array_element = $three_candidate_seqs[2]->{transliterations_performed};
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1395 # print "$first_array_element\t$second_array_element\t$third_array_element\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1396 if (($first_array_element*2) < $second_array_element){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1397 $counting{low_complexity_alignments_overruled_count}++;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1398 ### taking the index with the unique best hit and over ruling low complexity alignments with 2 hits
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1399 $methylation_call_params->{$identifier}->{bowtie_sequence} = $three_candidate_seqs[0]->{bowtie_sequence};
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1400 $methylation_call_params->{$identifier}->{chromosome} = $three_candidate_seqs[0]->{chromosome};
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1401 $methylation_call_params->{$identifier}->{position} = $three_candidate_seqs[0]->{position};
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1402 $methylation_call_params->{$identifier}->{index} = $three_candidate_seqs[0]->{index};
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1403 $methylation_call_params->{$identifier}->{number_of_mismatches} = $mismatch_number;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1404 # print "Overruled low complexity alignments! Using $first_array_element and disregarding $second_array_element and $third_array_element\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1405 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1406 else{
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1407 $sequence_fails = 1;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1408 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1409 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1410 else{
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1411 $sequence_fails = 1;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1412 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1413 ### after processing the alignment with the lowest number of mismatches we exit
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1414 last;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1415 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1416 ### skipping the sequence completely if there were multiple alignments with the same amount of lowest mismatches found at different positions
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1417 if ($sequence_fails == 1){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1418 $counting{unsuitable_sequence_count}++;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1419 if ($ambiguous){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1420 return 2; # => exits to next sequence, and prints it out to multiple_alignments.out if --ambiguous has been specified
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1421 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1422 if ($unmapped){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1423 return 1; # => exits to next sequence, and prints it out to unmapped.out if --un has been specified
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1424 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1425 else{
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1426 return 0; # => exits to next sequence (default)
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1427 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1428 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1429
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1430 ### --DIRECTIONAL
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1431 ### If the option --directional has been specified the user wants to consider only alignments to the original top strand or the original bottom strand. We will therefore
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1432 ### discard all alignments to strands complementary to the original strands, as they should not exist in reality due to the library preparation protocol
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1433 if ($directional){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1434 if ( ($methylation_call_params->{$identifier}->{index} == 2) or ($methylation_call_params->{$identifier}->{index} == 3) ){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1435 # warn "Alignment rejected! (index was: $methylation_call_params->{$identifier}->{index})\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1436 $counting{alignments_rejected_count}++;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1437 return 0;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1438 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1439 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1440
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1441 ### If the sequence has not been rejected so far it will have a unique best alignment
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1442 $counting{unique_best_alignment_count}++;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1443 if ($pbat){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1444 extract_corresponding_genomic_sequence_single_end_pbat($identifier,$methylation_call_params);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1445 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1446 else{
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1447 extract_corresponding_genomic_sequence_single_end($identifier,$methylation_call_params);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1448 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1449
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1450 ### check test to see if the genomic sequence we extracted has the same length as the observed sequence+2, and only then we perform the methylation call
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1451 if (length($methylation_call_params->{$identifier}->{unmodified_genomic_sequence}) != length($sequence)+2){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1452 warn "Chromosomal sequence could not be extracted for\t$identifier\t$methylation_call_params->{$identifier}->{chromosome}\t$methylation_call_params->{$identifier}->{position}\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1453 $counting{genomic_sequence_could_not_be_extracted_count}++;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1454 return 0;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1455 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1456
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1457 ### otherwise we are set to perform the actual methylation call
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1458 $methylation_call_params->{$identifier}->{methylation_call} = methylation_call($identifier,$sequence,$methylation_call_params->{$identifier}->{unmodified_genomic_sequence},$methylation_call_params->{$identifier}->{read_conversion});
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1459
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1460 print_bisulfite_mapping_result_single_end($identifier,$sequence,$methylation_call_params,$quality_value);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1461 return 0; ## otherwise 1 will be returned by default, which would print the sequence to unmapped.out
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1462 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1463
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1464 sub check_bowtie_results_single_end_bowtie2{
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1465 my ($sequence,$identifier,$quality_value) = @_;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1466
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1467
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1468 unless ($quality_value){ # FastA sequences get assigned a quality value of Phred 40 throughout
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1469 $quality_value = 'I'x(length$sequence);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1470 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1471
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1472 # as of version Bowtie 2 2.0.0 beta7, when input reads are unpaired, Bowtie 2 no longer removes the trailing /1 or /2 from the read name.
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1473 # $identifier =~ s/\/[1234567890]+$//; # some sequencers don't just have /1 or /2 at the end of read IDs
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1474 # print "sequence $sequence\nid $identifier\nquality: '$quality_value'\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1475
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1476 my $alignment_ambiguous = 0;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1477
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1478 my %alignments = ();
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1479
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1480 ### reading from the Bowtie 2 output filehandles
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1481 foreach my $index (0..$#fhs){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1482 # print "Index: $index\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1483 # print "$fhs[$index]->{last_line}\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1484 # print "$fhs[$index]->{last_seq_id}\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1485 # sleep (1);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1486 ### skipping this index if the last alignment has been set to undefined already (i.e. end of bowtie output)
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1487 next unless ($fhs[$index]->{last_line} and defined $fhs[$index]->{last_seq_id});
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1488
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1489 ### if the sequence we are currently looking at produced an alignment we are doing various things with it
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1490 # print "last seq id: $fhs[$index]->{last_seq_id} and identifier: $identifier\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1491
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1492 if ($fhs[$index]->{last_seq_id} eq $identifier) {
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1493 # SAM format specifications for Bowtie 2
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1494 # (1) Name of read that aligned
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1495 # (2) Sum of all applicable flags. Flags relevant to Bowtie are:
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1496 # 1 The read is one of a pair
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1497 # 2 The alignment is one end of a proper paired-end alignment
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1498 # 4 The read has no reported alignments
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1499 # 8 The read is one of a pair and has no reported alignments
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1500 # 16 The alignment is to the reverse reference strand
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1501 # 32 The other mate in the paired-end alignment is aligned to the reverse reference strand
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1502 # 64 The read is mate 1 in a pair
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1503 # 128 The read is mate 2 in a pair
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1504 # 256 The read has multiple mapping states
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1505 # (3) Name of reference sequence where alignment occurs (unmapped reads have a *)
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1506 # (4) 1-based offset into the forward reference strand where leftmost character of the alignment occurs (0 for unmapped reads)
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1507 # (5) Mapping quality (255 means MAPQ is not available)
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1508 # (6) CIGAR string representation of alignment (* if unavailable)
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1509 # (7) Name of reference sequence where mate's alignment occurs. Set to = if the mate's reference sequence is the same as this alignment's, or * if there is no mate.
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1510 # (8) 1-based offset into the forward reference strand where leftmost character of the mate's alignment occurs. Offset is 0 if there is no mate.
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1511 # (9) Inferred fragment size. Size is negative if the mate's alignment occurs upstream of this alignment. Size is 0 if there is no mate.
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1512 # (10) Read sequence (reverse-complemented if aligned to the reverse strand)
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1513 # (11) ASCII-encoded read qualities (reverse-complemented if the read aligned to the reverse strand). The encoded quality values are on the Phred quality scale and the encoding is ASCII-offset by 33 (ASCII char !), similarly to a FASTQ file.
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1514 # (12) Optional fields. Fields are tab-separated. bowtie2 outputs zero or more of these optional fields for each alignment, depending on the type of the alignment:
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1515 # AS:i:<N> Alignment score. Can be negative. Can be greater than 0 in --local mode (but not in --end-to-end mode). Only present if SAM record is for an aligned read.
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1516 # XS:i:<N> Alignment score for second-best alignment. Can be negative. Can be greater than 0 in --local mode (but not in --end-to-end mode). Only present if the SAM record is for an aligned read and more than one alignment was found for the read.
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1517 # YS:i:<N> Alignment score for opposite mate in the paired-end alignment. Only present if the SAM record is for a read that aligned as part of a paired-end alignment.
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1518 # XN:i:<N> The number of ambiguous bases in the reference covering this alignment. Only present if SAM record is for an aligned read.
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1519 # XM:i:<N> The number of mismatches in the alignment. Only present if SAM record is for an aligned read.
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1520 # XO:i:<N> The number of gap opens, for both read and reference gaps, in the alignment. Only present if SAM record is for an aligned read.
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1521 # XG:i:<N> The number of gap extensions, for both read and reference gaps, in the alignment. Only present if SAM record is for an aligned read.
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1522 # NM:i:<N> The edit distance; that is, the minimal number of one-nucleotide edits (substitutions, insertions and deletions) needed to transform the read string into the reference string. Only present if SAM record is for an aligned read.
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1523 # YF:Z:<N> String indicating reason why the read was filtered out. See also: Filtering. Only appears for reads that were filtered out.
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1524 # MD:Z:<S> A string representation of the mismatched reference bases in the alignment. See SAM format specification for details. Only present if SAM record is for an aligned read.
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1525
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1526 my ($id,$flag,$mapped_chromosome,$position,$mapping_quality,$cigar,$bowtie_sequence,$qual) = (split (/\t/,$fhs[$index]->{last_line}))[0,1,2,3,4,5,9,10];
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1527
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1528 ### If a sequence has no reported alignments there will be a single output line with a bit-wise flag value of 4. We can store the next alignment and move on to the next Bowtie 2 instance
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1529 if ($flag == 4){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1530 ## reading in the next alignment, which must be the next sequence
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1531 my $newline = $fhs[$index]->{fh}-> getline();
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1532 if ($newline){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1533 chomp $newline;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1534 my ($seq_id) = split (/\t/,$newline);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1535 $fhs[$index]->{last_seq_id} = $seq_id;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1536 $fhs[$index]->{last_line} = $newline;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1537 if ($seq_id eq $identifier){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1538 die "Sequence with ID $identifier did not produce any alignment, but next seq-ID was also $fhs[$index]->{last_seq_id}!\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1539 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1540 next; # next instance
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1541 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1542 else{
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1543 # assigning undef to last_seq_id and last_line and jumping to the next index (end of Bowtie 2 output)
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1544 $fhs[$index]->{last_seq_id} = undef;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1545 $fhs[$index]->{last_line} = undef;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1546 next;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1547 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1548 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1549
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1550 # if there are one or more proper alignments we can extract the chromosome number
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1551 my $chromosome;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1552 if ($mapped_chromosome =~ s/_(CT|GA)_converted$//){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1553 $chromosome = $mapped_chromosome;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1554 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1555 else{
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1556 die "Chromosome number extraction failed for $mapped_chromosome\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1557 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1558
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1559 ### We will use the optional field to determine the best alignment. Later on we extract the number of mismatches and/or indels from the CIGAR string
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1560 my ($alignment_score,$second_best,$MD_tag);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1561 my @fields = split (/\t/,$fhs[$index]->{last_line});
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1562
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1563 foreach (11..$#fields){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1564 if ($fields[$_] =~ /AS:i:(.*)/){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1565 $alignment_score = $1;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1566 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1567 elsif ($fields[$_] =~ /XS:i:(.*)/){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1568 $second_best = $1;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1569 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1570 elsif ($fields[$_] =~ /MD:Z:(.*)/){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1571 $MD_tag = $1;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1572 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1573 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1574
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1575 # warn "First best alignment_score is: '$alignment_score'\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1576 # warn "MD tag is: '$MD_tag'\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1577 die "Failed to extract alignment score ($alignment_score) and MD tag ($MD_tag)!\n" unless (defined $alignment_score and defined $MD_tag);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1578
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1579 if (defined $second_best){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1580 # warn "second best alignment_score is: '$second_best'\n\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1581
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1582 # If the first alignment score is the same as the alignment score of the second best hit we are going to boot this sequence altogether
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1583 if ($alignment_score == $second_best){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1584 $alignment_ambiguous = 1;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1585 ## need to read and discard all additional ambiguous reads until we reach the next sequence
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1586 until ($fhs[$index]->{last_seq_id} ne $identifier){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1587 my $newline = $fhs[$index]->{fh}-> getline();
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1588 if ($newline){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1589 chomp $newline;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1590 my ($seq_id) = split (/\t/,$newline);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1591 $fhs[$index]->{last_seq_id} = $seq_id;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1592 $fhs[$index]->{last_line} = $newline;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1593 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1594 else{
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1595 # assigning undef to last_seq_id and last_line and jumping to the next index (end of Bowtie 2 output)
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1596 $fhs[$index]->{last_seq_id} = undef;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1597 $fhs[$index]->{last_line} = undef;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1598 last; # break free in case we have reached the end of the alignment output
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1599 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1600 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1601 # warn "Index: $index\tThe current Seq-ID is $identifier, skipped all ambiguous sequences until the next ID which is: $fhs[$index]->{last_seq_id}\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1602 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1603 else{ # the next best alignment has a lower alignment score than the current read, so we can safely store the current alignment
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1604
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1605 my $alignment_location = join (":",$chromosome,$position);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1606
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1607 ### If a sequence aligns to exactly the same location with a perfect match twice the sequence does either not contain any C or G, or all the Cs (or Gs on the reverse
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1608 ### strand) were methylated and therefore protected. Alternatively it will align better in one condition than in the other. In any case, it is not needed to overwrite
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1609 ### the same positional entry with a second entry for the same location, as the genomic sequence extraction and methylation call would not be affected by this. The only
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1610 ### thing which would change is the index number for the found alignment. We will continue to assign these alignments to the first indexes 0 and 1, i.e. OT and OB
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1611
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1612 unless (exists $alignments{$alignment_location}){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1613 $alignments{$alignment_location}->{seq_id} = $id;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1614 $alignments{$alignment_location}->{alignment_score} = $alignment_score;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1615 $alignments{$alignment_location}->{bowtie_sequence} = $bowtie_sequence;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1616 $alignments{$alignment_location}->{index} = $index;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1617 $alignments{$alignment_location}->{chromosome} = $chromosome;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1618 $alignments{$alignment_location}->{position} = $position;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1619 $alignments{$alignment_location}->{CIGAR} = $cigar;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1620 $alignments{$alignment_location}->{MD_tag} = $MD_tag;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1621 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1622
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1623 ### now reading and discarding all (inferior) alignments of this sequencing read until we hit the next sequence
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1624 until ($fhs[$index]->{last_seq_id} ne $identifier){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1625 my $newline = $fhs[$index]->{fh}-> getline();
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1626 if ($newline){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1627 chomp $newline;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1628 my ($seq_id) = split (/\t/,$newline);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1629 $fhs[$index]->{last_seq_id} = $seq_id;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1630 $fhs[$index]->{last_line} = $newline;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1631 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1632 else{
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1633 # assigning undef to last_seq_id and last_line and jumping to the next index (end of Bowtie 2 output)
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1634 $fhs[$index]->{last_seq_id} = undef;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1635 $fhs[$index]->{last_line} = undef;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1636 last; # break free in case we have reached the end of the alignment output
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1637 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1638 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1639 # warn "Index: $index\tThe current Seq-ID is $identifier, skipped all ambiguous sequences until the next ID which is: $fhs[$index]->{last_seq_id}\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1640 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1641 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1642 else{ # there is no second best hit, so we can just store this one and read in the next sequence
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1643
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1644 my $alignment_location = join (":",$chromosome,$position);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1645
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1646 ### If a sequence aligns to exactly the same location with a perfect match twice the sequence does either not contain any C or G, or all the Cs (or Gs on the reverse
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1647 ### strand) were methylated and therefore protected. Alternatively it will align better in one condition than in the other. In any case, it is not needed to overwrite
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1648 ### the same positional entry with a second entry for the same location, as the genomic sequence extraction and methylation call would not be affected by this. The only
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1649 ### thing which would change is the index number for the found alignment. We will continue to assign these alignments to the first indexes 0 and 1, i.e. OT and OB
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1650
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1651 unless (exists $alignments{$alignment_location}){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1652 $alignments{$alignment_location}->{seq_id} = $id;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1653 $alignments{$alignment_location}->{alignment_score} = $alignment_score;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1654 $alignments{$alignment_location}->{bowtie_sequence} = $bowtie_sequence;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1655 $alignments{$alignment_location}->{index} = $index;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1656 $alignments{$alignment_location}->{chromosome} = $chromosome;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1657 $alignments{$alignment_location}->{position} = $position;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1658 $alignments{$alignment_location}->{MD_tag} = $MD_tag;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1659 $alignments{$alignment_location}->{CIGAR} = $cigar;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1660 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1661
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1662 my $newline = $fhs[$index]->{fh}-> getline();
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1663 if ($newline){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1664 chomp $newline;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1665 my ($seq_id) = split (/\t/,$newline);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1666 $fhs[$index]->{last_seq_id} = $seq_id;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1667 $fhs[$index]->{last_line} = $newline;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1668 if ($seq_id eq $identifier){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1669 die "Sequence with ID $identifier did not have a second best alignment, but next seq-ID was also $fhs[$index]->{last_seq_id}!\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1670 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1671 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1672 else{
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1673 # assigning undef to last_seq_id and last_line and jumping to the next index (end of Bowtie 2 output)
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1674 $fhs[$index]->{last_seq_id} = undef;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1675 $fhs[$index]->{last_line} = undef;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1676 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1677 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1678 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1679 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1680
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1681 ### if the read already produced several ambiguous alignments we can return right away. If --ambiguous or --unmapped was specified the read sequence will be printed out.
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1682 if ($alignment_ambiguous == 1){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1683 $counting{unsuitable_sequence_count}++;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1684 ### report that the sequence has multiple hits with bitwise flag 256. We can print the sequence to the result file straight away and skip everything else
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1685 # my $ambiguous_read_output = join("\t",$identifier,'256','*','0','0','*','*','0','0',$sequence,$quality_value);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1686 # print "$ambiguous_read_output\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1687
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1688 if ($ambiguous){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1689 return 2; # => exits to next sequence, and prints it out to _ambiguous_reads.txt if '--ambiguous' was specified
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1690 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1691 elsif ($unmapped){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1692 return 1; # => exits to next sequence, and prints it out to _unmapped_reads.txt if '--unmapped' but not '--ambiguous' was specified
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1693 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1694 else{
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1695 return 0;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1696 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1697 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1698
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1699 ### if there was no alignment found for a certain sequence at all we continue with the next sequence in the sequence file
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1700 unless(%alignments){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1701 $counting{no_single_alignment_found}++;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1702 # my $unmapped_read_output = join("\t",$identifier,'4','*','0','0','*','*','0','0',$sequence,$quality_value);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1703 # print "$unmapped_read_output\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1704 if ($unmapped){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1705 return 1; # => exits to next sequence, and prints it out to _unmapped_reads.txt if '--unmapped' was specified
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1706 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1707 else{
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1708 return 0; # default
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1709 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1710 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1711
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1712 #######################################################################################################################################################
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1713
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1714 ### If the sequence was not rejected so far we are now looking if there is a unique best alignment among all alignment instances. If there is only one
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1715 ### single best position we are going to store the alignment information in the $meth_call variable. If there are multiple hits with the same (highest)
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1716 ### alignment score we are discarding the sequence altogether.
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1717 ### For end-to-end alignments the maximum alignment score can be 0, each mismatch can receive penalties up to 6, and each gap receives penalties for
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1718 ### opening (5) and extending (3 per bp) the gap.
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1719
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1720 #######################################################################################################################################################
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1721
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1722 my $methylation_call_params; # hash reference which will store all information we need for the methylation call
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1723 my $sequence_fails = 0; # Going to use $sequence_fails as a 'memory' if a sequence could not be aligned uniquely (set to 1 then)
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1724
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1725 ### print contents of %alignments for debugging
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1726 # if (scalar keys %alignments > 1){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1727 # print "\n******\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1728 # foreach my $alignment_location (sort {$a cmp $b} keys %alignments){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1729 # print "Loc: $alignment_location\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1730 # print "ID: $alignments{$alignment_location}->{seq_id}\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1731 # print "AS: $alignments{$alignment_location}->{alignment_score}\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1732 # print "Seq: $alignments{$alignment_location}->{bowtie_sequence}\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1733 # print "Index $alignments{$alignment_location}->{index}\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1734 # print "Chr: $alignments{$alignment_location}->{chromosome}\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1735 # print "pos: $alignments{$alignment_location}->{position}\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1736 # print "MD: $alignments{$alignment_location}->{MD_tag}\n\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1737 # }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1738 # print "\n******\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1739 # }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1740
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1741 ### if there is only 1 entry in the hash we accept it as the best alignment
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1742 if (scalar keys %alignments == 1){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1743 for my $unique_best_alignment (keys %alignments){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1744 $methylation_call_params->{$identifier}->{bowtie_sequence} = $alignments{$unique_best_alignment}->{bowtie_sequence};
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1745 $methylation_call_params->{$identifier}->{chromosome} = $alignments{$unique_best_alignment}->{chromosome};
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1746 $methylation_call_params->{$identifier}->{position} = $alignments{$unique_best_alignment}->{position};
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1747 $methylation_call_params->{$identifier}->{index} = $alignments{$unique_best_alignment}->{index};
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1748 $methylation_call_params->{$identifier}->{alignment_score} = $alignments{$unique_best_alignment}->{alignment_score};
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1749 $methylation_call_params->{$identifier}->{MD_tag} = $alignments{$unique_best_alignment}->{MD_tag};
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1750 $methylation_call_params->{$identifier}->{CIGAR} = $alignments{$unique_best_alignment}->{CIGAR};
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1751 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1752 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1753
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1754 ### otherwise we are going to find out if there is a best match among the multiple alignments, or whether there are 2 or more equally good alignments (in which case
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1755 ### we boot the sequence altogether)
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1756 elsif (scalar keys %alignments >= 2 and scalar keys %alignments <= 4){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1757 my $best_alignment_score;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1758 my $best_alignment_location;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1759 foreach my $alignment_location (sort {$alignments{$b}->{alignment_score} <=> $alignments{$a}->{alignment_score}} keys %alignments){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1760 # print "$alignments{$alignment_location}->{alignment_score}\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1761 unless (defined $best_alignment_score){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1762 $best_alignment_score = $alignments{$alignment_location}->{alignment_score};
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1763 $best_alignment_location = $alignment_location;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1764 # print "setting best alignment score: $best_alignment_score\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1765 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1766 else{
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1767 ### if the second best alignment has the same alignment score as the first one, the sequence will get booted
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1768 if ($alignments{$alignment_location}->{alignment_score} == $best_alignment_score){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1769 # warn "Same alignment score, the sequence will get booted!\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1770 $sequence_fails = 1;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1771 last; # exiting after the second alignment since we know that the sequence has ambiguous alignments
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1772 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1773 ### else we are going to store the best alignment for further processing
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1774 else{
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1775 $methylation_call_params->{$identifier}->{bowtie_sequence} = $alignments{$best_alignment_location}->{bowtie_sequence};
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1776 $methylation_call_params->{$identifier}->{chromosome} = $alignments{$best_alignment_location}->{chromosome};
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1777 $methylation_call_params->{$identifier}->{position} = $alignments{$best_alignment_location}->{position};
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1778 $methylation_call_params->{$identifier}->{index} = $alignments{$best_alignment_location}->{index};
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1779 $methylation_call_params->{$identifier}->{alignment_score} = $alignments{$best_alignment_location}->{alignment_score};
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1780 $methylation_call_params->{$identifier}->{MD_tag} = $alignments{$best_alignment_location}->{MD_tag};
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1781 $methylation_call_params->{$identifier}->{CIGAR} = $alignments{$best_alignment_location}->{CIGAR};
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1782 last; # exiting after processing the second alignment since the sequence produced a unique best alignment
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1783 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1784 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1785 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1786 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1787 else{
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1788 die "There are too many potential hits for this sequence (1-4 expected, but found: ",scalar keys %alignments,")\n";;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1789 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1790
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1791 ### skipping the sequence completely if there were multiple alignments with the same best alignment score at different positions
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1792 if ($sequence_fails == 1){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1793 $counting{unsuitable_sequence_count}++;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1794
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1795 ### report that the sequence has multiple hits with bitwise flag 256. We can print the sequence to the result file straight away and skip everything else
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1796 # my $ambiguous_read_output = join("\t",$identifier,'256','*','0','0','*','*','0','0',$sequence,$quality_value);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1797 # print OUT "$ambiguous_read_output\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1798
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1799 if ($ambiguous){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1800 return 2; # => exits to next sequence, and prints it out (in FastQ format) to _ambiguous_reads.txt if '--ambiguous' was specified
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1801 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1802 elsif ($unmapped){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1803 return 1; # => exits to next sequence, and prints it out (in FastQ format) to _unmapped_reads.txt if '--unmapped' but not '--ambiguous' was specified
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1804 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1805 else{
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1806 return 0; # => exits to next sequence (default)
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1807 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1808 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1809
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1810 ### --DIRECTIONAL
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1811 ### If the option --directional has been specified the user wants to consider only alignments to the original top strand or the original bottom strand. We will therefore
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1812 ### discard all alignments to strands complementary to the original strands, as they should not exist in reality due to the library preparation protocol
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1813 if ($directional){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1814 if ( ($methylation_call_params->{$identifier}->{index} == 2) or ($methylation_call_params->{$identifier}->{index} == 3) ){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1815 # warn "Alignment rejected! (index was: $methylation_call_params->{$identifier}->{index})\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1816 $counting{alignments_rejected_count}++;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1817 return 0;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1818 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1819 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1820
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1821 ### If the sequence has not been rejected so far it has a unique best alignment
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1822 $counting{unique_best_alignment_count}++;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1823
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1824 ### Now we need to extract a genomic sequence that exactly corresponds to the reported alignment. This potentially means that we need to deal with insertions or deletions as well
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1825 extract_corresponding_genomic_sequence_single_end_bowtie2 ($identifier,$methylation_call_params);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1826
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1827 ### check test to see if the genomic sequence we extracted has the same length as the observed sequence+2, and only then we perform the methylation call
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1828 if (length($methylation_call_params->{$identifier}->{unmodified_genomic_sequence}) != length($sequence)+2){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1829 warn "Chromosomal sequence could not be extracted for\t$identifier\t$methylation_call_params->{$identifier}->{chromosome}\t$methylation_call_params->{$identifier}->{position}\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1830 $counting{genomic_sequence_could_not_be_extracted_count}++;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1831 return 0;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1832 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1833
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1834
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1835 ### otherwise we are set to perform the actual methylation call
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1836 $methylation_call_params->{$identifier}->{methylation_call} = methylation_call($identifier,$sequence,$methylation_call_params->{$identifier}->{unmodified_genomic_sequence},$methylation_call_params->{$identifier}->{read_conversion});
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1837 print_bisulfite_mapping_result_single_end_bowtie2 ($identifier,$sequence,$methylation_call_params,$quality_value);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1838 return 0; ## if a sequence got this far we do not want to print it to unmapped or ambiguous.out
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1839 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1840
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1841
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
### Counts the characters of $sequence that a transliteration in the given
### conversion mode would replace.
###
### Arguments:
###   $sequence        - the sequence string to inspect (only a copy is looked at,
###                      the caller's variable is never modified)
###   $read_conversion - 'CT' to count upper-case 'C' characters (what tr/C/T/ would replace),
###                      'GA' to count upper-case 'G' characters (what tr/G/A/ would replace)
###
### Returns the number of matching characters (0 if there are none).
### Dies if $read_conversion is undefined or anything other than 'CT' or 'GA'.
sub determine_number_of_transliterations_performed{
  my ($sequence,$read_conversion) = @_;

  ### an undefined mode is handled like any other unsupported value instead of
  ### triggering an 'uninitialized value' warning in the comparisons below
  $read_conversion = '' unless (defined $read_conversion);

  ### tr/// with an empty replacement list leaves the string untouched and simply
  ### returns the number of matching characters, which is all that is needed here
  if ($read_conversion eq 'CT'){
    return ($sequence =~ tr/C//);
  }
  if ($read_conversion eq 'GA'){
    return ($sequence =~ tr/G//);
  }

  ### no system call failed at this point, so $! must not be part of the message
  die "Read conversion mode of the read was not specified (expected 'CT' or 'GA')\n";
}
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1856
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
### Decides whether the alignment currently buffered for the filehandle entry $fhs[$index]
### belongs to the read $identifier and is in an acceptable orientation.
###
### Arguments:
###   $index      - index into the file-level @fhs array, selecting one alignment output stream
###   $identifier - ID of the read that is currently being analysed
###
### Returns 1 if $fhs[$index]->{last_line} holds an alignment of $identifier for which
### ensure_sensical_alignment_orientation_single_end() returned 1, and 0 otherwise.
###
### Side effects: may read up to two further lines from $fhs[$index]->{fh} and overwrites
### $fhs[$index]->{last_seq_id} and $fhs[$index]->{last_line} with the line that has to be looked
### at next (both are set to undef once the end of the alignment output has been reached).
###
### Dies if the orientation check returns something other than 1 or 0, or if the same read ID
### shows up in a third consecutive line.
sub decide_whether_single_end_alignment_is_valid{
  my ($index,$identifier) = @_;

  # extracting from Bowtie 1 format
  my ($id,$strand) = (split (/\t/,$fhs[$index]->{last_line}))[0,1];

  ### the sequence stored in @fhs as last_line is already the next sequence to be analysed -> analyse next round
  return 0 unless (($id eq $fhs[$index]->{last_seq_id}) and ($id eq $identifier));

  ### checking the orientation of the alignment. The helper is expected to return 1 for a sensible
  ### orientation and 0 for one that has to be discarded
  my $orientation = ensure_sensical_alignment_orientation_single_end ($index,$strand);

  ### 1st possibility for a sequence to pass
  return 1 if ($orientation == 1);
  die "The orientation of the alignment must be either correct or incorrect\n" unless ($orientation == 0);

  ### The alignment was in the wrong orientation, so we need to read in a new line. Testing for
  ### definedness (and not truth) so that only a real end of file counts as 'no more lines'
  my $newline = $fhs[$index]->{fh}->getline();
  unless (defined $newline){
    # assigning undef to last_seq_id and last_line (end of bowtie output)
    $fhs[$index]->{last_seq_id} = undef;
    $fhs[$index]->{last_line} = undef;
    return 0; # not processing anything as the alignment currently stored in last_line was in the wrong orientation
  }

  ($id,$strand) = (split (/\t/,$newline))[0,1];

  ### the sequence we just read in is already the next sequence to be analysed -> store it in @fhs
  unless ($id eq $identifier){
    $fhs[$index]->{last_seq_id} = $id;
    $fhs[$index]->{last_line} = $newline;
    return 0; # processing the new alignment result only in the next round
  }

  ### the next entry is still the correct sequence -> checking orientation again
  $orientation = ensure_sensical_alignment_orientation_single_end ($index,$strand);
  if ($orientation == 1){
    $fhs[$index]->{last_seq_id} = $id;
    $fhs[$index]->{last_line} = $newline;
    return 1; ### 2nd possibility for a sequence to pass
  }
  die "The orientation of the alignment must be either correct or incorrect\n" unless ($orientation == 0);

  ### The alignment was in the wrong orientation again, so we need to read in yet another new line and store it in @fhs
  $newline = $fhs[$index]->{fh}->getline();
  unless (defined $newline){
    # assigning undef to last_seq_id and last_line (end of bowtie output)
    $fhs[$index]->{last_seq_id} = undef;
    $fhs[$index]->{last_line} = undef;
    return 0; # not processing anything as the alignment currently stored in last_line was in the wrong orientation
  }

  ### check if the next line still has the same seq ID (must not happen), and if not overwrite the current seq-ID and
  ### bowtie output with the same fields of the just read next entry. No system call failed here, so $! is not reported
  my ($seq_id) = split (/\t/,$newline);
  die "Same seq ID 3 or more times in a row!(should be 2 max)" if ($seq_id eq $identifier);

  $fhs[$index]->{last_seq_id} = $seq_id;
  $fhs[$index]->{last_line} = $newline;
  return 0; # not processing anything this round as the alignment currently stored in last_line was in the wrong orientation
}
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1934 #########################
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1935 ### BOWTIE 1 | PAIRED-END
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1936 #########################
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1937
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1938 sub check_bowtie_results_paired_ends{
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1939 my ($sequence_1,$sequence_2,$identifier,$quality_value_1,$quality_value_2) = @_;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1940
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1941 ### quality values are not given for FastA files, so they are initialised with a Phred quality of 40
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1942 unless ($quality_value_1){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1943 $quality_value_1 = 'I'x(length$sequence_1);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1944 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1945 unless ($quality_value_2){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1946 $quality_value_2 = 'I'x(length$sequence_2);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1947 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1948
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1949 # warn "$identifier\n$fhs[0]->{last_seq_id}\n$fhs[1]->{last_seq_id}\n$fhs[2]->{last_seq_id}\n$fhs[3]->{last_seq_id}\n\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1950 # sleep (1);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1951 my %mismatches = ();
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1952 ### reading from the bowtie output files to see if this sequence pair aligned to a bisulfite converted genome
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1953
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1954
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1955 ### for paired end reads we are reporting alignments to the OT strand first (index 0), then the OB strand (index 3!!), similar to the single end way.
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1956 ### alignments to the complementary strands are reported afterwards (CTOT got index 1, and CTOB got index 2).
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1957 ### This is needed so that alignments which either contain no single C or G or reads which contain only protected Cs are reported to the original strands (OT and OB)
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1958 ### Before the complementary strands. Remember that it does not make any difference for the methylation calls, but it will matter if alignment to the complementary
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1959 ### strands are not being reported by specifying --directional
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1960
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1961 foreach my $index (0,3,1,2){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1962 ### skipping this index if the last alignment has been set to undefined already (i.e. end of bowtie output)
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1963 next unless ($fhs[$index]->{last_line_1} and $fhs[$index]->{last_line_2} and defined $fhs[$index]->{last_seq_id});
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1964 ### if the sequence pair we are currently looking at produced an alignment we are doing various things with it
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1965 if ($fhs[$index]->{last_seq_id} eq $identifier) {
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1966 # print "$identifier\n$fhs[$index]->{last_seq_id}\n\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1967
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1968 ##################################################################################
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1969 ### STEP I Processing the entry which is stored in last_line_1 and last_line_2 ###
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1970 ##################################################################################
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1971 my $valid_alignment_found = decide_whether_paired_end_alignment_is_valid($index,$identifier);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1972 ### sequences can fail at this point if there was only 1 alignment in the wrong orientation, or if there were 2 alignments both in the wrong
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1973 ### orientation. We only continue to extract useful information about this alignment if 1 was returned
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1974 if ($valid_alignment_found == 1){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1975 ### Bowtie outputs which made it this far are in the correct orientation, so we can continue to analyse the alignment itself.
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1976 ### we store the useful information in %mismatches
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1977 my ($id_1,$strand_1,$mapped_chromosome_1,$position_1,$bowtie_sequence_1,$mismatch_info_1) = (split (/\t/,$fhs[$index]->{last_line_1},-1))[0,1,2,3,4,7];
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1978 my ($id_2,$strand_2,$mapped_chromosome_2,$position_2,$bowtie_sequence_2,$mismatch_info_2) = (split (/\t/,$fhs[$index]->{last_line_2},-1))[0,1,2,3,4,7];
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1979 chomp $mismatch_info_1;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1980 chomp $mismatch_info_2;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1981
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1982 ### need to extract the chromosome number from the bowtie output (which is either XY_CT_converted or XY_GA_converted
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1983 my ($chromosome_1,$chromosome_2);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1984 if ($mapped_chromosome_1 =~ s/_(CT|GA)_converted$//){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1985 $chromosome_1 = $mapped_chromosome_1;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1986 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1987 else{
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1988 die "Chromosome number extraction failed for $mapped_chromosome_1\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1989 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1990 if ($mapped_chromosome_2 =~ s/_(CT|GA)_converted$//){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1991 $chromosome_2 = $mapped_chromosome_2;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1992 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1993 else{
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1994 die "Chromosome number extraction failed for $mapped_chromosome_2\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1995 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1996
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1997 ### Now extracting the number of mismatches to the converted genome
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1998 my $number_of_mismatches_1;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1999 my $number_of_mismatches_2;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2000 if ($mismatch_info_1 eq ''){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2001 $number_of_mismatches_1 = 0;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2002 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2003 elsif ($mismatch_info_1 =~ /^\d/){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2004 my @mismatches = split (/,/,$mismatch_info_1);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2005 $number_of_mismatches_1 = scalar @mismatches;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2006 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2007 else{
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2008 die "Something weird is going on with the mismatch field\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2009 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2010 if ($mismatch_info_2 eq ''){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2011 $number_of_mismatches_2 = 0;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2012 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2013 elsif ($mismatch_info_2 =~ /^\d/){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2014 my @mismatches = split (/,/,$mismatch_info_2);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2015 $number_of_mismatches_2 = scalar @mismatches;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2016 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2017 else{
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2018 die "Something weird is going on with the mismatch field\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2019 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2020 ### To decide whether a sequence pair has a unique best alignment we will look at the lowest sum of mismatches from both alignments
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2021 my $sum_of_mismatches = $number_of_mismatches_1+$number_of_mismatches_2;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2022 ### creating a composite location variable from $chromosome and $position and storing the alignment information in a temporary hash table
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2023 die "Position 1 is higher than position 2" if ($position_1 > $position_2);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2024 die "Paired-end alignments need to be on the same chromosome\n" unless ($chromosome_1 eq $chromosome_2);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2025 my $alignment_location = join(":",$chromosome_1,$position_1,$position_2);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2026 ### If a sequence aligns to exactly the same location twice the sequence does either not contain any C or G, or all the Cs (or Gs on the reverse
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2027 ### strand) were methylated and therefore protected. It is not needed to overwrite the same positional entry with a second entry for the same
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2028 ### location (the genomic sequence extraction and methylation would not be affected by this, only the thing which would change is the index
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2029 ### number for the found alignment)
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2030 unless (exists $mismatches{$sum_of_mismatches}->{$alignment_location}){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2031 $mismatches{$sum_of_mismatches}->{$alignment_location}->{seq_id}=$id_1; # either is fine
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2032 $mismatches{$sum_of_mismatches}->{$alignment_location}->{bowtie_sequence_1}=$bowtie_sequence_1;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2033 $mismatches{$sum_of_mismatches}->{$alignment_location}->{bowtie_sequence_2}=$bowtie_sequence_2;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2034 $mismatches{$sum_of_mismatches}->{$alignment_location}->{index}=$index;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2035 $mismatches{$sum_of_mismatches}->{$alignment_location}->{chromosome}=$chromosome_1; # either is fine
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2036 $mismatches{$sum_of_mismatches}->{$alignment_location}->{start_seq_1}=$position_1;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2037 $mismatches{$sum_of_mismatches}->{$alignment_location}->{start_seq_2}=$position_2;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2038 $mismatches{$sum_of_mismatches}->{$alignment_location}->{number_of_mismatches_1} = $number_of_mismatches_1;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2039 $mismatches{$sum_of_mismatches}->{$alignment_location}->{number_of_mismatches_2} = $number_of_mismatches_2;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2040 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2041 ###################################################################################################################################################
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2042 ### STEP II Now reading in the next 2 lines from the bowtie filehandle. If there are 2 next lines in the alignments filehandle it can either ###
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2043 ### be a second alignment of the same sequence pair or a new sequence pair. In any case we will just add it to last_line_1 and last_line _2. ###
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2044 ### If it is the alignment of the next sequence pair, 0 will be returned as $valid_alignment_found, so it will not be processed any further in ###
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2045 ### this round ###
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2046 ###################################################################################################################################################
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2047 my $newline_1 = $fhs[$index]->{fh}-> getline();
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2048 my $newline_2 = $fhs[$index]->{fh}-> getline();
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2049
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2050 if ($newline_1 and $newline_2){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2051 my ($seq_id_1) = split (/\t/,$newline_1);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2052 my ($seq_id_2) = split (/\t/,$newline_2);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2053
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2054 if ($seq_id_1 =~ s/\/1$//){ # removing the read /1 tag
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2055 $fhs[$index]->{last_seq_id} = $seq_id_1;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2056 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2057 elsif ($seq_id_2 =~ s/\/1$//){ # removing the read /1 tag
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2058 $fhs[$index]->{last_seq_id} = $seq_id_2;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2059 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2060 else{
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2061 die "Either read 1 or read 2 needs to end on '/1'\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2062 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2063
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2064 $fhs[$index]->{last_line_1} = $newline_1;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2065 $fhs[$index]->{last_line_2} = $newline_2;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2066 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2067 else {
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2068 # assigning undef to last_seq_id and both last_lines and jumping to the next index (end of bowtie output)
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2069 $fhs[$index]->{last_seq_id} = undef;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2070 $fhs[$index]->{last_line_1} = undef;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2071 $fhs[$index]->{last_line_2} = undef;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2072 next; # jumping to the next index
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2073 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2074 ### Now processing the entry we just stored in last_line_1 and last_line_2
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2075 $valid_alignment_found = decide_whether_paired_end_alignment_is_valid($index,$identifier);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2076 ### only processing the alignment further if 1 was returned. 0 will be returned either if the alignment is already the next sequence pair to
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2077 ### be analysed or if it was a second alignment of the current sequence pair but in the wrong orientation
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2078 if ($valid_alignment_found == 1){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2079 ### we store the useful information in %mismatches
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2080 ($id_1,$strand_1,$mapped_chromosome_1,$position_1,$bowtie_sequence_1,$mismatch_info_1) = (split (/\t/,$fhs[$index]->{last_line_1}))[0,1,2,3,4,7];
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2081 ($id_2,$strand_2,$mapped_chromosome_2,$position_2,$bowtie_sequence_2,$mismatch_info_2) = (split (/\t/,$fhs[$index]->{last_line_2}))[0,1,2,3,4,7];
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2082 chomp $mismatch_info_1;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2083 chomp $mismatch_info_2;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2084 ### need to extract the chromosome number from the bowtie output (which is either _CT_converted or _GA_converted)
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2085 if ($mapped_chromosome_1 =~ s/_(CT|GA)_converted$//){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2086 $chromosome_1 = $mapped_chromosome_1;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2087 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2088 else{
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2089 die "Chromosome number extraction failed for $mapped_chromosome_1\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2090 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2091 if ($mapped_chromosome_2 =~ s/_(CT|GA)_converted$//){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2092 $chromosome_2 = $mapped_chromosome_2;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2093 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2094 else{
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2095 die "Chromosome number extraction failed for $mapped_chromosome_2\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2096 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2097
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2098 $number_of_mismatches_1='';
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2099 $number_of_mismatches_2='';
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2100 ### Now extracting the number of mismatches to the converted genome
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2101 if ($mismatch_info_1 eq ''){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2102 $number_of_mismatches_1 = 0;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2103 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2104 elsif ($mismatch_info_1 =~ /^\d/){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2105 my @mismatches = split (/,/,$mismatch_info_1);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2106 $number_of_mismatches_1 = scalar @mismatches;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2107 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2108 else{
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2109 die "Something weird is going on with the mismatch field\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2110 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2111 if ($mismatch_info_2 eq ''){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2112 $number_of_mismatches_2 = 0;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2113 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2114 elsif ($mismatch_info_2 =~ /^\d/){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2115 my @mismatches = split (/,/,$mismatch_info_2);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2116 $number_of_mismatches_2 = scalar @mismatches;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2117 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2118 else{
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2119 die "Something weird is going on with the mismatch field\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2120 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2121 ### To decide whether a sequence pair has a unique best alignment we will look at the lowest sum of mismatches from both alignments
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2122 $sum_of_mismatches = $number_of_mismatches_1+$number_of_mismatches_2;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2123 ### creating a composite location variable from $chromosome and $position and storing the alignment information in a temporary hash table
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2124 die "position 1 is greater than position 2" if ($position_1 > $position_2);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2125 die "Paired-end alignments need to be on the same chromosome\n" unless ($chromosome_1 eq $chromosome_2);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2126 $alignment_location = join(":",$chromosome_1,$position_1,$position_2);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2127 ### If a sequence aligns to exactly the same location twice the sequence does either not contain any C or G, or all the Cs (or Gs on the reverse
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2128 ### strand) were methylated and therefore protected. It is not needed to overwrite the same positional entry with a second entry for the same
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2129 ### location (the genomic sequence extraction and methylation would not be affected by this, the only thing which would change is the index
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2130 ### number for the found alignment)
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2131 unless (exists $mismatches{$sum_of_mismatches}->{$alignment_location}){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2132 $mismatches{$sum_of_mismatches}->{$alignment_location}->{seq_id}=$id_1; # either is fine
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2133 $mismatches{$sum_of_mismatches}->{$alignment_location}->{bowtie_sequence_1}=$bowtie_sequence_1;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2134 $mismatches{$sum_of_mismatches}->{$alignment_location}->{bowtie_sequence_2}=$bowtie_sequence_2;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2135 $mismatches{$sum_of_mismatches}->{$alignment_location}->{index}=$index;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2136 $mismatches{$sum_of_mismatches}->{$alignment_location}->{chromosome}=$chromosome_1; # either is fine
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2137 $mismatches{$sum_of_mismatches}->{$alignment_location}->{start_seq_1}=$position_1;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2138 $mismatches{$sum_of_mismatches}->{$alignment_location}->{start_seq_2}=$position_2;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2139 $mismatches{$sum_of_mismatches}->{$alignment_location}->{number_of_mismatches_1} = $number_of_mismatches_1;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2140 $mismatches{$sum_of_mismatches}->{$alignment_location}->{number_of_mismatches_2} = $number_of_mismatches_2;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2141 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2142 ###############################################################################################################################################
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2143 ### STEP III Now reading in two more lines. These have to be the next entry and we will just assign them to last_line_1 and last_line_2     ###
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2144 ###############################################################################################################################################
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2145 $newline_1 = $fhs[$index]->{fh}-> getline();
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2146 $newline_2 = $fhs[$index]->{fh}-> getline();
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2147
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2148 if ($newline_1 and $newline_2){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2149 my ($seq_id_1) = split (/\t/,$newline_1);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2150 my ($seq_id_2) = split (/\t/,$newline_2);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2151
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2152 if ($seq_id_1 =~ s/\/1$//){ # removing the read /1 tag
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2153 $fhs[$index]->{last_seq_id} = $seq_id_1;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2154 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2155 if ($seq_id_2 =~ s/\/1$//){ # removing the read /1 tag
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2156 $fhs[$index]->{last_seq_id} = $seq_id_2;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2157 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2158 $fhs[$index]->{last_line_1} = $newline_1;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2159 $fhs[$index]->{last_line_2} = $newline_2;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2160 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2161 else {
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2162 # assigning undef to last_seq_id and both last_lines and jumping to the next index (end of bowtie output)
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2163 $fhs[$index]->{last_seq_id} = undef;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2164 $fhs[$index]->{last_line_1} = undef;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2165 $fhs[$index]->{last_line_2} = undef;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2166 next; # jumping to the next index
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2167 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2168 ### within the 2nd sequence pair alignment in correct orientation found
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2169 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2170 ### within the 1st sequence pair alignment in correct orientation found
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2171 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2172 ### still within the (last_seq_id eq identifier) condition
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2173 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2174 ### still within foreach index loop
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2175 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2176 ### if there was no single alignment found for a certain sequence we will continue with the next sequence in the sequence file
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2177 unless(%mismatches){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2178 $counting{no_single_alignment_found}++;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2179 return 1; ### We will print this sequence out as unmapped sequence if --un unmapped.out has been specified
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2180 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2181 ### Going to use the variable $sequence_pair_fails as a 'memory' if a sequence could not be aligned uniquely (set to 1 then)
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2182 my $sequence_pair_fails = 0;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2183 ### Declaring an empty hash reference which will store all information we need for the methylation call
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2184 my $methylation_call_params; # hash reference!
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2185 ### We are now looking if there is a unique best alignment for a certain sequence. This means we are sorting in ascending order and look at the
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2186 ### sequence with the lowest amount of mismatches. If there is only one single best position we are going to store the alignment information in the
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2187 ### meth_call variables, if there are multiple hits with the same amount of (lowest) mismatches we are discarding the sequence altogether
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2188 foreach my $mismatch_number (sort {$a<=>$b} keys %mismatches){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2189 #dev print "Number of mismatches: $mismatch_number\t$identifier\t$sequence_1\t$sequence_2\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2190 foreach my $entry (keys (%{$mismatches{$mismatch_number}}) ){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2191 #dev print "$mismatch_number\t$entry\t$mismatches{$mismatch_number}->{$entry}->{index}\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2192 # print join("\t",$mismatch_number,$mismatches{$mismatch_number}->{$entry}->{seq_id},$sequence,$mismatches{$mismatch_number}->{$entry}->{bowtie_sequence},$mismatches{$mismatch_number}->{$entry}->{chromosome},$mismatches{$mismatch_number}->{$entry}->{position},$mismatches{$mismatch_number}->{$entry}->{index}),"\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2193 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2194 if (scalar keys %{$mismatches{$mismatch_number}} == 1){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2195 # print "Unique best alignment for sequence pair $sequence_1\t$sequence_1\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2196 for my $unique_best_alignment (keys %{$mismatches{$mismatch_number}}){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2197 $methylation_call_params->{$identifier}->{seq_id} = $identifier;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2198 $methylation_call_params->{$identifier}->{bowtie_sequence_1} = $mismatches{$mismatch_number}->{$unique_best_alignment}->{bowtie_sequence_1};
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2199 $methylation_call_params->{$identifier}->{bowtie_sequence_2} = $mismatches{$mismatch_number}->{$unique_best_alignment}->{bowtie_sequence_2};
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2200 $methylation_call_params->{$identifier}->{chromosome} = $mismatches{$mismatch_number}->{$unique_best_alignment}->{chromosome};
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2201 $methylation_call_params->{$identifier}->{start_seq_1} = $mismatches{$mismatch_number}->{$unique_best_alignment}->{start_seq_1};
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2202 $methylation_call_params->{$identifier}->{start_seq_2} = $mismatches{$mismatch_number}->{$unique_best_alignment}->{start_seq_2};
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2203 $methylation_call_params->{$identifier}->{alignment_end} = ($mismatches{$mismatch_number}->{$unique_best_alignment}->{start_seq_2}+length($mismatches{$mismatch_number}->{$unique_best_alignment}->{bowtie_sequence_2}));
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2204 $methylation_call_params->{$identifier}->{index} = $mismatches{$mismatch_number}->{$unique_best_alignment}->{index};
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2205 $methylation_call_params->{$identifier}->{number_of_mismatches_1} = $mismatches{$mismatch_number}->{$unique_best_alignment}->{number_of_mismatches_1};
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2206 $methylation_call_params->{$identifier}->{number_of_mismatches_2} = $mismatches{$mismatch_number}->{$unique_best_alignment}->{number_of_mismatches_2};
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2207 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2208 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2209 else{
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2210 $sequence_pair_fails = 1;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2211 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2212 ### after processing the alignment with the lowest number of mismatches we exit
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2213 last;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2214 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2215 ### skipping the sequence completely if there were multiple alignments with the same amount of lowest mismatches found at different positions
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2216 if ($sequence_pair_fails == 1){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2217 $counting{unsuitable_sequence_count}++;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2218 if ($ambiguous){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2219 return 2; # => exits to next sequence pair, and prints both seqs out to multiple_alignments_1 and -2 if --ambiguous has been specified
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2220 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2221 if ($unmapped){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2222 return 1; # => exits to next sequence pair, and prints both seqs out to unmapped_1 and _2 if --un has been specified
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2223 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2224 else{
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2225 return 0; # => exits to next sequence (default)
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2226 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2227 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2228
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2229 ### --DIRECTIONAL
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2230 ### If the option --directional has been specified the user wants to consider only alignments to the original top strand or the original bottom strand. We will therefore
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2231 ### discard all alignments to strands complementary to the original strands, as they should not exist in reality due to the library preparation protocol
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2232 if ($directional){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2233 if ( ($methylation_call_params->{$identifier}->{index} == 1) or ($methylation_call_params->{$identifier}->{index} == 2) ){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2234 # warn "Alignment rejected! (index was: $methylation_call_params->{$identifier}->{index})\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2235 $counting{alignments_rejected_count}++;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2236 return 0;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2237 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2238 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2239
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2240 ### If the sequence has not been rejected so far it does have a unique best alignment
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2241 $counting{unique_best_alignment_count}++;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2242 extract_corresponding_genomic_sequence_paired_ends($identifier,$methylation_call_params);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2243
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2244 ### check whether the genomic sequences we extracted have the same length as the observed sequences +2, and only then do we perform the methylation call
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2245 if (length($methylation_call_params->{$identifier}->{unmodified_genomic_sequence_1}) != length($sequence_1)+2){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2246 warn "Chromosomal sequence could not be extracted for\t$identifier\t$methylation_call_params->{$identifier}->{chromosome}\t$methylation_call_params->{$identifier}->{start_seq_1}\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2247 $counting{genomic_sequence_could_not_be_extracted_count}++;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2248 return 0;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2249 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2250 if (length($methylation_call_params->{$identifier}->{unmodified_genomic_sequence_2}) != length($sequence_2)+2){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2251 warn "Chromosomal sequence could not be extracted for\t$identifier\t$methylation_call_params->{$identifier}->{chromosome}\t$methylation_call_params->{$identifier}->{start_seq_2}\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2252 $counting{genomic_sequence_could_not_be_extracted_count}++;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2253 return 0;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2254 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2255
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2256 ### otherwise we are set to perform the actual methylation call
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2257 $methylation_call_params->{$identifier}->{methylation_call_1} = methylation_call($identifier,$sequence_1,$methylation_call_params->{$identifier}->{unmodified_genomic_sequence_1},$methylation_call_params->{$identifier}->{read_conversion_1});
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2258 $methylation_call_params->{$identifier}->{methylation_call_2} = methylation_call($identifier,$sequence_2,$methylation_call_params->{$identifier}->{unmodified_genomic_sequence_2},$methylation_call_params->{$identifier}->{read_conversion_2});
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2259
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2260 print_bisulfite_mapping_results_paired_ends($identifier,$sequence_1,$sequence_2,$methylation_call_params,$quality_value_1,$quality_value_2);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2261 return 0; ## otherwise 1 will be returned by default, which would print the sequence pair to unmapped_1 and _2
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2262 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2263
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2264 #########################
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2265 ### BOWTIE 2 | PAIRED-END
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2266 #########################
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2267
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2268 sub check_bowtie_results_paired_ends_bowtie2{
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2269 my ($sequence_1,$sequence_2,$identifier,$quality_value_1,$quality_value_2) = @_;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2270
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2271 ### quality values are not given for FastA files, so they are initialised with a Phred quality of 40
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2272 unless ($quality_value_1){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2273 $quality_value_1 = 'I'x(length$sequence_1);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2274 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2275
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2276 unless ($quality_value_2){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2277 $quality_value_2 = 'I'x(length$sequence_2);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2278 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2279
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2280
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2281 # print "$identifier\n$fhs[0]->{last_seq_id}\n$fhs[1]->{last_seq_id}\n$fhs[2]->{last_seq_id}\n$fhs[3]->{last_seq_id}\n\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2282
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2283
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2284 my %alignments;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2285 my $alignment_ambiguous = 0;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2286
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2287 ### reading from the Bowtie 2 output filehandles
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2288
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2289 ### for paired end reads we are reporting alignments to the OT strand first (index 0), then the OB strand (index 3!!), similar to the single end way.
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2290 ### alignments to the complementary strands are reported afterwards (CTOT got index 1, and CTOB got index 2).
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2291 ### This is needed so that alignments which either contain no single C or G or reads which contain only protected Cs are reported to the original strands (OT and OB)
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2292 ### before the complementary strands. Remember that it does not make any difference for the methylation calls, but it will matter if alignments to the complementary
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2293 ### strands are not being reported when '--directional' is specified
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2294
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2295 foreach my $index (0,3,1,2){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2296 ### skipping this index if the last alignment has been set to undefined already (i.e. end of bowtie output)
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2297 next unless ($fhs[$index]->{last_line_1} and $fhs[$index]->{last_line_2} and defined $fhs[$index]->{last_seq_id});
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2298
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2299 ### if the sequence pair we are currently looking at produced an alignment we are doing various things with it
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2300 if ($fhs[$index]->{last_seq_id} eq $identifier) {
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2301
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2302 my ($id_1,$flag_1,$mapped_chromosome_1,$position_1,$mapping_quality_1,$cigar_1,$bowtie_sequence_1,$qual_1) = (split (/\t/,$fhs[$index]->{last_line_1}))[0,1,2,3,4,5,9,10];
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2303 my ($id_2,$flag_2,$mapped_chromosome_2,$position_2,$mapping_quality_2,$cigar_2,$bowtie_sequence_2,$qual_2) = (split (/\t/,$fhs[$index]->{last_line_2}))[0,1,2,3,4,5,9,10];
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2304 # print "Index: $index\t$fhs[$index]->{last_line_1}\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2305 # print "Index: $index\t$fhs[$index]->{last_line_2}\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2306 # print join ("\t",$id_1,$flag_1,$mapped_chromosome_1,$position_1,$mapping_quality_1,$cigar_1,$bowtie_sequence_1,$qual_1),"\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2307 # print join ("\t",$id_2,$flag_2,$mapped_chromosome_2,$position_2,$mapping_quality_2,$cigar_2,$bowtie_sequence_2,$qual_2),"\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2308 $id_1 =~ s/\/1$//;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2309 $id_2 =~ s/\/2$//;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2310
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2311 # SAM format specifications for Bowtie 2
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2312 # (1) Name of read that aligned
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2313 # (2) Sum of all applicable flags. Flags relevant to Bowtie are:
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2314 # 1 The read is one of a pair
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2315 # 2 The alignment is one end of a proper paired-end alignment
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2316 # 4 The read has no reported alignments
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2317 # 8 The read is one of a pair and has no reported alignments
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2318 # 16 The alignment is to the reverse reference strand
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2319 # 32 The other mate in the paired-end alignment is aligned to the reverse reference strand
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2320 # 64 The read is mate 1 in a pair
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2321 # 128 The read is mate 2 in a pair
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2322 # 256 The read has multiple mapping states
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2323 # (3) Name of reference sequence where alignment occurs (unmapped reads have a *)
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2324 # (4) 1-based offset into the forward reference strand where leftmost character of the alignment occurs (0 for unmapped reads)
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2325 # (5) Mapping quality (255 means MAPQ is not available)
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2326 # (6) CIGAR string representation of alignment (* if unavailable)
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2327 # (7) Name of reference sequence where mate's alignment occurs. Set to = if the mate's reference sequence is the same as this alignment's, or * if there is no mate.
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2328 # (8) 1-based offset into the forward reference strand where leftmost character of the mate's alignment occurs. Offset is 0 if there is no mate.
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2329 # (9) Inferred fragment size. Size is negative if the mate's alignment occurs upstream of this alignment. Size is 0 if there is no mate.
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2330 # (10) Read sequence (reverse-complemented if aligned to the reverse strand)
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2331 # (11) ASCII-encoded read qualities (reverse-complemented if the read aligned to the reverse strand). The encoded quality values are on the Phred quality scale and the encoding is ASCII-offset by 33 (ASCII char !), similarly to a FASTQ file.
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2332 # (12) Optional fields. Fields are tab-separated. bowtie2 outputs zero or more of these optional fields for each alignment, depending on the type of the alignment:
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2333 # AS:i:<N> Alignment score. Can be negative. Can be greater than 0 in --local mode (but not in --end-to-end mode). Only present if SAM record is for an aligned read.
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2334 # XS:i:<N> Alignment score for second-best alignment. Can be negative. Can be greater than 0 in --local mode (but not in --end-to-end mode). Only present if the SAM record is for an aligned read and more than one alignment was found for the read.
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2335 # YS:i:<N> Alignment score for opposite mate in the paired-end alignment. Only present if the SAM record is for a read that aligned as part of a paired-end alignment.
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2336 # XN:i:<N> The number of ambiguous bases in the reference covering this alignment. Only present if SAM record is for an aligned read.
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2337 # XM:i:<N> The number of mismatches in the alignment. Only present if SAM record is for an aligned read.
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2338 # XO:i:<N> The number of gap opens, for both read and reference gaps, in the alignment. Only present if SAM record is for an aligned read.
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2339 # XG:i:<N> The number of gap extensions, for both read and reference gaps, in the alignment. Only present if SAM record is for an aligned read.
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2340 # NM:i:<N> The edit distance; that is, the minimal number of one-nucleotide edits (substitutions, insertions and deletions) needed to transform the read string into the reference string. Only present if SAM record is for an aligned read.
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2341 # YF:Z:<N> String indicating reason why the read was filtered out. See also: Filtering. Only appears for reads that were filtered out.
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2342 # MD:Z:<S> A string representation of the mismatched reference bases in the alignment. See SAM format specification for details. Only present if SAM record is for an aligned read.
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2343
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2344 ### If a sequence has no reported alignments there will be a single output line per sequence with a bit-wise flag value of 77 for read 1 (1+4+8+64), or 141 for read 2 (1+4+8+128).
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2345 ### We can store the next alignment and move on to the next Bowtie 2 instance
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2346 if ($flag_1 == 77 and $flag_2 == 141){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2347 ## reading in the next alignment, which must be the next sequence
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2348 my $newline_1 = $fhs[$index]->{fh}-> getline();
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2349 my $newline_2 = $fhs[$index]->{fh}-> getline();
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2350
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2351 if ($newline_1 and $newline_2){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2352 chomp $newline_1;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2353 chomp $newline_2;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2354 my ($seq_id_1) = split (/\t/,$newline_1);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2355 my ($seq_id_2) = split (/\t/,$newline_2);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2356 $seq_id_1 =~ s/\/1$//;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2357 $seq_id_2 =~ s/\/2$//;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2358 $fhs[$index]->{last_seq_id} = $seq_id_1;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2359 $fhs[$index]->{last_line_1} = $newline_1;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2360 $fhs[$index]->{last_line_2} = $newline_2;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2361
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2362 # print "current sequence ($identifier) did not map, reading in next sequence\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2363 # print "$index\t$fhs[$index]->{last_seq_id}\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2364 # print "$index\t$fhs[$index]->{last_line_1}\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2365 # print "$index\t$fhs[$index]->{last_line_2}\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2366 next; # next instance
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2367 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2368 else{
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2369 # assigning undef to last_seq_id and last_line and jumping to the next index (end of Bowtie 2 output)
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2370 $fhs[$index]->{last_seq_id} = undef;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2371 $fhs[$index]->{last_line_1} = undef;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2372 $fhs[$index]->{last_line_2} = undef;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2373 next;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2374 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2375 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2376
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2377 ### If there are one or more proper alignments we can extract the chromosome number
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2378 my ($chromosome_1,$chromosome_2);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2379 if ($mapped_chromosome_1 =~ s/_(CT|GA)_converted$//){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2380 $chromosome_1 = $mapped_chromosome_1;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2381 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2382 else{
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2383 die "Chromosome number extraction failed for $mapped_chromosome_1\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2384 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2385 if ($mapped_chromosome_2 =~ s/_(CT|GA)_converted$//){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2386 $chromosome_2 = $mapped_chromosome_2;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2387 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2388 else{
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2389 die "Chromosome number extraction failed for $mapped_chromosome_2\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2390 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2391
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2392 die "Paired-end alignments need to be on the same chromosome\n" unless ($chromosome_1 eq $chromosome_2);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2393
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2394 ### We will use the optional fields to determine the best alignments. Later on we extract the number of mismatches and/or indels from the CIGAR string
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2395 my ($alignment_score_1,$alignment_score_2,$second_best_1,$second_best_2,$MD_tag_1,$MD_tag_2);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2396
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2397 my @fields_1 = split (/\t/,$fhs[$index]->{last_line_1});
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2398 my @fields_2 = split (/\t/,$fhs[$index]->{last_line_2});
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2399
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2400 foreach (11..$#fields_1){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2401 if ($fields_1[$_] =~ /AS:i:(.*)/){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2402 $alignment_score_1 = $1;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2403 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2404 elsif ($fields_1[$_] =~ /XS:i:(.*)/){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2405 $second_best_1 = $1;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2406 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2407 elsif ($fields_1[$_] =~ /MD:Z:(.*)/){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2408 $MD_tag_1 = $1;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2409 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2410 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2411
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2412 foreach (11..$#fields_2){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2413 if ($fields_2[$_] =~ /AS:i:(.*)/){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2414 $alignment_score_2 = $1;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2415 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2416 elsif ($fields_2[$_] =~ /XS:i:(.*)/){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2417 $second_best_2 = $1;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2418 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2419 elsif ($fields_2[$_] =~ /MD:Z:(.*)/){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2420 $MD_tag_2 = $1;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2421 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2422 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2423
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2424 die "Failed to extract alignment score 1 ($alignment_score_1) and MD tag ($MD_tag_1)!\nlast alignment 1: $fhs[$index]->{last_line_1}\nlast alignment 2: $fhs[$index]->{last_line_2}\n" unless (defined $alignment_score_1 and defined $MD_tag_1);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2425 die "Failed to extract alignment score 2 ($alignment_score_2) and MD tag ($MD_tag_2)!\nlast alignment 1: $fhs[$index]->{last_line_1}\nlast alignment 2: $fhs[$index]->{last_line_2}\n" unless (defined $alignment_score_2 and defined $MD_tag_2);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2426
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2427 # warn "First read 1 alignment score is: '$alignment_score_1'\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2428 # warn "First read 2 alignment score is: '$alignment_score_2'\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2429 # warn "MD tag 1 is: '$MD_tag_1'\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2430 # warn "MD tag 2 is: '$MD_tag_2'\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2431
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2432 ### To decide whether a sequence pair has a unique best alignment we will look at the highest sum of alignment scores from both alignments
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2433 my $sum_of_alignment_scores_1 = $alignment_score_1 + $alignment_score_2 ;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2434 # print "sum of alignment scores: $sum_of_alignment_scores_1\n\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2435
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2436 if (defined $second_best_1 and defined $second_best_2){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2437 my $sum_of_alignment_scores_second_best = $second_best_1 + $second_best_2;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2438 # warn "Second best alignment_score_1 is: '$second_best_1'\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2439 # warn "Second best alignment_score_2 is: '$second_best_2'\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2440 # warn "Second best alignment sum of alignment scores is: '$sum_of_alignment_scores_second_best'\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2441
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2442 # If the sum of the alignment scores (AS) of both reads is the same as the sum of their second best alignment scores (XS) the read pair cannot be placed uniquely, so we are going to boot this sequence pair altogether
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2443 if ($sum_of_alignment_scores_1 == $sum_of_alignment_scores_second_best){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2444 $alignment_ambiguous = 1;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2445 # print "This read will be chucked (AS==XS detected)!\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2446
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2447 ## need to read and discard all additional alignments of this ambiguous read pair until we reach the next sequence
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2448 until ($fhs[$index]->{last_seq_id} ne $identifier){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2449 my $newline_1 = $fhs[$index]->{fh}-> getline();
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2450 my $newline_2 = $fhs[$index]->{fh}-> getline();
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2451 if ($newline_1 and $newline_2){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2452 chomp $newline_1;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2453 chomp $newline_2;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2454 my ($seq_id_1) = split (/\t/,$newline_1);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2455 my ($seq_id_2) = split (/\t/,$newline_2);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2456 $seq_id_1 =~ s/\/1$//;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2457 $seq_id_2 =~ s/\/2$//;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2458 # print "New Seq IDs:\t$seq_id_1\t$seq_id_2\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2459
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2460 $fhs[$index]->{last_seq_id} = $seq_id_1;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2461 $fhs[$index]->{last_line_1} = $newline_1;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2462 $fhs[$index]->{last_line_2} = $newline_2;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2463 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2464 else{
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2465 # assigning undef to last_seq_id and last_line and jumping to the next index (end of Bowtie 2 output)
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2466 $fhs[$index]->{last_seq_id} = undef;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2467 $fhs[$index]->{last_line_1} = undef;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2468 $fhs[$index]->{last_line_2} = undef;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2469 last; # break free if the end of the alignment output was reached
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2470 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2471 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2472 # if ($fhs[$index]->{last_seq_id}){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2473 # warn "Index: $index\tThis Seq-ID is $identifier, skipped all ambiguous sequences until the next ID which is: $fhs[$index]->{last_seq_id}\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2474 # }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2475 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2476 else{ # the next best alignment has a lower alignment score than the current read, so we can safely store the current alignment
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2477
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2478 my $alignment_location;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2479 if ($position_1 <= $position_2){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2480 $alignment_location = join(":",$chromosome_1,$position_1,$position_2);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2481 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2482 elsif($position_2 < $position_1){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2483 $alignment_location = join(":",$chromosome_1,$position_2,$position_1);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2484 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2485
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2486 ### If a sequence aligns to exactly the same location twice, the sequence either does not contain any C or G, or all the Cs (or Gs on the reverse
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2487 ### strand) were methylated and therefore protected. Alternatively it will align better in one condition than in the other. In any case, it is not needed to overwrite
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2488 ### the same positional entry with a second entry for the same location, as the genomic sequence extraction and methylation call would not be affected by this. The only
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2489 ### thing which would change is the index number for the found alignment. We will continue to assign these alignments to the first indexes 0 and 3, i.e. OT and OB
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2490
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2491 unless (exists $alignments{$alignment_location}){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2492 $alignments{$alignment_location}->{seq_id} = $id_1;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2493 $alignments{$alignment_location}->{alignment_score_1} = $alignment_score_1;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2494 $alignments{$alignment_location}->{alignment_score_2} = $alignment_score_2;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2495 $alignments{$alignment_location}->{sum_of_alignment_scores} = $sum_of_alignment_scores_1;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2496 $alignments{$alignment_location}->{bowtie_sequence_1} = $bowtie_sequence_1;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2497 $alignments{$alignment_location}->{bowtie_sequence_2} = $bowtie_sequence_2;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2498 $alignments{$alignment_location}->{index} = $index;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2499 $alignments{$alignment_location}->{chromosome} = $chromosome_1; # either is fine
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2500 $alignments{$alignment_location}->{position_1} = $position_1;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2501 $alignments{$alignment_location}->{position_2} = $position_2;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2502 $alignments{$alignment_location}->{mismatch_info_1} = $MD_tag_1;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2503 $alignments{$alignment_location}->{mismatch_info_2} = $MD_tag_2;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2504 $alignments{$alignment_location}->{CIGAR_1} = $cigar_1;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2505 $alignments{$alignment_location}->{CIGAR_2} = $cigar_2;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2506 $alignments{$alignment_location}->{flag_1} = $flag_1;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2507 $alignments{$alignment_location}->{flag_2} = $flag_2;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2508 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2509 # warn "added best of several alignments to \%alignments hash\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2510
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2511 ### now reading and discarding all (inferior) alignments of this read pair until we hit the next sequence
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2512 until ($fhs[$index]->{last_seq_id} ne $identifier){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2513 my $newline_1 = $fhs[$index]->{fh}-> getline();
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2514 my $newline_2 = $fhs[$index]->{fh}-> getline();
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2515 if ($newline_1 and $newline_2){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2516 chomp $newline_1;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2517 chomp $newline_2;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2518 my ($seq_id_1) = split (/\t/,$newline_1);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2519 my ($seq_id_2) = split (/\t/,$newline_2);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2520 $seq_id_1 =~ s/\/1$//;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2521 $seq_id_2 =~ s/\/2$//;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2522 # print "New Seq IDs:\t$seq_id_1\t$seq_id_2\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2523
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2524 $fhs[$index]->{last_seq_id} = $seq_id_1;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2525 $fhs[$index]->{last_line_1} = $newline_1;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2526 $fhs[$index]->{last_line_2} = $newline_2;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2527 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2528 else{
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2529 # assigning undef to last_seq_id and last_line_1 and _2 and jumping to the next index (end of Bowtie 2 output)
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2530 $fhs[$index]->{last_seq_id} = undef;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2531 $fhs[$index]->{last_line_1} = undef;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2532 $fhs[$index]->{last_line_2} = undef;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2533 last; # break free if the end of the alignment output was reached
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2534 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2535 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2536 # if($fhs[$index]->{last_seq_id}){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2537 # warn "Index: $index\tThis Seq-ID is $identifier, skipped all other alignments until the next ID was reached which is: $fhs[$index]->{last_seq_id}\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2538 # }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2539 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2540 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2541 else{ # there is no second best hit, so we can just store this one and read in the next sequence
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2542
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2543 my $alignment_location = join(":",$chromosome_1,$position_1,$position_2);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2544 # print "$alignment_location\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2545 ### If a sequence aligns to exactly the same location with a perfect match twice, the sequence either does not contain any C or G, or all the Cs (or Gs on the reverse
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2546 ### strand) were methylated and therefore protected. Alternatively it will align better in one condition than in the other. In any case, it is not needed to overwrite
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2547 ### the same positional entry with a second entry for the same location, as the genomic sequence extraction and methylation call would not be affected by this. The only
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2548 ### thing which would change is the index number for the found alignment. We will continue to assign these alignments to the first indexes 0 and 3, i.e. OT and OB
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2549
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2550 unless (exists $alignments{$alignment_location}){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2551 $alignments{$alignment_location}->{seq_id} = $id_1;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2552 $alignments{$alignment_location}->{alignment_score_1} = $alignment_score_1;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2553 $alignments{$alignment_location}->{alignment_score_2} = $alignment_score_2;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2554 $alignments{$alignment_location}->{sum_of_alignment_scores} = $sum_of_alignment_scores_1;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2555 $alignments{$alignment_location}->{bowtie_sequence_1} = $bowtie_sequence_1;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2556 $alignments{$alignment_location}->{bowtie_sequence_2} = $bowtie_sequence_2;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2557 $alignments{$alignment_location}->{index} = $index;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2558 $alignments{$alignment_location}->{chromosome} = $chromosome_1; # either is fine
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2559 $alignments{$alignment_location}->{position_1} = $position_1;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2560 $alignments{$alignment_location}->{position_2} = $position_2;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2561 $alignments{$alignment_location}->{mismatch_info_1} = $MD_tag_1;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2562 $alignments{$alignment_location}->{mismatch_info_2} = $MD_tag_2;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2563 $alignments{$alignment_location}->{CIGAR_1} = $cigar_1;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2564 $alignments{$alignment_location}->{CIGAR_2} = $cigar_2;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2565 $alignments{$alignment_location}->{flag_1} = $flag_1;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2566 $alignments{$alignment_location}->{flag_2} = $flag_2;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2567 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2568
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2569 # warn "added unique alignment to \%alignments hash\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2570
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2571 # Now reading and storing the next read pair
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2572 my $newline_1 = $fhs[$index]->{fh}-> getline();
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2573 my $newline_2 = $fhs[$index]->{fh}-> getline();
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2574 if ($newline_1 and $newline_2){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2575 chomp $newline_1;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2576 chomp $newline_2;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2577 # print "$newline_1\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2578 # print "$newline_2\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2579 my ($seq_id_1) = split (/\t/,$newline_1);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2580 my ($seq_id_2) = split (/\t/,$newline_2);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2581 $seq_id_1 =~ s/\/1$//;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2582 $seq_id_2 =~ s/\/2$//;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2583 # print "New Seq IDs:\t$seq_id_1\t$seq_id_2\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2584
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2585 $fhs[$index]->{last_seq_id} = $seq_id_1;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2586 $fhs[$index]->{last_line_1} = $newline_1;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2587 $fhs[$index]->{last_line_2} = $newline_2;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2588
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2589 if ($seq_id_1 eq $identifier){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2590 die "Sequence with ID $identifier did not have a second best alignment, but next seq-ID was also $fhs[$index]->{last_seq_id}!\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2591 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2592 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2593 else{
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2594 # assigning undef to last_seq_id and last_line_1 and _2 and jumping to the next index (end of Bowtie 2 output)
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2595 $fhs[$index]->{last_seq_id} = undef;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2596 $fhs[$index]->{last_line_1} = undef;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2597 $fhs[$index]->{last_line_2} = undef;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2598 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2599 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2600 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2601 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2602
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2603 ### if the read produced several ambiguous alignments for a single instance of Bowtie 2 we can return right away. If --ambiguous was specified the read sequence will be printed out in FastQ format
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2604 if ($alignment_ambiguous == 1){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2605 $counting{unsuitable_sequence_count}++;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2606 ### report that the sequence pair has multiple hits with bitwise flag 256. We can print the sequence to the result file straight away and skip everything else
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2607 # my $ambiguous_read_1 = join("\t",$identifier.'/1','256','*','0','0','*','*','0','0',$sequence_1,$quality_value_1);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2608 # my $ambiguous_read_2 = join("\t",$identifier.'/2','256','*','0','0','*','*','0','0',$sequence_2,$quality_value_2);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2609 # print "$ambiguous_read_1\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2610 # print "$ambiguous_read_2\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2611
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2612 if ($ambiguous){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2613 return 2; # => exits to next sequence pair, and prints it out to _ambiguous_reads_1.txt and _ambiguous_reads_2.txt if '--ambiguous' was specified
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2614 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2615 elsif ($unmapped){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2616 return 1; # => exits to next sequence pair, and prints it out to _unmapped_reads_1.txt and _unmapped_reads_2.txt if '--unmapped' but not '--ambiguous' was specified
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2617 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2618 else{
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2619 return 0;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2620 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2621 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2622
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2623 ### if no alignment was found for a certain sequence at all we continue with the next sequence in the sequence file
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2624 unless (%alignments){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2625 $counting{no_single_alignment_found}++;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2626
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2627 # my $unmapped_read_1 = join("\t",$identifier.'/1','77','*','0','0','*','*','0','0',$sequence_1,$quality_value_1);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2628 # my $unmapped_read_2 = join("\t",$identifier.'/2','141','*','0','0','*','*','0','0',$sequence_2,$quality_value_2);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2629 # print "$unmapped_read_1\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2630 # print "$unmapped_read_2\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2631 if ($unmapped){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2632 return 1; # => exits to next sequence pair, and prints it out to _unmapped_reads_1.txt and _unmapped_read_2.txt if '--unmapped' was specified
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2633 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2634 else{
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2635 return 0;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2636 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2637 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2638
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2639 #######################################################################################################################################################
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2640
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2641 ### If the sequence pair was not rejected so far we are now looking if there is a unique best alignment among all alignment instances. If there is only one
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2642 ### single best position we are going to store the alignment information in the $meth_call variable. If there are multiple hits with the same (highest)
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2643 ### alignment score we are discarding the sequence pair altogether.
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2644 ### For end-to-end alignments the maximum alignment score is 0, each mismatch receives a penalty of 6, and each gap receives penalties for opening (5)
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2645 ### and extending (3 per bp) the gap.
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2646
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2647 #######################################################################################################################################################
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2648
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2649 ### Declaring an empty hash reference which will store all information we need for the methylation call
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2650 my $methylation_call_params; # hash reference
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2651 my $sequence_pair_fails = 0; # using $sequence_pair_fails as a 'memory' if a sequence could not be aligned uniquely (set to 1 then)
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2652
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2653 ### print contents of %alignments for debugging
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2654 ## if (scalar keys %alignments >= 1){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2655 # print "\n******\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2656 # foreach my $alignment_location (sort {$a cmp $b} keys %alignments){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2657 # print "Loc: $alignment_location\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2658 # print "ID: $alignments{$alignment_location}->{seq_id}\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2659 # print "AS_1: $alignments{$alignment_location}->{alignment_score_1}\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2660 # print "AS_2: $alignments{$alignment_location}->{alignment_score_2}\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2661 # print "Seq_1: $alignments{$alignment_location}->{bowtie_sequence_1}\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2662 # print "Seq_2: $alignments{$alignment_location}->{bowtie_sequence_2}\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2663 # print "Index $alignments{$alignment_location}->{index}\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2664 # print "Chr: $alignments{$alignment_location}->{chromosome}\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2665 # print "Pos_1: $alignments{$alignment_location}->{position_1}\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2666 # print "Pos_2: $alignments{$alignment_location}->{position_2}\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2667 # print "CIGAR_1: $alignments{$alignment_location}->{CIGAR_1}\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2668 # print "CIGAR_2: $alignments{$alignment_location}->{CIGAR_2}\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2669 # print "MD_1: $alignments{$alignment_location}->{mismatch_info_1}\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2670 # print "MD_2: $alignments{$alignment_location}->{mismatch_info_2}\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2671 # print "Flag 1: $alignments{$alignment_location}->{flag_1}\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2672 # print "Flag 2: $alignments{$alignment_location}->{flag_2}\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2673 # }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2674 # print "\n******\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2675 # }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2676
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2677 ### if there is only 1 entry in the %alignments hash we accept it as the best alignment
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2678 if (scalar keys %alignments == 1){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2679 for my $unique_best_alignment (keys %alignments){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2680 $methylation_call_params->{$identifier}->{bowtie_sequence_1} = $alignments{$unique_best_alignment}->{bowtie_sequence_1};
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2681 $methylation_call_params->{$identifier}->{bowtie_sequence_2} = $alignments{$unique_best_alignment}->{bowtie_sequence_2};
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2682 $methylation_call_params->{$identifier}->{chromosome} = $alignments{$unique_best_alignment}->{chromosome};
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2683 $methylation_call_params->{$identifier}->{position_1} = $alignments{$unique_best_alignment}->{position_1};
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2684 $methylation_call_params->{$identifier}->{position_2} = $alignments{$unique_best_alignment}->{position_2};
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2685 $methylation_call_params->{$identifier}->{index} = $alignments{$unique_best_alignment}->{index};
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2686 $methylation_call_params->{$identifier}->{alignment_score_1} = $alignments{$unique_best_alignment}->{alignment_score_1};
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2687 $methylation_call_params->{$identifier}->{alignment_score_2} = $alignments{$unique_best_alignment}->{alignment_score_2};
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2688 $methylation_call_params->{$identifier}->{sum_of_alignment_scores} = $alignments{$unique_best_alignment}->{sum_of_alignment_scores};
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2689 $methylation_call_params->{$identifier}->{mismatch_info_1} = $alignments{$unique_best_alignment}->{mismatch_info_1};
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2690 $methylation_call_params->{$identifier}->{mismatch_info_2} = $alignments{$unique_best_alignment}->{mismatch_info_2};
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2691 $methylation_call_params->{$identifier}->{CIGAR_1} = $alignments{$unique_best_alignment}->{CIGAR_1};
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2692 $methylation_call_params->{$identifier}->{CIGAR_2} = $alignments{$unique_best_alignment}->{CIGAR_2};
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2693 $methylation_call_params->{$identifier}->{flag_1} = $alignments{$unique_best_alignment}->{flag_1};
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2694 $methylation_call_params->{$identifier}->{flag_2} = $alignments{$unique_best_alignment}->{flag_2};
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2695 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2696 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2697
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2698 ### otherwise we are going to find out if there is a best match among the multiple alignments, or whether there are 2 or more equally good alignments (in which case
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2699 ### we boot the sequence pair altogether)
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2700 elsif (scalar keys %alignments >= 2 and scalar keys %alignments <= 4){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2701 my $best_sum_of_alignment_scores;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2702 my $best_alignment_location;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2703 foreach my $alignment_location (sort {$alignments{$b}->{sum_of_alignment_scores} <=> $alignments{$a}->{sum_of_alignment_scores}} keys %alignments){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2704 # print "$alignments{$alignment_location}->{sum_of_alignment_scores}\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2705 unless (defined $best_sum_of_alignment_scores){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2706 $best_sum_of_alignment_scores = $alignments{$alignment_location}->{sum_of_alignment_scores};
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2707 $best_alignment_location = $alignment_location;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2708 # print "setting best alignment score to: $best_sum_of_alignment_scores\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2709 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2710 else{
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2711 ### if the second best alignment has the same sum of alignment scores as the first one, the sequence pair will get booted
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2712 if ($alignments{$alignment_location}->{sum_of_alignment_scores} == $best_sum_of_alignment_scores){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2713 # warn "Same sum of alignment scores for 2 different alignments, the sequence pair will get booted!\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2714 $sequence_pair_fails = 1;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2715 last; # exiting since we know that the sequence has ambiguous alignments
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2716 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2717 ### else we are going to store the best alignment for further processing
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2718 else{
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2719 $methylation_call_params->{$identifier}->{bowtie_sequence_1} = $alignments{$best_alignment_location}->{bowtie_sequence_1};
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2720 $methylation_call_params->{$identifier}->{bowtie_sequence_2} = $alignments{$best_alignment_location}->{bowtie_sequence_2};
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2721 $methylation_call_params->{$identifier}->{chromosome} = $alignments{$best_alignment_location}->{chromosome};
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2722 $methylation_call_params->{$identifier}->{position_1} = $alignments{$best_alignment_location}->{position_1};
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2723 $methylation_call_params->{$identifier}->{position_2} = $alignments{$best_alignment_location}->{position_2};
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2724 $methylation_call_params->{$identifier}->{index} = $alignments{$best_alignment_location}->{index};
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2725 $methylation_call_params->{$identifier}->{alignment_score_1} = $alignments{$best_alignment_location}->{alignment_score_1};
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2726 $methylation_call_params->{$identifier}->{alignment_score_2} = $alignments{$best_alignment_location}->{alignment_score_2};
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2727 $methylation_call_params->{$identifier}->{sum_of_alignment_scores} = $alignments{$best_alignment_location}->{sum_of_alignment_scores};
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2728 $methylation_call_params->{$identifier}->{mismatch_info_1} = $alignments{$best_alignment_location}->{mismatch_info_1};
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2729 $methylation_call_params->{$identifier}->{mismatch_info_2} = $alignments{$best_alignment_location}->{mismatch_info_2};
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2730 $methylation_call_params->{$identifier}->{CIGAR_1} = $alignments{$best_alignment_location}->{CIGAR_1};
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2731 $methylation_call_params->{$identifier}->{CIGAR_2} = $alignments{$best_alignment_location}->{CIGAR_2};
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2732 $methylation_call_params->{$identifier}->{flag_1} = $alignments{$best_alignment_location}->{flag_1};
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2733 $methylation_call_params->{$identifier}->{flag_2} = $alignments{$best_alignment_location}->{flag_2};
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2734 last; # exiting since the sequence produced a unique best alignment
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2735 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2736 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2737 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2738 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2739 else{
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2740 die "There are too many potential hits for this sequence pair (1-4 expected, but found: '",scalar keys %alignments,"')\n";;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2741 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2742
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2743 ### skipping the sequence completely if there were multiple alignments with the same best sum of alignment scores at different positions
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2744 if ($sequence_pair_fails == 1){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2745 $counting{unsuitable_sequence_count}++;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2746
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2747 ### report that the sequence has multiple hits with bitwise flag 256. We can print the sequence to the result file straight away and skip everything else
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2748 # my $ambiguous_read_1 = join("\t",$identifier.'/1','256','*','0','0','*','*','0','0',$sequence_1,$quality_value_1);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2749 # my $ambiguous_read_2 = join("\t",$identifier.'/2','256','*','0','0','*','*','0','0',$sequence_2,$quality_value_2);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2750 # print "$ambiguous_read_1\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2751 # print "$ambiguous_read_2\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2752
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2753 if ($ambiguous){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2754 return 2; # => exits to next sequence pair, and prints it out (in FastQ format) to _ambiguous_reads_1.txt and _ambiguous_reads_2.txt if '--ambiguous' was specified
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2755 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2756 elsif ($unmapped){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2757 return 1; # => exits to next sequence pair, and prints it out (in FastQ format) to _unmapped_reads_1.txt and _unmapped_reads_2.txt if '--unmapped' but not '--ambiguous' was specified
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2758 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2759 else{
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2760 return 0; # => exits to next sequence pair (default)
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2761 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2762 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2763
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2764 ### --DIRECTIONAL
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2765 ### If the option --directional has been specified the user wants to consider only alignments to the original top strand or the original bottom strand. We will therefore
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2766 ### discard all alignments to strands complementary to the original strands, as they should not exist in reality due to the library preparation protocol
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2767 if ($directional){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2768 if ( ($methylation_call_params->{$identifier}->{index} == 1) or ($methylation_call_params->{$identifier}->{index} == 2) ){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2769 # warn "Alignment rejected! (index was: $methylation_call_params->{$identifier}->{index})\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2770 $counting{alignments_rejected_count}++;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2771 return 0;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2772 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2773 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2774
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2775 ### If the sequence pair has not been rejected so far it does have a unique best alignment
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2776 $counting{unique_best_alignment_count}++;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2777 extract_corresponding_genomic_sequence_paired_ends_bowtie2($identifier,$methylation_call_params);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2778
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2779 ### check to see if the genomic sequences we extracted has the same length as the observed sequences +2, and only then we perform the methylation call
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2780 if (length($methylation_call_params->{$identifier}->{unmodified_genomic_sequence_1}) != length($sequence_1)+2){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2781 warn "Chromosomal sequence could not be extracted for\t$identifier\t$methylation_call_params->{$identifier}->{chromosome}\t$methylation_call_params->{$identifier}->{start_seq_1}\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2782 $counting{genomic_sequence_could_not_be_extracted_count}++;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2783 return 0;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2784 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2785 if (length($methylation_call_params->{$identifier}->{unmodified_genomic_sequence_2}) != length($sequence_2)+2){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2786 warn "Chromosomal sequence could not be extracted for\t$identifier\t$methylation_call_params->{$identifier}->{chromosome}\t$methylation_call_params->{$identifier}->{start_seq_2}\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2787 $counting{genomic_sequence_could_not_be_extracted_count}++;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2788 return 0;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2789 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2790
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2791 ### now we are set to perform the actual methylation call
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2792 $methylation_call_params->{$identifier}->{methylation_call_1} = methylation_call($identifier,$sequence_1,$methylation_call_params->{$identifier}->{unmodified_genomic_sequence_1},$methylation_call_params->{$identifier}->{read_conversion_1});
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2793 $methylation_call_params->{$identifier}->{methylation_call_2} = methylation_call($identifier,$sequence_2,$methylation_call_params->{$identifier}->{unmodified_genomic_sequence_2},$methylation_call_params->{$identifier}->{read_conversion_2});
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2794 # print "$methylation_call_params->{$identifier}->{read_conversion_2}\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2795 # print " $sequence_2\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2796 # print "$methylation_call_params->{$identifier}->{unmodified_genomic_sequence_2}\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2797 # print " $methylation_call_params->{$identifier}->{methylation_call_2}\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2798
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2799 print_bisulfite_mapping_results_paired_ends_bowtie2($identifier,$sequence_1,$sequence_2,$methylation_call_params,$quality_value_1,$quality_value_2);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2800 return 0; ## otherwise 1 will be returned by default, which would print the sequence pair to unmapped_1 and _2
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2801 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2802
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2803 ###
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2804
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
### Decides whether the alignment pair currently buffered in $fhs[$index] (last_line_1 / last_line_2) may be processed
### for the sequence pair $identifier.
###
### Arguments: $index      - position of the Bowtie output filehandle in @fhs
###            $identifier - ID of the sequence pair that is being analysed (without the /1 read tag)
###
### Returns 1 if $fhs[$index] holds an alignment of $identifier that ensure_sensical_alignment_orientation_paired_ends
### accepted (return value 1), and 0 if there is nothing to process for $identifier from this filehandle in this round.
### Whenever a buffered pair is rejected (return value 0 of the orientation check) the next two lines are read from
### $fhs[$index]->{fh} and last_seq_id, last_line_1 and last_line_2 are updated accordingly; all three are set to undef
### once the output is exhausted.
###
### Dies if the orientation check returns anything other than 1 or 0, if neither ID of a freshly read pair ends on /1,
### or if the same sequence ID shows up in a third consecutive pair.
sub decide_whether_paired_end_alignment_is_valid{
  my ($index,$identifier) = @_;

  ### only the read ID (field 0) and the strand (field 1) of the buffered lines are needed for this decision
  my ($id_1,$strand_1) = (split (/\t/,$fhs[$index]->{last_line_1},-1))[0,1];
  my ($id_2,$strand_2) = (split (/\t/,$fhs[$index]->{last_line_2},-1))[0,1];

  ### overwrites the buffered entry of this filehandle; called without arguments it resets all three fields to undef
  my $buffer_pair = sub {
    my ($seq_id,$line_1,$line_2) = @_;
    $fhs[$index]->{last_seq_id} = $seq_id;
    $fhs[$index]->{last_line_1} = $line_1;
    $fhs[$index]->{last_line_2} = $line_2;
  };

  ### always reads two lines from the filehandle; returns them only if both are true, and an empty list otherwise
  my $read_next_pair = sub {
    my $line_1 = $fhs[$index]->{fh}->getline();
    my $line_2 = $fhs[$index]->{fh}->getline();
    return ($line_1,$line_2) if ($line_1 and $line_2);
    return;
  };

  ### removes the /1 tag from the first of the two IDs that carries one (the second ID is only tried if the first one
  ### had no tag). Returns the untagged ID of read 1, followed by both IDs as they look after this attempt
  my $untag_read_1 = sub {
    my ($first_id,$second_id) = @_;
    return ($first_id,$first_id,$second_id)  if ($first_id  =~ s/\/1$//);
    return ($second_id,$first_id,$second_id) if ($second_id =~ s/\/1$//);
    die "One of the two reads needs to end on /1!!";
  };

  ### for the buffered pair the /1 tag is removed from both IDs before comparing them with $identifier
  (my $seq_id_1 = $id_1) =~ s/\/1$//;
  (my $seq_id_2 = $id_2) =~ s/\/1$//;

  ### the sequence pair stored in @fhs is already the next sequence pair to be analysed -> analyse next round
  return 0 unless ($seq_id_1 eq $identifier or $seq_id_2 eq $identifier);

  ### checking the orientation of the buffered alignment
  my $orientation = ensure_sensical_alignment_orientation_paired_ends ($index,$id_1,$strand_1,$id_2,$strand_2);
  return 1 if ($orientation == 1); ### 1st possibility for A SEQUENCE-PAIR TO PASS
  die "The orientation of the alignment must be either correct or incorrect\n" unless ($orientation == 0);

  ### the buffered alignment was in the wrong orientation, so we need to read in two new lines
  my ($newline_1,$newline_2) = $read_next_pair->();
  unless ($newline_1){
    $buffer_pair->(); # end of bowtie output
    return 0;         # not processing anything as the buffered alignment was in the wrong orientation
  }

  ### extracting ID and strand again (from $newline_1 and $newline_2 this time)
  ($id_1,$strand_1) = (split (/\t/,$newline_1))[0,1];
  ($id_2,$strand_2) = (split (/\t/,$newline_2))[0,1];
  my $seqid;
  ($seqid,$seq_id_1,$seq_id_2) = $untag_read_1->($id_1,$id_2);

  ### the sequence pair we just read in is already the next sequence pair to be analysed -> store it in @fhs
  unless ($seq_id_1 eq $identifier or $seq_id_2 eq $identifier){
    $buffer_pair->($seqid,$newline_1,$newline_2);
    return 0; # processing the new alignment result only in the next round
  }

  ### still the same sequence pair: checking the orientation again
  $orientation = ensure_sensical_alignment_orientation_paired_ends ($index,$id_1,$strand_1,$id_2,$strand_2);
  if ($orientation == 1){
    $buffer_pair->($seqid,$newline_1,$newline_2);
    return 1; ### 2nd possibility for a SEQUENCE-PAIR TO PASS
  }
  die "The orientation of the alignment must be either correct or incorrect\n" unless ($orientation == 0);

  ### wrong orientation again: yet another 2 lines are read in and stored in @fhs (this must be the next entry)
  ($newline_1,$newline_2) = $read_next_pair->();
  unless ($newline_1){
    $buffer_pair->(); # end of bowtie output
    return 0;
  }

  my ($next_id_1) = split (/\t/,$newline_1);
  my ($next_id_2) = split (/\t/,$newline_2);
  ($seqid) = $untag_read_1->($next_id_1,$next_id_2);

  ### a third pair with the same seq ID must not happen
  die "Same seq ID 3 or more times in a row!(should be 2 max)" if ($seqid eq $identifier);
  $buffer_pair->($seqid,$newline_1,$newline_2);
  return 0; # not processing anything this round as both alignments of $identifier were in the wrong orientation
}
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2925
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2926 ### EXTRACT GENOMIC SEQUENCE | BOWTIE 1 | PAIRED-END
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2927
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
### Extracts the unconverted genomic sequence underneath both reads of a Bowtie 1
### paired-end alignment and records it, together with the strand/conversion
### 'memory' needed for the methylation call, in $methylation_call_params.
###
### Arguments:
###   $sequence_identifier     - key of the read pair inside $methylation_call_params
###   $methylation_call_params - hash ref; the entry for the pair must provide
###                              {index}, {chromosome}, {start_seq_1}, {start_seq_2},
###                              {bowtie_sequence_1} and {bowtie_sequence_2}
###
### Fields written to $methylation_call_params->{$sequence_identifier}:
###   alignment_read_1/2, genome_conversion, read_conversion_1/2,
###   unmodified_genomic_sequence_1/2
###
### Each genomic sequence is extracted with 2 extra bases at one end so that a CpG,
### CHG or CHH context can be determined for a C at the first or last read position.
### If the 2 extra bases cannot be taken (alignment sits right at the edge of the
### chromosome) the corresponding genomic sequence is set to '' wherever this sub
### performs an edge check.
###
### Dies if {index} is not one of 0, 1, 2 or 3.
### Uses the file-level %chromosomes and %counting hashes and reverse_complement().
sub extract_corresponding_genomic_sequence_paired_ends {
  my ($sequence_identifier,$methylation_call_params) = @_;

  ### Working on the per-pair entry directly; '||= {}' creates the entry if it is missing,
  ### exactly like the nested hash accesses in the original code did via autovivification
  my $params = $methylation_call_params->{$sequence_identifier} ||= {};

  my $index   = $params->{index};
  my $chr     = $params->{chromosome};
  my $start_1 = $params->{start_seq_1};                     # start of the first (+) alignment reported by bowtie
  my $start_2 = $params->{start_seq_2};                     # start of the second (-) alignment reported by bowtie
  my $span_1  = length($params->{bowtie_sequence_1}) + 2;   # read length + 2 bases of sequence context
  my $span_2  = length($params->{bowtie_sequence_2}) + 2;

  ### A bisulfite sequence pair for 1 location in the genome can theoretically be on any of the 4 possible
  ### converted strands. We are also giving the sequence a 'memory' of the conversion we are expecting
  my $alignment_read_1;
  my $alignment_read_2;
  my $read_conversion_info_1;
  my $read_conversion_info_2;
  my $genome_conversion;

  my $non_bisulfite_sequence_1;
  my $non_bisulfite_sequence_2;

  ### All alignments reported by bowtie have the + alignment first and the - alignment second, irrespective of
  ### whether read 1 or read 2 was the + alignment. Sequences are always handled as read 1 then read 2, so if
  ### read 2 is the + alignment (index 2 and 3) the extracted genomic sequences are swapped around.

  ### results from CT converted read 1 plus GA converted read 2 vs. CT converted genome (+/- orientation)
  if ($index == 0){
    ### [Index 0, sequence originated from (converted) forward strand]
    $counting{CT_GA_CT_count}++;
    $alignment_read_1 = '+';
    $alignment_read_2 = '-';
    $read_conversion_info_1 = 'CT';
    $read_conversion_info_2 = 'GA';
    $genome_conversion = 'CT';

    ### SEQUENCE 1 (forward hit, here read 1): 2 extra bases at the 3' end.
    ### No edge check here: substr returns a shortened string if the window overruns the chromosome end
    $non_bisulfite_sequence_1 = substr ($chromosomes{$chr},$start_1,$span_1);

    ### SEQUENCE 2 (reverse strand, here read 2): the extra 3' bases become 5' bases after reverse complementation
    if ($start_2 + $span_2 <= length($chromosomes{$chr})){
      $non_bisulfite_sequence_2 = reverse_complement(substr ($chromosomes{$chr},$start_2,$span_2));
    }
    else{
      $non_bisulfite_sequence_2 = '';
    }
  }

  ### results from GA converted read 1 plus CT converted read 2 vs. GA converted genome (+/- orientation)
  elsif ($index == 1){
    ### [Index 1, sequence originated from complementary to (converted) reverse strand]
    $counting{GA_CT_GA_count}++;
    $alignment_read_1 = '+';
    $alignment_read_2 = '-';
    $read_conversion_info_1 = 'GA';
    $read_conversion_info_2 = 'CT';
    $genome_conversion = 'GA';

    ### SEQUENCE 1 (forward hit, here read 1): GA conversion, so 2 extra bases at the 5' end
    if ($start_1 >= 2){
      $non_bisulfite_sequence_1 = substr ($chromosomes{$chr},$start_1-2,$span_1);
    }
    else{
      $non_bisulfite_sequence_1 = '';
    }

    ### SEQUENCE 2 (reverse strand, here read 2): 2 extra 5' bases which are 3' bases after reverse complementation.
    ### The start is checked as well: a negative offset would make substr count from the END of the chromosome
    ### and silently return unrelated sequence
    if ($start_2 >= 2){
      $non_bisulfite_sequence_2 = reverse_complement(substr ($chromosomes{$chr},$start_2-2,$span_2));
    }
    else{
      $non_bisulfite_sequence_2 = '';
    }
  }

  ### results from GA converted read 1 plus CT converted read 2 vs. CT converted genome (-/+ orientation)
  elsif ($index == 2){
    ### [Index 2, sequence originated from the complementary to (converted) forward strand]
    $counting{GA_CT_CT_count}++;
    $alignment_read_1 = '-';
    $alignment_read_2 = '+';
    $read_conversion_info_1 = 'GA';
    $read_conversion_info_2 = 'CT';
    $genome_conversion = 'CT';

    ### Here the sequence information is switched round: non_bisulfite_sequence_1 corresponds to read 1,
    ### which is the second (-) alignment reported by bowtie.
    ### Read 1 is GA converted: 2 extra 3' bases which are 2 extra 5' bases after reverse complementation.
    ### No edge check here: substr returns a shortened string if the window overruns the chromosome end
    $non_bisulfite_sequence_1 = reverse_complement(substr ($chromosomes{$chr},$start_2,$span_2));

    ### non_bisulfite_sequence_2 corresponds to read 2, the first (+) alignment reported by bowtie.
    ### Read 2 is CT converted so we need to capture 2 extra 3' bases
    if ($start_1 + $span_1 <= length($chromosomes{$chr})){
      $non_bisulfite_sequence_2 = substr ($chromosomes{$chr},$start_1,$span_1);
    }
    else{
      $non_bisulfite_sequence_2 = '';
    }
  }

  ### results from CT converted read 1 plus GA converted read 2 vs. GA converted genome (-/+ orientation)
  elsif ($index == 3){
    ### [Index 3, sequence originated from the (converted) reverse strand]
    $counting{CT_GA_GA_count}++;
    $alignment_read_1 = '-';
    $alignment_read_2 = '+';
    $read_conversion_info_1 = 'CT';
    $read_conversion_info_2 = 'GA';
    $genome_conversion = 'GA';

    ### Here the sequence information is switched round: non_bisulfite_sequence_1 corresponds to read 1,
    ### which is the second (-) alignment reported by bowtie.
    ### Read 1 is CT converted: 2 extra 5' bases which are 2 extra 3' bases after reverse complementation
    if ($start_2 >= 2){
      $non_bisulfite_sequence_1 = reverse_complement(substr ($chromosomes{$chr},$start_2-2,$span_2));
    }
    else{
      $non_bisulfite_sequence_1 = '';
    }

    ### non_bisulfite_sequence_2 corresponds to read 2, the first (+) alignment reported by bowtie.
    ### Read 2 is GA converted so we need to capture 2 extra 5' bases.
    ### The start is checked as well: a negative offset would make substr count from the END of the chromosome
    ### and silently return unrelated sequence
    if ($start_1 >= 2){
      $non_bisulfite_sequence_2 = substr ($chromosomes{$chr},$start_1-2,$span_1);
    }
    else{
      $non_bisulfite_sequence_2 = '';
    }
  }
  else{
    die "Too many bowtie result filehandles\n";
  }

  ### the alignment_strand information is needed to determine which strand of the genomic sequence we are comparing
  ### the read against, the read_conversion information is needed to know whether we are looking for C->T or G->A
  ### substitutions
  $params->{alignment_read_1}              = $alignment_read_1;
  $params->{alignment_read_2}              = $alignment_read_2;
  $params->{genome_conversion}             = $genome_conversion;
  $params->{read_conversion_1}             = $read_conversion_info_1;
  $params->{read_conversion_2}             = $read_conversion_info_2;
  $params->{unmodified_genomic_sequence_1} = $non_bisulfite_sequence_1;
  $params->{unmodified_genomic_sequence_2} = $non_bisulfite_sequence_2;
}
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3068
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3069 ### EXTRACT GENOMIC SEQUENCE BOWTIE 2 | PAIRED-END
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3070
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3071 sub extract_corresponding_genomic_sequence_paired_ends_bowtie2{
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3072 my ($sequence_identifier,$methylation_call_params) = @_;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3073 ### A bisulfite sequence pair for 1 location in the genome can theoretically be on any of the 4 possible converted strands. We are also giving the
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3074 ### sequence a 'memory' of the conversion we are expecting which we will need later for the methylation call
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3075
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3076 my $cigar_1 = $methylation_call_params->{$sequence_identifier}->{CIGAR_1};
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3077 my $cigar_2 = $methylation_call_params->{$sequence_identifier}->{CIGAR_2};
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3078 my $flag_1 = $methylation_call_params->{$sequence_identifier}->{flag_1};
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3079 my $flag_2 = $methylation_call_params->{$sequence_identifier}->{flag_2};
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3080 # print "$cigar_1\t$cigar_2\t$flag_1\t$flag_2\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3081 ### We are now extracting the corresponding genomic sequence, +2 extra bases at the end (or start) so that we can also make a CpG methylation call and
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3082 ### in addition make differential calls for Cs in CHG or CHH context if the C happens to be at the last (or first) position of the actually observed sequence
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3083
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3084 ### the alignment_strand information is needed to determine which strand of the genomic sequence we are comparing the read against,
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3085 ### the read_conversion information is needed to know whether we are looking for C->T or G->A substitutions
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3086 my $alignment_read_1;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3087 my $alignment_read_2;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3088 my $read_conversion_info_1;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3089 my $read_conversion_info_2;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3090 my $genome_conversion;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3091
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3092 ### Now extracting the same sequence from the mouse genomic sequence, +2 extra bases at one of the ends so that we can also make a CpG, CHG or CHH methylation call
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3093 ### if the C happens to be at the last position of the actually observed sequence
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3094 my $non_bisulfite_sequence_1 = '';
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3095 my $non_bisulfite_sequence_2 = '';
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3096
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3097 ### Positions in SAM format are 1 based, so we need to subtract 1 when getting substrings
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3098 my $pos_1 = $methylation_call_params->{$sequence_identifier}->{position_1}-1;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3099 my $pos_2 = $methylation_call_params->{$sequence_identifier}->{position_2}-1;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3100
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3101 # parsing CIGAR 1 string
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3102 my @len_1 = split (/\D+/,$cigar_1); # storing the length per operation
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3103 my @ops_1 = split (/\d+/,$cigar_1); # storing the operation
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3104 shift @ops_1; # remove the empty first element
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3105 die "CIGAR 1 string contained a non-matching number of lengths and operations\n" unless (scalar @len_1 == scalar @ops_1);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3106 # parsing CIGAR 2 string
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3107 my @len_2 = split (/\D+/,$cigar_2); # storing the length per operation
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3108 my @ops_2 = split (/\d+/,$cigar_2); # storing the operation
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3109 shift @ops_2; # remove the empty first element
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3110 die "CIGAR 2 string contained a non-matching number of lengths and operations\n" unless (scalar @len_2 == scalar @ops_2);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3111
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3112 my $indels_1 = 0; # addiong these to the hemming distance value (needed for the NM field in the final SAM output
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3113 my $indels_2 = 0;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3114
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3115 ### Extracting read 1 genomic sequence ###
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3116
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3117 # extracting 2 additional bp at the 5' end (read 1)
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3118 if ( ($methylation_call_params->{$sequence_identifier}->{index} == 1) or ($methylation_call_params->{$sequence_identifier}->{index} == 3) ){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3119 # checking if the substring will be valid or if we can't extract the sequence because we are right at the edge of a chromosome
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3120 unless ( ($pos_1-2) > 0){# exiting with en empty genomic sequence otherwise
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3121 $methylation_call_params->{$sequence_identifier}->{unmodified_genomic_sequence_1} = $non_bisulfite_sequence_1;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3122 return;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3123 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3124 $non_bisulfite_sequence_1 .= substr ($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}},$pos_1-2,2);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3125 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3126
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3127 foreach (0..$#len_1){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3128 if ($ops_1[$_] eq 'M'){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3129 # extracting genomic sequence
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3130 $non_bisulfite_sequence_1 .= substr ($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}},$pos_1,$len_1[$_]);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3131 # warn "$non_bisulfite_sequence_1\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3132 # adjusting position
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3133 $pos_1 += $len_1[$_];
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3134 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3135 elsif ($ops_1[$_] eq 'I'){ # insertion in the read sequence
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3136 # we simply add padding Ns instead of finding genomic sequence. This will not be used to infer methylation calls
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3137 $non_bisulfite_sequence_1 .= 'N' x $len_1[$_];
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3138 # warn "$non_bisulfite_sequence_1\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3139 # position doesn't need adjusting
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3140 $indels_1 += $len_1[$_]; # adding to $indels_1 to determine the hemming distance (= single base mismatches, insertions or deletions) for the SAM output
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3141 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3142 elsif ($ops_1[$_] eq 'D'){ # deletion in the read sequence
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3143 # we do not add any genomic sequence but only adjust the position
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3144 # warn "Just adjusting the position by: ",$len_1[$_],"bp\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3145 $pos_1 += $len_1[$_];
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3146 $indels_1 += $len_1[$_]; # adding to $indels_1 to determine the hemming distance (= single base mismatches, insertions or deletions) for the SAM output
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3147 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3148 elsif($cigar_1 =~ tr/[NSHPX=]//){ # if these (for standard mapping) illegal characters exist we die
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3149 die "The CIGAR 1 string contained illegal CIGAR operations in addition to 'M', 'I' and 'D': $cigar_1\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3150 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3151 else{
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3152 die "The CIGAR 1 string contained undefined CIGAR operations in addition to 'M', 'I' and 'D': $cigar_1\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3153 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3154 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3155
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3156 ### 3' end of read 1
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3157 if ( ($methylation_call_params->{$sequence_identifier}->{index} == 0) or ($methylation_call_params->{$sequence_identifier}->{index} == 2) ){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3158 ## checking if the substring will be valid or if we can't extract the sequence because we are right at the edge of a chromosome
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3159 unless (length($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}}) >= $pos_1+2){# exiting with en empty genomic sequence otherwise
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3160 $methylation_call_params->{$sequence_identifier}->{unmodified_genomic_sequence_1} = $non_bisulfite_sequence_1;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3161 return;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3162 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3163 $non_bisulfite_sequence_1 .= substr ($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}},$pos_1,2);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3164 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3165
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3166
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3167 ### Extracting read 2 genomic sequence ###
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3168
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3169 ### 5' end of read 2
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3170 if ( ($methylation_call_params->{$sequence_identifier}->{index} == 1) or ($methylation_call_params->{$sequence_identifier}->{index} == 3) ){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3171 ## checking if the substring will be valid or if we can't extract the sequence because we are right at the edge of a chromosome
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3172 unless ( ($pos_2-2) >= 0){# exiting with en empty genomic sequence otherwise
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3173 $methylation_call_params->{$sequence_identifier}->{unmodified_genomic_sequence_2} = $non_bisulfite_sequence_2;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3174 return;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3175 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3176 $non_bisulfite_sequence_2 .= substr ($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}},$pos_2-2,2);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3177 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3178
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3179 foreach (0..$#len_2){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3180 if ($ops_2[$_] eq 'M'){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3181 # extracting genomic sequence
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3182 $non_bisulfite_sequence_2 .= substr ($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}},$pos_2,$len_2[$_]);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3183 # warn "$non_bisulfite_sequence_2\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3184 # adjusting position
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3185 $pos_2 += $len_2[$_];
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3186 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3187 elsif ($ops_2[$_] eq 'I'){ # insertion in the read sequence
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3188 # we simply add padding Ns instead of finding genomic sequence. This will not be used to infer methylation calls
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3189 $non_bisulfite_sequence_2 .= 'N' x $len_2[$_];
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3190 # warn "$non_bisulfite_sequence_2\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3191 # position doesn't need adjusting
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3192 $indels_2 += $len_2[$_]; # adding to $indels_1 to determine the hemming distance (= single base mismatches, insertions or deletions) for the SAM output
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3193 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3194 elsif ($ops_2[$_] eq 'D'){ # deletion in the read sequence
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3195 # we do not add any genomic sequence but only adjust the position
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3196 # warn "Just adjusting the position by: ",$len_2[$_],"bp\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3197 $pos_2 += $len_2[$_];
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3198 $indels_2 += $len_2[$_]; # adding to $indels_1 to determine the hemming distance (= single base mismatches, insertions or deletions) for the SAM output
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3199 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3200 elsif($cigar_2 =~ tr/[NSHPX=]//){ # if these (for standard mapping) illegal characters exist we die
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3201 die "The CIGAR 2 string contained illegal CIGAR operations in addition to 'M', 'I' and 'D': $cigar_2\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3202 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3203 else{
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3204 die "The CIGAR 2 string contained undefined CIGAR operations in addition to 'M', 'I' and 'D': $cigar_2\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3205 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3206 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3207
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3208 ### 3' end of read 2
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3209 if ( ($methylation_call_params->{$sequence_identifier}->{index} == 0) or ($methylation_call_params->{$sequence_identifier}->{index} == 2) ){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3210 ## checking if the substring will be valid or if we can't extract the sequence because we are right at the edge of a chromosome
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3211 unless (length($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}}) >= $pos_2+2){# exiting with en empty genomic sequence otherwise
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3212 $methylation_call_params->{$sequence_identifier}->{unmodified_genomic_sequence_2} = $non_bisulfite_sequence_2;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3213 return;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3214 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3215 $non_bisulfite_sequence_2 .= substr ($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}},$pos_2,2);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3216 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3217
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3218 ### all paired-end alignments reported by Bowtie 2 have the Read 1 alignment first and the Read 2 alignment as the second one irrespective of whether read 1 or read 2 was
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3219 ### the + alignment. We also read in sequences read 1 then read 2 so they should correspond perfectly
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3220
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3221 ### results from CT converted read 1 plus GA converted read 2 vs. CT converted genome (+/- orientation alignments are reported only)
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3222 if ($methylation_call_params->{$sequence_identifier}->{index} == 0){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3223 ### [Index 0, sequence originated from (converted) forward strand]
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3224 $counting{CT_GA_CT_count}++;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3225 $alignment_read_1 = '+';
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3226 $alignment_read_2 = '-';
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3227 $read_conversion_info_1 = 'CT';
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3228 $read_conversion_info_2 = 'GA';
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3229 $genome_conversion = 'CT';
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3230 ### Read 1 is always the forward hit
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3231 ### Read 2 is will always on the reverse strand, so it needs to be reverse complemented
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3232 $non_bisulfite_sequence_2 = reverse_complement($non_bisulfite_sequence_2);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3233 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3234
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3235 ### results from GA converted read 1 plus CT converted read 2 vs. GA converted genome (+/- orientation alignments are reported only)
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3236 elsif ($methylation_call_params->{$sequence_identifier}->{index} == 1){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3237 ### [Index 1, sequence originated from complementary to (converted) bottom strand]
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3238 $counting{GA_CT_GA_count}++;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3239 $alignment_read_1 = '+';
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3240 $alignment_read_2 = '-';
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3241 $read_conversion_info_1 = 'GA';
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3242 $read_conversion_info_2 = 'CT';
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3243 $genome_conversion = 'GA';
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3244 ### Read 1 is always the forward hit
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3245 ### Read 2 is will always on the reverse strand, so it needs to be reverse complemented
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3246 $non_bisulfite_sequence_2 = reverse_complement($non_bisulfite_sequence_2);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3247 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3248
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3249 ### results from GA converted read 1 plus CT converted read 2 vs. CT converted genome (-/+ orientation alignments are reported only)
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3250 elsif ($methylation_call_params->{$sequence_identifier}->{index} == 2){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3251 ### [Index 2, sequence originated from the complementary to (converted) top strand]
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3252 $counting{GA_CT_CT_count}++;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3253 $alignment_read_1 = '-';
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3254 $alignment_read_2 = '+';
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3255 $read_conversion_info_1 = 'GA';
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3256 $read_conversion_info_2 = 'CT';
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3257 $genome_conversion = 'CT';
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3258
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3259 ### Read 1 (the reverse strand) genomic sequence needs to be reverse complemented
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3260 $non_bisulfite_sequence_1 = reverse_complement($non_bisulfite_sequence_1);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3261 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3262
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3263 ### results from CT converted read 1 plus GA converted read 2 vs. GA converted genome (-/+ orientation alignments are reported only)
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3264 elsif ($methylation_call_params->{$sequence_identifier}->{index} == 3){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3265 ### [Index 3, sequence originated from the (converted) reverse strand]
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3266 $counting{CT_GA_GA_count}++;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3267 $alignment_read_1 = '-';
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3268 $alignment_read_2 = '+';
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3269 $read_conversion_info_1 = 'CT';
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3270 $read_conversion_info_2 = 'GA';
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3271 $genome_conversion = 'GA';
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3272 ### Read 1 (the reverse strand) genomic sequence needs to be reverse complemented
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3273 $non_bisulfite_sequence_1 = reverse_complement($non_bisulfite_sequence_1);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3274 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3275 else{
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3276 die "Too many bowtie result filehandles\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3277 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3278 ### the alignment_strand information is needed to determine which strand of the genomic sequence we are comparing the read against,
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3279 ### the read_conversion information is needed to know whether we are looking for C->T or G->A substitutions
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3280
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3281 $methylation_call_params->{$sequence_identifier}->{alignment_read_1} = $alignment_read_1;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3282 $methylation_call_params->{$sequence_identifier}->{alignment_read_2} = $alignment_read_2;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3283 $methylation_call_params->{$sequence_identifier}->{genome_conversion} = $genome_conversion;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3284 $methylation_call_params->{$sequence_identifier}->{read_conversion_1} = $read_conversion_info_1;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3285 $methylation_call_params->{$sequence_identifier}->{read_conversion_2} = $read_conversion_info_2;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3286 $methylation_call_params->{$sequence_identifier}->{unmodified_genomic_sequence_1} = $non_bisulfite_sequence_1;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3287 $methylation_call_params->{$sequence_identifier}->{unmodified_genomic_sequence_2} = $non_bisulfite_sequence_2;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3288 ## the end position of a read is stored in $pos
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3289 $methylation_call_params->{$sequence_identifier}->{end_position_1} = $pos_1;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3290 $methylation_call_params->{$sequence_identifier}->{end_position_2} = $pos_2;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3291 $methylation_call_params->{$sequence_identifier}->{indels_1} = $indels_1;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3292 $methylation_call_params->{$sequence_identifier}->{indels_2} = $indels_2;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3293 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3294
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3295 ##########################################
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3296 ### PRINT SINGLE END RESULTS: Bowtie 1 ###
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3297 ##########################################
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3298
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
sub print_bisulfite_mapping_result_single_end{
  ### Writes one uniquely mapped single-end read (Bowtie 1 mode) together with its methylation call.
  ###   $identifier              - read ID; also the key into $methylation_call_params
  ###   $sequence                - read sequence as it should appear in the output
  ###   $methylation_call_params - hash ref holding the per-read alignment details
  ###   $quality_value           - quality string in the encoding of the input file
  ### Side effect: the stored start position of the read is incremented by 1 (see below).
  my ($identifier,$sequence,$methylation_call_params,$quality_value)= @_;

  ### qualities are always written in Sanger encoding (Phred 33 scale); $phred64 is checked before $solexa
  my $to_phred33 = $phred64 ? \&convert_phred64_quals_to_phred33
                 : $solexa  ? \&convert_solexa_quals_to_phred33
                 :            undef;
  $quality_value = $to_phred33->($quality_value) if $to_phred33;

  ### Bowtie 1 reports the index and not the bp position, so the start position is shifted by +1 bp
  $methylation_call_params->{$identifier}->{position} += 1;

  unless ($vanilla){
    ### SAM output, default since Bismark v1.0.0
    single_end_SAM_output($identifier,$sequence,$methylation_call_params,$quality_value); # at the end of the script
    return;
  }

  ### vanilla (tab-delimited) output, one line per read:
  ### ID, strand, chromosome, start, end, read, genomic sequence, methylation call, read conversion, genome conversion, qualities
  my $read_info = $methylation_call_params->{$identifier};
  my @alignment_columns   = map { $read_info->{$_} } qw(alignment_strand chromosome position end_position);
  my @methylation_columns = map { $read_info->{$_} } qw(unmodified_genomic_sequence methylation_call read_conversion genome_conversion);

  my $bowtie1_output = join("\t",$identifier,@alignment_columns,$sequence,@methylation_columns,$quality_value);
  print OUT "$bowtie1_output\n";
}
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3322
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3323 ##########################################
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3324 ### PRINT SINGLE END RESULTS: Bowtie 2 ###
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3325 ##########################################
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3326
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
sub print_bisulfite_mapping_result_single_end_bowtie2{
  ### Writes one mapped single-end read (Bowtie 2 mode) and its methylation call to the SAM output.
  ###   $identifier              - read ID; also the key into $methylation_call_params
  ###   $sequence                - read sequence as it should appear in the output
  ###   $methylation_call_params - hash ref holding the per-read alignment details
  ###   $quality_value           - quality string in the encoding of the input file
  my ($identifier,$sequence,$methylation_call_params,$quality_value)= @_;

  ### qualities are always written in Sanger encoding (Phred 33 scale); $phred64 is checked before $solexa
  my $to_phred33 = $phred64 ? \&convert_phred64_quals_to_phred33
                 : $solexa  ? \&convert_solexa_quals_to_phred33
                 :            undef;
  $quality_value = $to_phred33->($quality_value) if $to_phred33;

  ### unmapped and ambiguous reads were already printed, so only the SAM record is left to write
  single_end_SAM_output($identifier,$sequence,$methylation_call_params,$quality_value); # at the end of the script
}
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3341
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3342 ##########################################
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3343 ### PRINT PAIRED END RESULTS: Bowtie 1 ###
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3344 ##########################################
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3345
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
sub print_bisulfite_mapping_results_paired_ends{
  ### Writes one aligned read pair (Bowtie 1 mode) together with both methylation calls.
  ###   $identifier                        - read pair ID; also the key into $methylation_call_params
  ###   $sequence_1, $sequence_2           - sequences of read 1 and read 2
  ###   $methylation_call_params           - hash ref holding the per-pair alignment details
  ###   $quality_value_1, $quality_value_2 - quality strings in the encoding of the input files
  ### Side effect: the stored start position of read 1 is incremented by 1 (see below).
  my ($identifier,$sequence_1,$sequence_2,$methylation_call_params,$quality_value_1,$quality_value_2)= @_;

  ### qualities are always written in Sanger encoding (Phred 33 scale); $phred64 is checked before $solexa
  my $to_phred33 = $phred64 ? \&convert_phred64_quals_to_phred33
                 : $solexa  ? \&convert_solexa_quals_to_phred33
                 :            undef;
  if ($to_phred33){
    $quality_value_1 = $to_phred33->($quality_value_1);
    $quality_value_2 = $to_phred33->($quality_value_2);
  }

  ### Bowtie 1 reports the index and not the bp position, so the start is shifted by +1 bp (the end position is already 1-based)
  $methylation_call_params->{$identifier}->{start_seq_1} += 1;

  unless ($vanilla){
    ### SAM output, default since Bismark v1.0.0
    paired_end_SAM_output($identifier,$sequence_1,$sequence_2,$methylation_call_params,$quality_value_1,$quality_value_2); # at the end of the script
    return;
  }

  ### vanilla (tab-delimited) output, one line per read pair:
  ### ID, strand of read 1, chromosome, start, end, read 1 + genomic sequence + call, read 2 + genomic sequence + call,
  ### conversion of read 1, genome conversion, qualities of read 1, qualities of read 2
  my $pair_info = $methylation_call_params->{$identifier};
  my @alignment_columns  = map { $pair_info->{$_} } qw(alignment_read_1 chromosome start_seq_1 alignment_end);
  my @read_1_columns     = map { $pair_info->{$_} } qw(unmodified_genomic_sequence_1 methylation_call_1);
  my @read_2_columns     = map { $pair_info->{$_} } qw(unmodified_genomic_sequence_2 methylation_call_2);
  my @conversion_columns = map { $pair_info->{$_} } qw(read_conversion_1 genome_conversion);

  my $bowtie1_output_paired_end = join("\t",$identifier,@alignment_columns,$sequence_1,@read_1_columns,$sequence_2,@read_2_columns,@conversion_columns,$quality_value_1,$quality_value_2);
  print OUT "$bowtie1_output_paired_end\n";
}
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3372
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3373 ##########################################
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3374 ### PRINT PAIRED END RESULTS: Bowtie 2 ###
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3375 ##########################################
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3376
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
sub print_bisulfite_mapping_results_paired_ends_bowtie2{
  ### Writes one aligned read pair (Bowtie 2 mode) and both methylation calls to the SAM output.
  ###   $identifier                        - read pair ID; also the key into $methylation_call_params
  ###   $sequence_1, $sequence_2           - sequences of read 1 and read 2
  ###   $methylation_call_params           - hash ref holding the per-pair alignment details
  ###   $quality_value_1, $quality_value_2 - quality strings in the encoding of the input files
  my ($identifier,$sequence_1,$sequence_2,$methylation_call_params,$quality_value_1,$quality_value_2)= @_;

  ### qualities are always written in Sanger encoding (Phred 33 scale); $phred64 is checked before $solexa
  my $to_phred33 = $phred64 ? \&convert_phred64_quals_to_phred33
                 : $solexa  ? \&convert_solexa_quals_to_phred33
                 :            undef;
  if ($to_phred33){
    $quality_value_1 = $to_phred33->($quality_value_1);
    $quality_value_2 = $to_phred33->($quality_value_2);
  }

  ### unmapped and ambiguous reads were already printed, so only the SAM records are left to write
  paired_end_SAM_output($identifier,$sequence_1,$sequence_2,$methylation_call_params,$quality_value_1,$quality_value_2); # at the end of the script
}
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3394
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3395
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
sub convert_phred64_quals_to_phred33{
  ### Re-encodes a complete Phred+64 quality string as a Phred+33 (Sanger) quality string.
  ### Takes the Phred+64 string as its only argument and returns the converted string of the same length.
  my ($phred64_quality) = @_;

  my $phred33_quality = '';

  ### every character is first decoded into its Phred score and then re-encoded with the offset of 33
  foreach my $quality_char (split (//,$phred64_quality)){
    my $phred_score = convert_phred64_quality_string_into_phred_score ($quality_char);
    $phred33_quality .= convert_phred_score_into_phred33_quality_string ($phred_score);
  }

  return $phred33_quality;
}
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3411
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
sub convert_solexa_quals_to_phred33{
  ### Re-encodes a complete Solexa (pre-1.3 pipeline) quality string as a Phred+33 (Sanger) quality string.
  ### Takes the Solexa string as its only argument and returns the converted string of the same length.
  my ($solexa_quality) = @_;

  my $phred33_quality = '';

  ### every character is first decoded into a Phred score and then re-encoded with the offset of 33
  foreach my $quality_char (split (//,$solexa_quality)){
    my $phred_score = convert_solexa_pre1_3_quality_string_into_phred_score ($quality_char);
    $phred33_quality .= convert_phred_score_into_phred33_quality_string ($phred_score);
  }

  return $phred33_quality;
}
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3427
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
sub convert_phred_score_into_phred33_quality_string{
  ### Encodes a single numeric Phred score as its Phred+33 (Sanger) quality character,
  ### i.e. the character whose code is the score plus 33 (0 -> '!', 40 -> 'I').
  my ($phred_score) = @_;
  return chr($phred_score + 33);
}
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3433
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
sub convert_phred64_quality_string_into_phred_score{
  ### Decodes a single Phred+64 quality character into its numeric Phred score,
  ### i.e. the character code minus 64 ('@' -> 0, 'h' -> 40).
  my ($quality_char) = @_;
  return ord($quality_char) - 64;
}
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3439
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
sub convert_solexa_pre1_3_quality_string_into_phred_score{
  ### Decodes a single Solexa (pre-1.3 pipeline) quality character into a Phred score.
  ### Takes one quality character and returns the Phred score rounded to the nearest integer.
  ###
  ### Solexa quality characters use an ASCII offset of 64, exactly like Phred+64. ASCII 59 (';') is merely
  ### the lowest legal character and stands for a Solexa score of -5, so it must not be used as the offset:
  ### subtracting 59 inflates every score by 5 (e.g. 'h' would become 45 instead of 40).
  my ($quality_char) = @_;
  my $solexa_score = ord($quality_char) - 64;

  ### Solexa scores are log-odds based: Q_phred = 10 * log10( 10^(Q_solexa/10) + 1 ).
  ### The two scales are practically identical above ~10 and only diverge for low qualities.
  ### Perl's log() is the natural logarithm, hence the division by log(10).
  my $phred_score = 10 * log(10 ** ($solexa_score / 10) + 1) / log(10);

  ### the result is always positive, so adding 0.5 and truncating rounds to the nearest integer
  return int($phred_score + 0.5);
}
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3446
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3447
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
### Works out which of the 4 possible bisulfite strands a single-end alignment stems from, fetches the
### matching stretch of unconverted genomic sequence (read length + 2 extra bases) and stores the
### results on the read's entry in $methylation_call_params.
###
### Arguments:
###   $sequence_identifier     - key of the read inside $methylation_call_params
###   $methylation_call_params - hash ref; the read's entry supplies the fields index, chromosome,
###                              position (used directly as a 0-based substr offset) and bowtie_sequence
###
### Fields written to the read's entry:
###   alignment_strand, read_conversion, genome_conversion, unmodified_genomic_sequence, end_position
###
### Side effects: increments one strand counter in %counting and reads from %chromosomes (both are
### declared outside of this sub). Dies if the index is not 0, 1, 2 or 3.
sub extract_corresponding_genomic_sequence_single_end {
  my ($sequence_identifier,$methylation_call_params) = @_;

  ### the index tells us which read conversion / genome conversion combination produced the alignment
  my $index = $methylation_call_params->{$sequence_identifier}->{index};

  ### alignment_strand: strand of the genomic sequence the read is compared against
  ### read_conversion:  whether we will be looking for C->T or G->A substitutions in the methylation call
  my ($strand_counter,$alignment_strand,$read_conversion_info,$genome_conversion);

  if ($index == 0){
    ### CT converted read vs. CT converted genome [sequence originated from (converted) forward strand]
    ($strand_counter,$alignment_strand,$read_conversion_info,$genome_conversion) = ('CT_CT_count','+','CT','CT');
  }
  elsif ($index == 1){
    ### CT converted read vs. GA converted genome [sequence originated from (converted) reverse strand]
    ($strand_counter,$alignment_strand,$read_conversion_info,$genome_conversion) = ('CT_GA_count','-','CT','GA');
  }
  elsif ($index == 2){
    ### GA converted read vs. CT converted genome [sequence originated from complementary to (converted) forward strand]
    ($strand_counter,$alignment_strand,$read_conversion_info,$genome_conversion) = ('GA_CT_count','-','GA','CT');
  }
  elsif ($index == 3){
    ### GA converted read vs. GA converted genome [sequence originated from complementary to (converted) reverse strand]
    ($strand_counter,$alignment_strand,$read_conversion_info,$genome_conversion) = ('GA_GA_count','+','GA','GA');
  }
  else{
    die "Too many bowtie result filehandles\n";
  }
  $counting{$strand_counter}++;

  my $read        = $methylation_call_params->{$sequence_identifier};
  my $chr_name    = $read->{chromosome};
  my $position    = $read->{position};
  my $read_length = length($read->{bowtie_sequence});

  ### The 2 extra bases allow a CpG, CHG or CHH context call for a C at the very edge of the read.
  ### Alignments to the CT converted genome take them from behind the alignment end, alignments to the
  ### GA converted genome take them from in front of the alignment start.
  ### (the chromosome sequence is always accessed through %chromosomes so that it never gets copied)
  my $window_start;
  my $window_fits;
  if ($genome_conversion eq 'CT'){
    $window_start = $position;
    $window_fits  = length($chromosomes{$chr_name}) > $position + $read_length + 1;
  }
  else{
    $window_start = $position - 2;
    $window_fits  = $position - 2 >= 0;
  }

  ### right at the edge of a chromosome the window can't be extracted and the sequence stays empty
  my $non_bisulfite_sequence = '';
  if ($window_fits){
    $non_bisulfite_sequence = substr ($chromosomes{$chr_name},$window_start,$read_length + 2);
    ### '-' strand alignments are compared against the reverse complement of the forward strand
    if ($alignment_strand eq '-'){
      $non_bisulfite_sequence = reverse_complement($non_bisulfite_sequence);
    }
  }

  $read->{alignment_strand}            = $alignment_strand;
  $read->{read_conversion}             = $read_conversion_info;
  $read->{genome_conversion}           = $genome_conversion;
  $read->{unmodified_genomic_sequence} = $non_bisulfite_sequence;

  ### at this point we can also determine the end position of a read
  $read->{end_position} = $position + $read_length;
}
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3551
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
### PBAT flavour of the single-end genomic sequence extraction: the stored index is shifted by 2 before
### the strand is classified. The matching unconverted genomic sequence (read length + 2 extra bases)
### and the strand information are stored on the read's entry in $methylation_call_params.
###
### Arguments:
###   $sequence_identifier     - key of the read inside $methylation_call_params
###   $methylation_call_params - hash ref; the read's entry supplies the fields index, chromosome,
###                              position (used directly as a 0-based substr offset) and bowtie_sequence
###
### Fields written to the read's entry:
###   alignment_strand, read_conversion, genome_conversion, unmodified_genomic_sequence, end_position
###
### Side effects: increments one strand counter in %counting and reads from %chromosomes (both are
### declared outside of this sub). Dies if the shifted index is not 0, 1, 2 or 3.
sub extract_corresponding_genomic_sequence_single_end_pbat {
  my ($sequence_identifier,$methylation_call_params) = @_;

  ### indexes 0 and 1 are simply not run here, so the stored index is shifted by 2
  ### (for a non-negative stored index only the shifted values 2 and 3 can match below)
  my $pbat_index = $methylation_call_params->{$sequence_identifier}->{index} + 2;

  ### counter to increment, alignment strand, read conversion and genome conversion per shifted index:
  ###   0: CT converted read vs. CT converted genome [from (converted) forward strand]
  ###   1: CT converted read vs. GA converted genome [from (converted) reverse strand]
  ###   2: GA converted read vs. CT converted genome [from complementary to (converted) forward strand]
  ###   3: GA converted read vs. GA converted genome [from complementary to (converted) reverse strand]
  my ($strand_counter,$alignment_strand,$read_conversion_info,$genome_conversion)
    = $pbat_index == 0 ? ('CT_CT_count','+','CT','CT')
    : $pbat_index == 1 ? ('CT_GA_count','-','CT','GA')
    : $pbat_index == 2 ? ('GA_CT_count','-','GA','CT')
    : $pbat_index == 3 ? ('GA_GA_count','+','GA','GA')
    :                    die "Too many bowtie result filehandles\n";

  $counting{$strand_counter}++;

  my $read        = $methylation_call_params->{$sequence_identifier};
  my $chromosome  = $read->{chromosome};
  my $position    = $read->{position};
  my $read_length = length($read->{bowtie_sequence});

  ### The 2 extra bases allow a CpG, CHG or CHH context call for a C at the very edge of the read.
  ### For the GA converted genome they are taken from in front of the alignment start, for the CT
  ### converted genome from behind the alignment end.
  my $extra_bases_upstream = ($genome_conversion eq 'GA');
  my $offset = $extra_bases_upstream ? $position - 2 : $position;

  ### checking whether the window lies within the chromosome or whether we are right at its edge
  ### (the chromosome sequence is always accessed through %chromosomes so that it never gets copied)
  my $within_chromosome = $extra_bases_upstream
    ? ($offset >= 0)
    : (length($chromosomes{$chromosome}) > $position + $read_length + 1);

  my $non_bisulfite_sequence;
  if ($within_chromosome){
    $non_bisulfite_sequence = substr ($chromosomes{$chromosome},$offset,$read_length + 2);
    ### '-' strand alignments are compared against the reverse complement of the forward strand
    $non_bisulfite_sequence = reverse_complement($non_bisulfite_sequence) if ($alignment_strand eq '-');
  }
  else{
    $non_bisulfite_sequence = '';
  }

  $read->{alignment_strand}            = $alignment_strand;
  $read->{read_conversion}             = $read_conversion_info;
  $read->{genome_conversion}           = $genome_conversion;
  $read->{unmodified_genomic_sequence} = $non_bisulfite_sequence;

  ### at this point we can also determine the end position of a read
  $read->{end_position} = $position + $read_length;
}
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3657
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3658
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3659 sub extract_corresponding_genomic_sequence_single_end_bowtie2{
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3660 my ($sequence_identifier,$methylation_call_params) = @_;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3661
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3662 my $MD_tag = $methylation_call_params->{$sequence_identifier}->{mismatch_info};
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3663 my $cigar = $methylation_call_params->{$sequence_identifier}->{CIGAR};
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3664
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3665 ### A bisulfite sequence for 1 location in the genome can theoretically be any of the 4 possible converted strands. We are also giving the
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3666 ### sequence a 'memory' of the conversion we are expecting which we will need later for the methylation call
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3667
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3668 ### the alignment_strand information is needed to determine which strand of the genomic sequence we are comparing the read against,
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3669 ### the read_conversion information is needed to know whether we are looking for C->T or G->A substitutions
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3670 my $alignment_strand;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3671 my $read_conversion_info;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3672 my $genome_conversion;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3673 ### We are now extracting the corresponding genomic sequence, +2 extra bases at the end (or start) so that we can also make a CpG methylation call and
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3674 ### in addition make differential calls for Cs in CHG or CHH context if the C happens to be at the last (or first) position of the actually observed sequence
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3675 my $non_bisulfite_sequence = '';
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3676
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3677 ### Positions in SAM format are 1 based, so we need to subract 1 when getting substrings
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3678 my $pos = $methylation_call_params->{$sequence_identifier}->{position}-1;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3679
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3680 # parsing CIGAR string
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3681 my @len = split (/\D+/,$cigar); # storing the length per operation
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3682 my @ops = split (/\d+/,$cigar); # storing the operation
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3683 shift @ops; # remove the empty first element
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3684 die "CIGAR string contained a non-matching number of lengths and operations\n" unless (scalar @len == scalar @ops);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3685
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3686 ### If the sequence aligns best as CT converted reads vs. GA converted genome (OB, index 1) or GA converted reads vs. GA converted genome (CTOB, index 3)
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3687 if ( ($methylation_call_params->{$sequence_identifier}->{index} == 1) or ($methylation_call_params->{$sequence_identifier}->{index} == 3) ){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3688 ## checking if the substring will be valid or if we can't extract the sequence because we are right at the edge of a chromosome
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3689 unless ( ($pos-2) >= 0){ # exiting with en empty genomic sequence otherwise
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3690 $methylation_call_params->{$sequence_identifier}->{unmodified_genomic_sequence} = $non_bisulfite_sequence;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3691 return;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3692 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3693 $non_bisulfite_sequence .= substr ($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}},$pos-2,2);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3694 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3695 my $indels = 0;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3696
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3697 foreach (0..$#len){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3698 if ($ops[$_] eq 'M'){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3699 #extracting genomic sequence
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3700 $non_bisulfite_sequence .= substr ($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}},$pos,$len[$_]);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3701 # adjusting position
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3702 $pos += $len[$_];
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3703 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3704 elsif ($ops[$_] eq 'I'){ # insertion in the read sequence
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3705 # we simply add padding Ns instead of finding genomic sequence. This will not be used to infer methylation calls
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3706 $non_bisulfite_sequence .= 'N' x $len[$_];
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3707 # warn "$non_bisulfite_sequence\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3708 # position doesn't need to be adjusting
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3709 $indels += $len[$_]; # adding this to $indels so we can determine the hemming distance for the SAM output (= single-base substitutions (mismatches, insertions, deletions)
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3710 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3711 elsif ($ops[$_] eq 'D'){ # deletion in the read sequence
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3712 # we do not add any genomic sequence but only adjust the position
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3713 $pos += $len[$_];
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3714 $indels += $len[$_]; # adding this to $indels so we can determine the hemming distance for the SAM output (= single-base substitutions (mismatches, insertions, deletions)
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3715 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3716 elsif($cigar =~ tr/[NSHPX=]//){ # if these (for standard mapping) illegal characters exist we die
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3717 die "The CIGAR string contained illegal CIGAR operations in addition to 'M', 'I' and 'D': $cigar\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3718 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3719 else{
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3720 die "The CIGAR string contained undefined CIGAR operations in addition to 'M', 'I' and 'D': $cigar\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3721 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3722 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3723
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3724 ### If the sequence aligns best as CT converted reads vs. CT converted genome (OT, index 0) or GA converted reads vs. CT converted genome (CTOT, index 2)
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3725 if ( ($methylation_call_params->{$sequence_identifier}->{index} == 0) or ($methylation_call_params->{$sequence_identifier}->{index} == 2) ){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3726 ## checking if the substring will be valid or if we can't extract the sequence because we are right at the edge of a chromosome
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3727 unless (length($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}}) >= $pos+2){ # exiting with en empty genomic sequence otherwise
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3728 $methylation_call_params->{$sequence_identifier}->{unmodified_genomic_sequence} = $non_bisulfite_sequence;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3729 return;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3730 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3731 $non_bisulfite_sequence .= substr ($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}},$pos,2);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3732 # print "$methylation_call_params->{$sequence_identifier}->{bowtie_sequence}\n$non_bisulfite_sequence\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3733 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3734
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3735
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3736
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3737 ### results from CT converted read vs. CT converted genome (+ orientation alignments are reported only)
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3738 if ($methylation_call_params->{$sequence_identifier}->{index} == 0){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3739 ### [Index 0, sequence originated from (converted) forward strand]
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3740 $counting{CT_CT_count}++;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3741 $alignment_strand = '+';
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3742 $read_conversion_info = 'CT';
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3743 $genome_conversion = 'CT';
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3744 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3745
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3746 ### results from CT converted reads vs. GA converted genome (- orientation alignments are reported only)
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3747 elsif ($methylation_call_params->{$sequence_identifier}->{index} == 1){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3748 ### [Index 1, sequence originated from (converted) reverse strand]
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3749 $counting{CT_GA_count}++;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3750 $alignment_strand = '-';
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3751 $read_conversion_info = 'CT';
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3752 $genome_conversion = 'GA';
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3753
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3754 ### reverse complement!
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3755 $non_bisulfite_sequence = reverse_complement($non_bisulfite_sequence);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3756 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3757
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3758 ### results from GA converted reads vs. CT converted genome (- orientation alignments are reported only)
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3759 elsif ($methylation_call_params->{$sequence_identifier}->{index} == 2){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3760 ### [Index 2, sequence originated from complementary to (converted) forward strand]
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3761 $counting{GA_CT_count}++;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3762 $alignment_strand = '-';
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3763 $read_conversion_info = 'GA';
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3764 $genome_conversion = 'CT';
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3765
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3766 ### reverse complement!
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3767 $non_bisulfite_sequence = reverse_complement($non_bisulfite_sequence);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3768 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3769
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3770 ### results from GA converted reads vs. GA converted genome (+ orientation alignments are reported only)
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3771 elsif ($methylation_call_params->{$sequence_identifier}->{index} == 3){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3772 ### [Index 3, sequence originated from complementary to (converted) reverse strand]
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3773 $counting{GA_GA_count}++;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3774 $alignment_strand = '+';
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3775 $read_conversion_info = 'GA';
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3776 $genome_conversion = 'GA';
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3777
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3778 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3779 else{
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3780 die "Too many Bowtie 2 result filehandles\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3781 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3782
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3783 $methylation_call_params->{$sequence_identifier}->{alignment_strand} = $alignment_strand;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3784 $methylation_call_params->{$sequence_identifier}->{read_conversion} = $read_conversion_info;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3785 $methylation_call_params->{$sequence_identifier}->{genome_conversion} = $genome_conversion;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3786 $methylation_call_params->{$sequence_identifier}->{unmodified_genomic_sequence} = $non_bisulfite_sequence;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3787
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3788 ### the end position of a read is stored in $pos
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3789 $methylation_call_params->{$sequence_identifier}->{end_position} = $pos;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3790 $methylation_call_params->{$sequence_identifier}->{indels} = $indels;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3791 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3792
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3793 ### METHYLATION CALL
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3794
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
sub methylation_call{
  ### Compares a read base by base with its genomic sequence and returns the methylation call string.
  ###
  ### Arguments (positional):
  ###   $identifier                  read ID (accepted for interface compatibility; not needed for the call itself)
  ###   $sequence_actually_observed  the read sequence as it was observed
  ###   $genomic_sequence            the genomic sequence the read aligned to; expected to be 2 bp longer than the read
  ###                                so that the sequence context of every cytosine can be determined
  ###   $read_conversion             'CT' or 'GA'
  ###
  ### Returns a string with one character per read base:
  ###
  #################################################################
  ### . for bases not involving cytosines                       ###
  ### X for methylated C in CHG context (was protected)         ###
  ### x for not methylated C in CHG context (was converted)     ###
  ### H for methylated C in CHH context (was protected)         ###
  ### h for not methylated C in CHH context (was converted)     ###
  ### Z for methylated C in CpG context (was protected)         ###
  ### z for not methylated C in CpG context (was converted)     ###
  #################################################################
  ###
  ### Side effect: the six total_* methylation counters in %counting are incremented by the number of calls
  ### made for this read. Dies if $read_conversion is neither 'CT' nor 'GA'.
  my ($identifier,$sequence_actually_observed,$genomic_sequence,$read_conversion) = @_;

  ### splitting both the actually observed sequence and the genomic sequence up into single bases so we can compare them one by one
  my @seq     = split(//,$sequence_actually_observed);
  my @genomic = split(//,$genomic_sequence);

  ### the genomic sequence needs to be 2 bp longer than the read (numeric comparison of the two lengths)
  warn "length of \@seq: ",scalar @seq,"\tlength of \@genomic: ",scalar @genomic,"\n" unless (scalar @seq == (scalar @genomic - 2));

  ### Both read conversions are handled by the same loop, they only differ in:
  ###   $offset        where read position 0 is located within @genomic
  ###   $cytosine      the genomic base that represents a cytosine (a G if the C is on the opposing strand)
  ###   $converted     the read base observed if that cytosine was bisulfite converted (i.e. not methylated)
  ###   $context_base  the neighbouring genomic base that defines CpG or CHG context
  ###   $step          direction in which the context bases are found (+1 downstream, -1 upstream)
  my ($offset,$cytosine,$converted,$context_base,$step);
  if ($read_conversion eq 'CT'){
    ($offset,$cytosine,$converted,$context_base,$step) = (0,'C','T','G',1);
  }
  elsif ($read_conversion eq 'GA'){
    ### the 2 additional genomic bases are located upstream of the read here
    ($offset,$cytosine,$converted,$context_base,$step) = (2,'G','A','C',-1);
  }
  else{
    die "Strand conversion info is required to perform a methylation call\n";
  }

  my %calls = (Z => 0, z => 0, X => 0, x => 0, H => 0, h => 0);
  my $methylation_call = '';

  for my $index (0..$#seq){
    my $genomic_pos  = $index + $offset;
    ### a genomic sequence that is too short (or empty) is treated like a non-cytosine position
    my $genomic_base = defined $genomic[$genomic_pos] ? $genomic[$genomic_pos] : '';

    ### positions which are not a cytosine in the genome are not of interest for a methylation call
    if ($genomic_base ne $cytosine){
      $methylation_call .= '.';
      next;
    }

    my $methylated;
    if ($seq[$index] eq $genomic_base){
      $methylated = 1; # the residue was not converted, i.e. it was protected by methylation
    }
    elsif ($seq[$index] eq $converted){
      $methylated = 0; # the residue was converted, i.e. it was not methylated
    }
    else{
      ### all other mismatches are not of interest for a methylation call
      $methylation_call .= '.';
      next;
    }

    ### determining the sequence context from the first and, if required, the second neighbouring base
    my $first_neighbour  = $genomic[$genomic_pos + $step];
    my $second_neighbour = $genomic[$genomic_pos + 2*$step];

    my $call;
    if (defined $first_neighbour and $first_neighbour eq $context_base){
      $call = 'z'; # CpG context
    }
    elsif (defined $second_neighbour and $second_neighbour eq $context_base){
      $call = 'x'; # CHG context
    }
    else{
      $call = 'h'; # CHH context
    }
    $call = uc $call if $methylated; # upper case letters denote methylated cytosines

    ++$calls{$call};
    $methylation_call .= $call;
  }

  $counting{total_meCHH_count} += $calls{H};
  $counting{total_meCHG_count} += $calls{X};
  $counting{total_meCpG_count} += $calls{Z};
  $counting{total_unmethylated_CHH_count} += $calls{h};
  $counting{total_unmethylated_CHG_count} += $calls{x};
  $counting{total_unmethylated_CpG_count} += $calls{z};

  return $methylation_call;
}
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3979
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
sub read_genome_into_memory{
  ## Reads all FastA files (*.fa or, if there are none, *.fasta) found in $genome_folder and stores their
  ## sequences, converted to upper case, in the %chromosomes hash using the chromosome names as keys.
  ## Single- and multi-FastA files are supported.
  ##
  ## Argument: the working directory we need to move back to once the genome has been read.
  ## Dies if the genome folder cannot be entered, if it does not contain any FastA files, if a file
  ## cannot be opened or if a chromosome name occurs more than once.

  ## working directory
  my $cwd = shift;

  ## reading in and storing the specified genome in the %chromosomes hash
  chdir ($genome_folder) or die "Can't move to $genome_folder: $!";
  print "Now reading in and storing sequence information of the genome specified in: $genome_folder\n\n";

  my @chromosome_filenames = glob('*.fa');

  ### if there aren't any genomic files with the extension .fa we will look for files with the extension .fasta
  unless (@chromosome_filenames){
    @chromosome_filenames = glob('*.fasta');
  }

  unless (@chromosome_filenames){
    die "The specified genome folder $genome_folder does not contain any sequence files in FastA format (with .fa or .fasta file extensions)\n";
  }

  foreach my $chromosome_filename (@chromosome_filenames){

    ## 3-argument open with a lexical filehandle so that the file name is never interpreted as an open mode
    open (my $chr_in,'<',$chromosome_filename) or die "Failed to read from sequence file $chromosome_filename $!\n";

    ### first line needs to be a fastA header
    my $first_line = <$chr_in>;
    $first_line = '' unless defined $first_line; # empty file, rejected by extract_chromosome_name below
    chomp $first_line;
    $first_line =~ s/\r//;

    ### Extracting chromosome name from the FastA header
    my $chromosome_name = extract_chromosome_name($first_line);

    ## starting out with an empty string so that entries without any sequence are reported as 0 bp
    my $sequence = '';

    while (my $line = <$chr_in>){
      chomp $line;
      $line =~ s/\r//;

      if ($line =~ /^>/){
        ### storing the previous chromosome in the %chromosomes hash, only relevant for Multi-Fasta-Files (MFA)
        if (exists $chromosomes{$chromosome_name}){
          print "chr $chromosome_name (",length $sequence ," bp)\n";
          die "Exiting because chromosome name already exists. Please make sure all chromosomes have a unique name!\n";
        }
        else {
          if (length($sequence) == 0){
            warn "Chromosome $chromosome_name in the multi-fasta file $chromosome_filename did not contain any sequence information!\n";
          }
          print "chr $chromosome_name (",length $sequence ," bp)\n";
          $chromosomes{$chromosome_name} = $sequence;
        }
        ### resetting the sequence variable
        $sequence = '';
        ### setting new chromosome name
        $chromosome_name = extract_chromosome_name($line);
      }
      else{
        $sequence .= uc $line;
      }
    }

    close ($chr_in) or warn "Failed to close filehandle for sequence file $chromosome_filename: $!\n";

    ### storing the last (or only) chromosome of the file
    if (exists $chromosomes{$chromosome_name}){
      print "chr $chromosome_name (",length $sequence ," bp)\t";
      die "Exiting because chromosome name already exists. Please make sure all chromosomes have a unique name.\n";
    }
    else{
      if (length($sequence) == 0){
        warn "Chromosome $chromosome_name in the file $chromosome_filename did not contain any sequence information!\n";
      }
      print "chr $chromosome_name (",length $sequence ," bp)\n";
      $chromosomes{$chromosome_name} = $sequence;
    }
  }
  print "\n";
  chdir $cwd or die "Failed to move to directory $cwd\n";
}
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
4051
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
sub extract_chromosome_name {
  ## Bowtie seems to use the first string following the initial > of a FASTA header as the sequence name,
  ## so we are extracting the chromosome name in the same way. Dies if the line is not a FASTA header.
  my $fasta_header = shift;

  ## a FASTA header has to start with a >
  unless ($fasta_header =~ /^>/){
    die "The specified chromosome ($fasta_header) file doesn't seem to be in FASTA format as required!\n";
  }

  ## dropping the leading > and keeping everything up to the first whitespace
  my @header_fields = split (/\s+/,substr($fasta_header,1));
  return $header_fields[0];
}
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
4063
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
sub reverse_complement{
  ## Returns the reverse complement of the sequence passed in.
  ## Only upper-case C, A, T and G are complemented; any other character is kept
  ## unchanged and merely ends up at its mirrored position.
  my $sequence = shift;

  # reverse a copy first, then complement it in place (the two steps commute)
  (my $revcomp = reverse($sequence)) =~ tr/CATG/GTAC/;
  return $revcomp;
}
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
4070
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
sub biTransformFastAFiles {
  ## Writes bisulfite converted copies of a single-end FastA read file into $temp_dir:
  ## always a C -> T converted version and, unless $directional is set, a G -> A version too.
  ##
  ## Argument: path of the input file; files ending in .gz are read through zcat.
  ## Returns : ($C_to_T_infile,$G_to_A_infile), the output file names without the $temp_dir
  ##           prefix. The G -> A name is returned even if $directional suppressed that file.
  ## Reads the file-level settings $skip, $upto, $gzip, $directional and $temp_dir, and counts
  ## read IDs containing tab stops in $seqID_contains_tabs.
  ## Dies if a file can't be opened, an output file can't be closed, or a header line does
  ## not start with '>'.
  my $file = shift;
  my ($dir,$filename);
  if ($file =~ /\//){
    ($dir,$filename) = $file =~ m/(.*\/)(.*)$/;
  }
  else{
    $filename = $file;
  }

  ### gzipped version of the infile
  if ($file =~ /\.gz$/){
    # list form of the pipe open: the file name is passed to zcat as is and never seen by a shell
    open (IN,'-|','zcat',$file) or die "Couldn't read from file $file: $!\n";
  }
  else{
    # 3-argument open so that no character of the file name can be taken as an open mode
    open (IN,'<',$file) or die "Couldn't read from file $file: $!\n";
  }

  if ($skip){
    warn "Skipping the first $skip reads from $file\n";
    sleep (1);
  }
  if ($upto){
    warn "Processing reads up to sequence no. $upto from $file\n";
    sleep (1);
  }

  my $C_to_T_infile = my $G_to_A_infile = $filename;

  if ($gzip){
    $C_to_T_infile =~ s/$/_C_to_T.fa.gz/;
    $G_to_A_infile =~ s/$/_G_to_A.fa.gz/;
  }
  else{
    $C_to_T_infile =~ s/$/_C_to_T.fa/;
    $G_to_A_infile =~ s/$/_G_to_A.fa/;
  }

  warn "Writing a C -> T converted version of the input file $filename to $temp_dir$C_to_T_infile\n";

  if ($gzip){
    open (CTOT,"| gzip -c - > ${temp_dir}${C_to_T_infile}") or die "Can't write to file: $!\n";
  }
  else{
    open (CTOT,'>',"$temp_dir$C_to_T_infile") or die "Couldn't write to file $!\n";
  }

  unless ($directional){
    warn "Writing a G -> A converted version of the input file $filename to $temp_dir$G_to_A_infile\n";
    if ($gzip){
      open (GTOA,"| gzip -c - > ${temp_dir}${G_to_A_infile}") or die "Can't write to file: $!\n";
    }
    else{
      open (GTOA,'>',"$temp_dir$G_to_A_infile") or die "Couldn't write to file $!\n";
    }
  }

  # number of records read so far; this includes skipped records and, if $upto ended the
  # loop, the record that exceeded $upto
  my $count = 0;

  # records are expected as pairs of lines: one header line followed by one sequence line
  while (1){
    my $header = <IN>;
    my $sequence= <IN>;
    last unless ($header and $sequence);

    $header = fix_IDs($header); # this is to avoid problems with truncated read ID when they contain white spaces

    ++$count;

    if ($skip){
      next unless ($count > $skip);
    }
    if ($upto){
      last if ($count > $upto);
    }

    $sequence = uc$sequence; # make input file case insensitive

    # detecting if the input file contains tab stops, as this is likely to result in no alignments
    if (index($header,"\t") != -1){
      $seqID_contains_tabs++;
    }

    ### small check if the sequence seems to be in FastA format
    die "Input file doesn't seem to be in FastA format at sequence $count: $!\n" unless ($header =~ /^>.*/);

    my $sequence_C_to_T = $sequence;
    $sequence_C_to_T =~ tr/C/T/;
    print CTOT "$header$sequence_C_to_T";

    unless ($directional){
      my $sequence_G_to_A = $sequence;
      $sequence_G_to_A =~ tr/G/A/;
      print GTOA "$header$sequence_G_to_A";
    }
  }

  # Release the input handle (and reap zcat for gzipped input). The return value is not
  # checked on purpose: if $upto ended the loop early, zcat is terminated before it is done
  # and close() then reports failure although nothing went wrong.
  close IN;

  close CTOT or die "Failed to close filehandle $!\n";

  if ($directional){
    warn "\nCreated C -> T converted versions of the FastA file $filename ($count sequences in total)\n\n";
  }
  else{
    close GTOA or die "Failed to close filehandle $!\n";
    warn "\nCreated C -> T as well as G -> A converted versions of the FastA file $filename ($count sequences in total)\n\n";
  }
  return ($C_to_T_infile,$G_to_A_infile);
}
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
4177
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
sub biTransformFastAFiles_paired_end {
  ## Writes bisulfite converted copies of one file of a paired-end FastA data set into $temp_dir.
  ##
  ## Arguments: path of the input file (files ending in .gz are read through zcat) and the
  ##            read number (1 or 2) this file represents.
  ## If $directional is set, read 1 is written C -> T converted only and read 2 G -> A
  ## converted only; otherwise both conversions are written for the file.
  ## Every read ID gets /1 or /2 appended (/1/1 or /2/2 if $bowtie2 is set).
  ## Returns : the name(s) of the file(s) written, without the $temp_dir prefix:
  ##           ($C_to_T_infile) or ($G_to_A_infile) if $directional is set, else both.
  ## Reads the file-level settings $skip, $upto, $gzip, $directional, $bowtie2 and $temp_dir,
  ## and counts read IDs containing tab stops in $seqID_contains_tabs.
  ## Dies if a file can't be opened, an output file can't be closed, a header line does not
  ## start with '>' or the read number is neither 1 nor 2.
  my ($file,$read_number) = @_;

  if ($gzip){
    warn "GZIP compression of temporary files is not supported for paired-end FastA data. Continuing to write uncompressed files\n";
    sleep (2);
  }

  my ($dir,$filename);
  if ($file =~ /\//){
    ($dir,$filename) = $file =~ m/(.*\/)(.*)$/;
  }
  else{
    $filename = $file;
  }

  ### gzipped version of the infile
  if ($file =~ /\.gz$/){
    # list form of the pipe open: the file name is passed to zcat as is and never seen by a shell
    open (IN,'-|','zcat',$file) or die "Couldn't read from file $file: $!\n";
  }
  else{
    # 3-argument open so that no character of the file name can be taken as an open mode
    open (IN,'<',$file) or die "Couldn't read from file $file: $!\n";
  }

  if ($skip){
    warn "Skipping the first $skip reads from $file\n";
    sleep (1);
  }
  if ($upto){
    warn "Processing reads up to sequence no. $upto from $file\n";
    sleep (1);
  }

  my $C_to_T_infile = my $G_to_A_infile = $filename;
  $C_to_T_infile =~ s/$/_C_to_T.fa/;
  $G_to_A_infile =~ s/$/_G_to_A.fa/;

  if ($directional){
    if ($read_number == 1){
      warn "Writing a C -> T converted version of the input file $filename to $temp_dir$C_to_T_infile\n";
      open (CTOT,'>',"$temp_dir$C_to_T_infile") or die "Couldn't write to file $!\n";
    }
    elsif ($read_number == 2){
      warn "Writing a G -> A converted version of the input file $filename to $temp_dir$G_to_A_infile\n";
      open (GTOA,'>',"$temp_dir$G_to_A_infile") or die "Couldn't write to file $!\n";
    }
    else{
      die "Read number needs to be 1 or 2, but was: $read_number\n\n";
    }
  }
  else{ # all four strand output
    warn "Writing a C -> T converted version of the input file $filename to $temp_dir$C_to_T_infile\n";
    warn "Writing a G -> A converted version of the input file $filename to $temp_dir$G_to_A_infile\n";
    open (CTOT,'>',"$temp_dir$C_to_T_infile") or die "Couldn't write to file $!\n";
    open (GTOA,'>',"$temp_dir$G_to_A_infile") or die "Couldn't write to file $!\n";
  }

  # number of records read so far; this includes skipped records and, if $upto ended the
  # loop, the record that exceeded $upto
  my $count = 0;

  # records are expected as pairs of lines: one header line followed by one sequence line
  while (1){
    my $header = <IN>;
    my $sequence= <IN>;
    last unless ($header and $sequence);

    $header = fix_IDs($header); # this is to avoid problems with truncated read ID when they contain white spaces

    ++$count;

    if ($skip){
      next unless ($count > $skip);
    }
    if ($upto){
      last if ($count > $upto);
    }

    $sequence = uc$sequence; # make input file case insensitive

    # detecting if the input file contains tab stops, as this is likely to result in no alignments
    if (index($header,"\t") != -1){
      $seqID_contains_tabs++;
    }

    ## small check if the sequence seems to be in FastA format
    die "Input file doesn't seem to be in FastA format at sequence $count: $!\n" unless ($header =~ /^>/);

    # tag the read ID with its read number; s/$/.../ inserts in front of a trailing newline
    if ($read_number == 1){
      if ($bowtie2){
        $header =~ s/$/\/1\/1/;
      }
      else{
        $header =~ s/$/\/1/;
      }
    }
    elsif ($read_number == 2){
      if ($bowtie2){
        $header =~ s/$/\/2\/2/;
      }
      else{
        $header =~ s/$/\/2/;
      }
    }
    else{
      die "Read number needs to be 1 or 2, but was: $read_number\n\n";
    }
    my $sequence_C_to_T = my $sequence_G_to_A = $sequence;

    $sequence_C_to_T =~ tr/C/T/;
    $sequence_G_to_A =~ tr/G/A/;

    if ($directional){

      if ($read_number == 1){
        print CTOT "$header$sequence_C_to_T";
      }
      elsif ($read_number == 2){
        print GTOA "$header$sequence_G_to_A";
      }
    }
    else{
      print CTOT "$header$sequence_C_to_T";
      print GTOA "$header$sequence_G_to_A";
    }
  }

  # Release the input handle (and reap zcat for gzipped input). The return value is not
  # checked on purpose: if $upto ended the loop early, zcat is terminated before it is done
  # and close() then reports failure although nothing went wrong.
  close IN;

  # Close exactly the output handles opened above, so that all buffered reads are on disk
  # before the file names are handed back and write errors do not go unnoticed.
  if ($directional){
    if ($read_number == 1){
      close CTOT or die "Failed to close filehandle $!\n";
      warn "\nCreated C -> T converted version of the FastA file $filename ($count sequences in total)\n\n";
      return ($C_to_T_infile);
    }
    else{
      close GTOA or die "Failed to close filehandle $!\n";
      warn "\nCreated G -> A converted version of the FastA file $filename ($count sequences in total)\n\n";
      return ($G_to_A_infile);
    }
  }
  else{
    close CTOT or die "Failed to close filehandle $!\n";
    close GTOA or die "Failed to close filehandle $!\n";
    warn "\nCreated C -> T as well as G -> A converted versions of the FastA file $filename ($count sequences in total)\n\n";
    return ($C_to_T_infile,$G_to_A_infile);
  }
}
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
4326
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
4327
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
4328 sub biTransformFastQFiles {
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
4329 my $file = shift;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
4330 my ($dir,$filename);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
4331 if ($file =~ /\//){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
4332 ($dir,$filename) = $file =~ m/(.*\/)(.*)$/;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
4333 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
4334 else{
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
4335 $filename = $file;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
4336 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
4337
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
4338 ### gzipped version of the infile
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
4339 if ($file =~ /\.gz$/){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
4340 open (IN,"zcat $file |") or die "Couldn't read from file $file: $!\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
4341 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
4342 else{
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
4343 open (IN,$file) or die "Couldn't read from file $file: $!\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
4344 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
4345
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
4346 if ($skip){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
4347 warn "Skipping the first $skip reads from $file\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
4348 sleep (1);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
4349 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
4350 if ($upto){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
4351 warn "Processing reads up to sequence no. $upto from $file\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
4352 sleep (1);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
4353 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
4354
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
4355 my $C_to_T_infile = my $G_to_A_infile = $filename;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
4356
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
4357 if ($pbat){ # PBAT-Seq
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
4358 if ($gzip){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
4359 $G_to_A_infile =~ s/$/_G_to_A.fastq.gz/;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
4360 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
4361 else{
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
4362 $G_to_A_infile =~ s/$/_G_to_A.fastq/;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
4363 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
4364
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
4365 warn "Writing a G -> A converted version of the input file $filename to $temp_dir$G_to_A_infile\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
4366
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
4367 if ($gzip){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
4368 open (GTOA,"| gzip -c - > ${temp_dir}${G_to_A_infile}") or die "Can't write to file: $!\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
4369 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
4370 else{
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
4371 open (GTOA,'>',"$temp_dir$G_to_A_infile") or die "Couldn't write to file $!\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
4372 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
4373 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
4374 else{ # directional or non-directional
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
4375 if ($gzip){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
4376 $C_to_T_infile =~ s/$/_C_to_T.fastq.gz/;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
4377 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
4378 else{
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
4379 $C_to_T_infile =~ s/$/_C_to_T.fastq/;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
4380 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
4381
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
4382 warn "Writing a C -> T converted version of the input file $filename to $temp_dir$C_to_T_infile\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
4383
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
4384 if ($gzip){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
4385 open (CTOT,"| gzip -c - > ${temp_dir}${C_to_T_infile}") or die "Can't write to file: $!\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
4386 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
4387 else{
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
4388 open (CTOT,'>',"$temp_dir$C_to_T_infile") or die "Couldn't write to file $!\n"; # uncompressed option
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
4389 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
4390
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
4391 unless ($directional){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
4392 if ($gzip){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
4393 $G_to_A_infile =~ s/$/_G_to_A.fastq.gz/;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
4394 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
4395 else{
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
4396 $G_to_A_infile =~ s/$/_G_to_A.fastq/;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
4397 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
4398
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
4399 warn "Writing a G -> A converted version of the input file $filename to $temp_dir$G_to_A_infile\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
4400
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
4401 if ($gzip){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
4402 open (GTOA,"| gzip -c - > ${temp_dir}${G_to_A_infile}") or die "Can't write to file: $!\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
4403 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
4404 else{
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
4405 open (GTOA,'>',"$temp_dir$G_to_A_infile") or die "Couldn't write to file $!\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
4406 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
4407 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
4408 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
4409
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
4410 my $count = 0;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
4411 while (1){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
4412 my $identifier = <IN>;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
4413 my $sequence = <IN>;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
4414 my $identifier2 = <IN>;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
4415 my $quality_score = <IN>;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
4416 last unless ($identifier and $sequence and $identifier2 and $quality_score);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
4417
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
4418 $identifier = fix_IDs($identifier); # this is to avoid problems with truncated read ID when they contain white spaces
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
4419
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
4420 ++$count;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
4421
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
4422 if ($skip){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
4423 next unless ($count > $skip);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
4424 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
4425 if ($upto){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
4426 last if ($count > $upto);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
4427 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
4428
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
4429 $sequence = uc$sequence; # make input file case insensitive
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
4430
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
4431 # detecting if the input file contains tab stops, as this is likely to result in no alignments
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
4432 if (index($identifier,"\t") != -1){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
4433 $seqID_contains_tabs++;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
4434 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
4435
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
4436 ## small check if the sequence file appears to be a FastQ file
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
4437 if ($count == 1){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
4438 if ($identifier !~ /^\@/ or $identifier2 !~ /^\+/){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
4439 die "Input file doesn't seem to be in FastQ format at sequence $count: $!\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
4440 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
4441 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
4442
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
4443 if ($pbat){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
4444 my $sequence_G_to_A = $sequence;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
4445 $sequence_G_to_A =~ tr/G/A/;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
4446 print GTOA join ('',$identifier,$sequence_G_to_A,$identifier2,$quality_score);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
4447 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
4448 else{ # directional or non-directional
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
4449 my $sequence_C_to_T = $sequence;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
4450 $sequence_C_to_T =~ tr/C/T/;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
4451 print CTOT join ('',$identifier,$sequence_C_to_T,$identifier2,$quality_score);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
4452
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
4453 unless ($directional){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
4454 my $sequence_G_to_A = $sequence;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
4455 $sequence_G_to_A =~ tr/G/A/;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
4456 print GTOA join ('',$identifier,$sequence_G_to_A,$identifier2,$quality_score);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
4457 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
4458 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
4459 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
4460
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
4461 if ($directional){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
4462 close CTOT or die "Failed to close filehandle $!\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
4463 warn "\nCreated C -> T converted version of the FastQ file $filename ($count sequences in total)\n\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
4464 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
4465 elsif($pbat){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
4466 warn "\nCreated G -> A converted version of the FastQ file $filename ($count sequences in total)\n\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
4467 close GTOA or die "Failed to close filehandle $!\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
4468 return ($G_to_A_infile);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
4469 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
4470 else{
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
4471 close CTOT or die "Failed to close filehandle $!\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
4472 close GTOA or die "Failed to close filehandle $!\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
4473 warn "\nCreated C -> T as well as G -> A converted versions of the FastQ file $filename ($count sequences in total)\n\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
4474 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
4475
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
4476 return ($C_to_T_infile,$G_to_A_infile);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
4477 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
4478
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
### Writes bisulfite-converted copy/copies of one mate file of a paired-end FastQ library.
###
### Arguments: $file        - FastQ file to convert (read through zcat if its name ends in .gz)
###            $read_number - 1 or 2; selects the suffix appended to each read ID (/1 or /2, doubled
###                           to /1/1 or /2/2 when $bowtie2 is set) and, in directional mode, which
###                           conversion is written
###
### In directional mode only a C -> T version (read 1) or only a G -> A version (read 2) is written;
### otherwise both versions are written. $skip, $upto, $gzip, $directional, $bowtie2 and $temp_dir are
### read from variables declared outside this sub.
###
### Returns the name(s) of the converted file(s) without the $temp_dir prefix: ($C_to_T_infile),
### ($G_to_A_infile) or ($C_to_T_infile,$G_to_A_infile).
sub biTransformFastQFiles_paired_end {
  my ($file,$read_number) = @_;

  # only the part after the last '/' is used to name the converted files
  my $filename = $file;
  if ($file =~ /\//){
    ($filename) = $file =~ m/.*\/(.*)$/;
  }

  ### gzipped version of the infile
  # list-form pipe open / 3-arg open: the file name never passes through a shell, so names
  # containing spaces or shell metacharacters are read correctly
  my $in;
  if ($file =~ /\.gz$/){
    open ($in,'-|','zcat',$file) or die "Couldn't read from file $file: $!\n";
  }
  else{
    open ($in,'<',$file) or die "Couldn't read from file $file: $!\n";
  }

  if ($skip){
    warn "Skipping the first $skip reads from $file\n";
    sleep (1);
  }
  if ($upto){
    warn "Processing reads up to sequence no. $upto from $file\n";
    sleep (1);
  }

  my $extension = $gzip ? '.fastq.gz' : '.fastq';
  my $C_to_T_infile = "${filename}_C_to_T$extension";
  my $G_to_A_infile = "${filename}_G_to_A$extension";

  # opens one converted output file inside $temp_dir, gzip-compressed if $gzip is set
  # NOTE(review): the gzip branch still hands the output path to a shell; confirm that $temp_dir
  # and the input file names cannot contain shell metacharacters
  my $open_outfile = sub {
    my ($outfile) = @_;
    my $fh;
    if ($gzip){
      open ($fh,"| gzip -c - > ${temp_dir}${outfile}") or die "Can't write to file: $!\n";
    }
    else{
      open ($fh,'>',"$temp_dir$outfile") or die "Couldn't write to file $!\n";
    }
    return $fh;
  };

  # a handle that stays undefined means that conversion is not written in this run
  my ($ctot,$gtoa);

  if ($directional){
    if ($read_number == 1){
      warn "Writing a C -> T converted version of the input file $filename to $temp_dir$C_to_T_infile\n";
      $ctot = $open_outfile->($C_to_T_infile);
    }
    elsif ($read_number == 2){
      warn "Writing a G -> A converted version of the input file $filename to $temp_dir$G_to_A_infile\n";
      $gtoa = $open_outfile->($G_to_A_infile);
    }
    else{
      die "Read number needs to be 1 or 2, but was $read_number!\n\n";
    }
  }
  else{
    warn "Writing a C -> T converted version of the input file $filename to $temp_dir$C_to_T_infile\n";
    warn "Writing a G -> A converted version of the input file $filename to $temp_dir$G_to_A_infile\n";
    $ctot = $open_outfile->($C_to_T_infile);
    $gtoa = $open_outfile->($G_to_A_infile);
  }

  # the suffix appended to every read ID does not change between records, so work it out once
  my $id_suffix;
  if ($read_number == 1){
    $id_suffix = $bowtie2 ? '/1/1' : '/1';
  }
  elsif ($read_number == 2){
    $id_suffix = $bowtie2 ? '/2/2' : '/2';
  }
  else{
    die "Read number needs to be 1 or 2\n";
  }

  my $count = 0;
  while (1){
    my $identifier = <$in>;
    my $sequence = <$in>;
    my $identifier2 = <$in>;
    my $quality_score = <$in>;
    # stop at the end of the input; an incomplete trailing record is dropped
    last unless (defined $identifier and defined $sequence and defined $identifier2 and defined $quality_score);
    ++$count;

    ## small check if the sequence file appears to be a FastQ file
    ## (done before the --skip test so that the first record is always checked)
    if ($count == 1){
      if ($identifier !~ /^\@/ or $identifier2 !~ /^\+/){
        die "Input file doesn't seem to be in FastQ format at sequence $count: $!\n";
      }
    }

    if ($skip){
      next unless ($count > $skip);
    }
    if ($upto){
      last if ($count > $upto);
    }

    $identifier = fix_IDs($identifier); # this is to avoid problems with truncated read ID when they contain white spaces
    $identifier =~ s/$/$id_suffix/;     # $ matches before the trailing newline, so the line ending is kept

    $sequence = uc$sequence; # make input file case insensitive

    if ($ctot){
      (my $sequence_C_to_T = $sequence) =~ tr/C/T/;
      print {$ctot} join ('',$identifier,$sequence_C_to_T,$identifier2,$quality_score);
    }
    if ($gtoa){
      (my $sequence_G_to_A = $sequence) =~ tr/G/A/;
      print {$gtoa} join ('',$identifier,$sequence_G_to_A,$identifier2,$quality_score);
    }
  }

  # the return value is deliberately ignored: leaving the loop early (--upto) closes the zcat pipe
  # before zcat has finished, which makes close() report a failure that is harmless here
  close $in;

  if ($directional){
    if ($read_number == 1){
      warn "\nCreated C -> T converted version of the FastQ file $filename ($count sequences in total)\n\n";
      close $ctot or die "Failed to close filehandle $!\n";
      return ($C_to_T_infile);
    }
    warn "\nCreated G -> A converted version of the FastQ file $filename ($count sequences in total)\n\n";
    close $gtoa or die "Failed to close filehandle $!\n";
    return ($G_to_A_infile);
  }

  warn "\nCreated C -> T as well as G -> A converted versions of the FastQ file $filename ($count sequences in total)\n\n";
  close $ctot or die "Failed to close filehandle $!\n";
  close $gtoa or die "Failed to close filehandle $!\n";
  return ($C_to_T_infile,$G_to_A_infile);
}
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
4646
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
4647
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
4648 ### SPECIAL BOWTIE 1 PAIRED-END FORMAT FOR GZIPPED OUTPUT FILES
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
4649
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
### Converts both mate files of a paired-end FastQ library into the single-file, gzipped,
### tab-delimited format used here for Bowtie 1 (one line per read pair).
###
### Arguments: $file_1, $file_2 - FastQ files of read 1 and read 2 (read through zcat if the name
###                               ends in .gz); they are read in lockstep, one record from each
###
### Always writes <name>.CT_plus_GA.fastq.gz (read 1 C -> T, read 2 G -> A) into $temp_dir; unless
### $directional is set it also writes <name>.GA_plus_CT.fastq.gz (read 1 G -> A, read 2 C -> T).
### <name> is the part of $file_1 after the last '/'. $skip, $upto, $directional and $temp_dir are
### read from variables declared outside this sub.
###
### Returns ($CT_plus_GA_infile) in directional mode, otherwise ($CT_plus_GA_infile,$GA_plus_CT_infile);
### the names do not include the $temp_dir prefix.
sub biTransformFastQFiles_paired_end_bowtie1_gzip {
  my ($file_1,$file_2) = @_;

  # only the part after the last '/' is used to name the converted files
  my $filename = $file_1;
  if ($file_1 =~ /\//){
    ($filename) = $file_1 =~ m/.*\/(.*)$/;
  }

  # list-form pipe open / 3-arg open: the file name never passes through a shell, so names
  # containing spaces or shell metacharacters are read correctly
  my $open_infile = sub {
    my ($infile) = @_;
    my $fh;
    ### gzipped version of the infile
    if ($infile =~ /\.gz$/){
      open ($fh,'-|','zcat',$infile) or die "Couldn't read from file $infile: $!\n";
    }
    else{
      open ($fh,'<',$infile) or die "Couldn't read from file $infile: $!\n";
    }
    return $fh;
  };
  my $in_1 = $open_infile->($file_1);
  my $in_2 = $open_infile->($file_2);

  if ($skip){
    warn "Skipping the first $skip reads from $file_1 and $file_2\n";
    sleep (1);
  }
  if ($upto){
    warn "Processing reads up to sequence no. $upto from $file_1 and $file_2\n";
    sleep (1);
  }

  my $CT_plus_GA_infile = "$filename.CT_plus_GA.fastq.gz";
  my $GA_plus_CT_infile = "$filename.GA_plus_CT.fastq.gz";

  # NOTE(review): the output paths are handed to a shell; confirm that $temp_dir and the input
  # file names cannot contain shell metacharacters
  warn "Writing a C -> T converted version of $file_1 and a G -> A converted version of $file_2 to $temp_dir$CT_plus_GA_infile\n";
  open (my $ct_plus_ga,"| gzip -c - > ${temp_dir}${CT_plus_GA_infile}") or die "Can't write to file: $!\n";

  my $ga_plus_ct; # stays undefined in directional mode
  unless ($directional){
    # progress messages go to STDERR like every other message in this sub, keeping STDOUT clean
    warn "Writing a G -> A converted version of $file_1 and a C -> T converted version of $file_2 to $temp_dir$GA_plus_CT_infile\n";
    open ($ga_plus_ct,"| gzip -c - > ${temp_dir}${GA_plus_CT_infile}") or die "Can't write to file: $!\n";
  }

  ### for Bowtie 1 we need to write a single gzipped file with 1 line per pair of sequences in the following (tab-separated) format:
  ### <seq-ID> <sequence #1 mate> <quality #1 mate> <sequence #2 mate> <quality #2 mate>

  my $count = 0;
  while (1){
    my $identifier_1 = <$in_1>;
    my $sequence_1 = <$in_1>;
    my $identifier2_1 = <$in_1>;
    my $quality_score_1 = <$in_1>;

    my $identifier_2 = <$in_2>;
    my $sequence_2 = <$in_2>;
    my $identifier2_2 = <$in_2>;
    my $quality_score_2 = <$in_2>;

    # stop as soon as either file runs out of complete records
    last if grep { !defined $_ } ($identifier_1,$sequence_1,$identifier2_1,$quality_score_1,$identifier_2,$sequence_2,$identifier2_2,$quality_score_2);

    ++$count;

    ## small check if the sequence file appears to be a FastQ file
    if ($count == 1){
      if ($identifier_1 !~ /^\@/ or $identifier2_1 !~ /^\+/){
        die "Input file 1 doesn't seem to be in FastQ format at sequence $count: $!\n";
      }
      if ($identifier_2 !~ /^\@/ or $identifier2_2 !~ /^\+/){
        die "Input file 2 doesn't seem to be in FastQ format at sequence $count: $!\n";
      }
    }

    # decide whether the record is wanted before doing any per-record string work
    if ($skip){
      next unless ($count > $skip);
    }
    if ($upto){
      last if ($count > $upto);
    }

    $identifier_1 = fix_IDs($identifier_1); # this is to avoid problems with truncated read ID when they contain white spaces
    chomp $identifier_1;
    chomp $sequence_1;
    chomp $sequence_2;
    chomp $quality_score_1;
    chomp $quality_score_2;

    # only the ID of read 1 is written; the ID of read 2 is not used
    $identifier_1 =~ s/^\@//;
    $identifier_1 =~ s/$/\/1/; #adding an extra /1 to the end which is being removed by Bowtie otherwise (which leads to no sequences alignments whatsoever)

    $sequence_1 = uc$sequence_1; # make input file 1 case insensitive
    $sequence_2 = uc$sequence_2; # make input file 2 case insensitive

    (my $sequence_1_C_to_T = $sequence_1) =~ tr/C/T/;
    (my $sequence_2_G_to_A = $sequence_2) =~ tr/G/A/;
    print {$ct_plus_ga} "$identifier_1\t$sequence_1_C_to_T\t$quality_score_1\t$sequence_2_G_to_A\t$quality_score_2\n";

    unless ($directional){
      (my $sequence_1_G_to_A = $sequence_1) =~ tr/G/A/;
      (my $sequence_2_C_to_T = $sequence_2) =~ tr/C/T/;
      print {$ga_plus_ct} "$identifier_1\t$sequence_1_G_to_A\t$quality_score_1\t$sequence_2_C_to_T\t$quality_score_2\n";
    }
  }

  # the return values are deliberately ignored: leaving the loop before a zcat pipe is drained
  # (--upto, or one file being shorter than the other) makes close() report a failure that is
  # harmless here
  close $in_1;
  close $in_2;

  close $ct_plus_ga or die "Couldn't close filehandle\n";
  warn "\nCreated C -> T converted version of FastQ file '$file_1' and G -> A converted version of FastQ file '$file_2' ($count sequences in total)\n";

  if ($directional){
    warn "\n";
    return ($CT_plus_GA_infile);
  }

  close $ga_plus_ct or die "Couldn't close filehandle\n";
  warn "Created G -> A converted version of FastQ file '$file_1' and C -> T converted version of FastQ file '$file_2' ($count sequences in total)\n\n";
  return ($CT_plus_GA_infile,$GA_plus_CT_infile);
}
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
4779
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
4780
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
### Returns a copy of the given read ID in which every run of spaces and/or tabs has been
### collapsed into a single underscore; all other characters (including the line ending) are kept.
sub fix_IDs{
  my ($read_id) = @_;
  # tr maps both blanks to '_'; the /s flag squeezes each run of translated characters into one
  $read_id =~ tr/ \t/_/s;
  return $read_id;
}
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
4786
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
### Checks whether a single-end alignment was reported on the strand that makes
### sense for the Bowtie instance (read conversion / genome conversion
### combination) which produced it, and updates that instance's counters.
###
### Expected strand per instance name:
###   CTreadCTgenome : forward (+)   (C->T read against C->T genome)
###   CTreadGAgenome : reverse (-)   (C->T read against G->A genome)
###   GAreadCTgenome : reverse (-)   (G->A read against C->T genome)
###   GAreadGAgenome : forward (+)   (G->A read against G->A genome)
###
### Arguments:
###   $index  - position in @fhs of the instance that produced the alignment
###   $strand - strand of the alignment ('+' or '-')
###
### Returns 1 if the alignment has the expected strand ({seen} is incremented),
### and 0 otherwise. For the opposite strand {wrong_strand} is incremented; any
### other strand value is not counted at all. The return value is always
### explicit, previously an unrecognised strand value fell off the end of the
### sub without a return statement.
###
### Dies if the instance name is not one of the four names listed above.
sub ensure_sensical_alignment_orientation_single_end{
  my ($index,$strand) = @_;

  my $name = $fhs[$index]->{name};

  ### working out which strand is the sensible one for this instance
  my $expected_strand;
  if ($name eq 'CTreadCTgenome' or $name eq 'GAreadGAgenome') {
    $expected_strand = '+';
  }
  elsif ($name eq 'CTreadGAgenome' or $name eq 'GAreadCTgenome') {
    $expected_strand = '-';
  }
  else{
    die "One of the above conditions must be true\n";
  }

  ### the alignment is in the correct orientation: we count it and return 1
  if ($strand eq $expected_strand) {
    $fhs[$index]->{seen}++;
    return 1;
  }

  ### only a genuine (+) or (-) value is counted as a wrong-strand alignment
  if ($strand eq '+' or $strand eq '-') {
    $fhs[$index]->{wrong_strand}++;
  }

  ### nonsensical (or unrecognised) orientation
  return 0;
}
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
4859
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
### Decides whether a paired-end alignment was reported in the read order that
### is sensible for the Bowtie instance which produced it, and updates that
### instance's counters.
###
### The two alignment lines of a pair have to be (+) followed by (-). What
### differs between the instances is which read has to be the (+) one:
###   CTread1GAread2CTgenome, GAread1CTread2GAgenome : read 1 (+), read 2 (-)
###   GAread1CTread2CTgenome, CTread1GAread2GAgenome : read 2 (+), read 1 (-)
###
### Arguments:
###   $index              - position in @fhs of the instance that produced the pair
###   $id_1, $strand_1    - ID and strand of the first alignment line
###   $id_2, $strand_2    - ID and strand of the second alignment line
### A read is recognised as read 1 or read 2 by the trailing 1 or 2 of its ID.
###
### Returns 1 if the pair is in the sensible order ({seen} is incremented) and
### 0 if it is in the swapped order ({wrong_strand} is incremented).
###
### Dies if the instance name is unknown, or if the IDs/strands are anything
### other than a read 1/read 2 pair reported as (+) followed by (-).
sub ensure_sensical_alignment_orientation_paired_ends{
  my ($index,$id_1,$strand_1,$id_2,$strand_2) = @_;

  my $name = $fhs[$index]->{name};

  ### which read does this instance expect as the first, (+), alignment?
  my $wants_read_1_first;
  if ($name eq 'CTread1GAread2CTgenome' or $name eq 'GAread1CTread2GAgenome') {
    $wants_read_1_first = 1;
  }
  elsif ($name eq 'GAread1CTread2CTgenome' or $name eq 'CTread1GAread2GAgenome') {
    $wants_read_1_first = 0;
  }
  else{
    die "One of the above conditions must be true\n";
  }

  ### what did the aligner actually report?
  my $plus_then_minus  = ($strand_1 eq '+' and $strand_2 eq '-');
  my $got_read_1_first = ($id_1 =~ /1$/ and $id_2 =~ /2$/);
  my $got_read_2_first = ($id_1 =~ /2$/ and $id_2 =~ /1$/);

  unless ($plus_then_minus and ($got_read_1_first or $got_read_2_first)) {
    die "id1: $id_1\tid2: $id_2\tThis should be impossible\n";
  }

  ### exactly one of the two read orders holds from here on
  my $is_sensical = $got_read_1_first ? $wants_read_1_first : !$wants_read_1_first;

  if ($is_sensical) {
    $fhs[$index]->{seen}++;
    return 1;
  }

  $fhs[$index]->{wrong_strand}++;
  return 0;
}
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
4956
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
4957 #####################################################################################################################################################
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
4958
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
4959 ### Bowtie 1 (default) | PAIRED-END | FASTA
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
4960
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
### Bowtie 1 | PAIRED-END | FASTA
###
### Starts one Bowtie paired-end alignment per entry of @fhs (in directional
### mode only for the entries that have an inputfile_1) and reads the first
### alignment pair of each process.
###
### Arguments: the names of the C->T and G->A converted versions of read file 1
### and read file 2. They are only used for the progress messages, the files
### that are actually aligned are taken from each @fhs entry.
###
### For every entry of @fhs the following keys are set:
###   fh          - read handle on the pipe from the Bowtie process (only if started)
###   last_line_1 - first alignment line  (either read 1 or read 2), or undef
###   last_line_2 - second alignment line (either read 1 or read 2), or undef
###   last_seq_id - ID of read 1 of that pair with its trailing /1 removed, or undef
###
### Dies if the pipe cannot be opened, or if neither ID of the first pair ends in /1.
sub paired_end_align_fragments_to_bisulfite_genome_fastA {
  my ($C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2) = @_;

  ## Reporting the input files and the number of Bowtie instances that will be started
  if ($directional){
    warn "Input files are $C_to_T_infile_1 and $G_to_A_infile_2 (FastA)\n";
    warn "Now running 2 instances of Bowtie against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n";
  }
  else{
    warn "Input files are $C_to_T_infile_1 and $G_to_A_infile_1 and $C_to_T_infile_2 and $G_to_A_infile_2 (FastA)\n";
    warn "Now running 4 individual instances of Bowtie against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n";
  }

 INSTANCE:
  foreach my $instance (@fhs) {

    ## in directional mode the instances without an input file are not run at all
    if ($directional and not $instance->{inputfile_1}){
      @{$instance}{qw(last_seq_id last_line_1 last_line_2)} = (); # all three become undef
      next INSTANCE;
    }

    ## ensuring the alignments are only reported in a sensible manner
    my $bt_options = $bowtie_options;
    $bt_options .= ($instance->{name} eq 'CTread1GAread2CTgenome' or $instance->{name} eq 'GAread1CTread2GAgenome') ? ' --norc' : ' --nofw';

    warn "Now starting a Bowtie paired-end alignment for $instance->{name} (reading in sequences from $temp_dir$instance->{inputfile_1} and $temp_dir$instance->{inputfile_2}, with the options: $bt_options)\n";
    open ($instance->{fh},"$path_to_bowtie $bt_options $instance->{bisulfiteIndex} -1 $temp_dir$instance->{inputfile_1} -2 $temp_dir$instance->{inputfile_2} |") or die "Can't open pipe to bowtie: $!";

    my $line_1 = $instance->{fh}->getline();
    my $line_2 = $instance->{fh}->getline();

    ## without a complete first pair there is nothing to store for this instance
    unless ($line_1 and $line_2) {
      warn "Found no alignment, assigning undef to last_seq_id and last_lines\n";
      @{$instance}{qw(last_seq_id last_line_1 last_line_2)} = ();
      next INSTANCE;
    }

    chomp $line_1;
    chomp $line_2;
    my ($id_1) = split(/\t/,$line_1); # the first field of a Bowtie output line is the sequence identifier
    my ($id_2) = split(/\t/,$line_2);

    ## Bowtie always reports the alignment with the smaller chromosomal position first, which can be
    ## either read 1 or read 2. The ID carrying the /1 tag is read 1; it is stored without that tag
    my $read_1_id;
    foreach my $candidate ($id_1,$id_2){
      if ($candidate =~ s/\/1$//){
	$read_1_id = $candidate;
	last;
      }
    }
    unless (defined $read_1_id){
      die "Either the first or the second id need to be read 1! ID1 was: $id_1; ID2 was: $id_2\n";
    }

    $instance->{last_seq_id} = $read_1_id;
    $instance->{last_line_1} = $line_1; # this contains either read 1 or read 2
    $instance->{last_line_2} = $line_2; # this contains either read 1 or read 2
    warn "Found first alignment:\n$instance->{last_line_1}\n$instance->{last_line_2}\n";
  }
}
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5039
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5040 ### Bowtie 2 | PAIRED-END | FASTA
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5041
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
### Bowtie 2 | PAIRED-END | FASTA
###
### Starts one Bowtie 2 paired-end alignment per entry of @fhs (in directional
### mode only for the entries that have an inputfile_1), skips the SAM header
### of its output and reads the first alignment pair of each process.
###
### Arguments: the names of the C->T and G->A converted versions of read file 1
### and read file 2. They are only used for the progress messages, the files
### that are actually aligned are taken from each @fhs entry.
###
### For every entry of @fhs the following keys are set:
###   fh          - read handle on the pipe from the Bowtie 2 process (only if started)
###   last_line_1 - first alignment line  (either read 1 or read 2), or undef
###   last_line_2 - second alignment line (either read 1 or read 2), or undef
###   last_seq_id - ID of read 1 of that pair with its trailing /1 removed, or undef
###
### Dies if the pipe cannot be opened.
###
### The SAM header is skipped using a lexical variable, so that the global $_
### of the caller is no longer overwritten by this sub.
sub paired_end_align_fragments_to_bisulfite_genome_fastA_bowtie2 {
  my ($C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2) = @_;
  if ($directional){
    warn "Input files are $C_to_T_infile_1 and $G_to_A_infile_2 (FastA)\n";
  }
  else{
    warn "Input files are $C_to_T_infile_1 and $G_to_A_infile_1 and $C_to_T_infile_2 and $G_to_A_infile_2 (FastA)\n";
  }

  ## Now starting up to 4 instances of Bowtie 2 feeding in the converted sequence files, reading in the first
  ## alignment pair of the output and storing it in the @fhs data structure
  if ($directional){
    warn "Now running 2 instances of Bowtie 2 against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n";
  }
  else{
    warn "Now running 4 individual instances of Bowtie 2 against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n";
  }

  foreach my $fh (@fhs) {

    ## in directional mode the instances without an input file are not run at all
    if ($directional){
      unless ($fh->{inputfile_1}){
	$fh->{last_seq_id} = undef;
	$fh->{last_line_1} = undef;
	$fh->{last_line_2} = undef;
	next;
      }
    }

    my $bt2_options = $bowtie_options;
    if ($fh->{name} eq 'CTread1GAread2CTgenome' or $fh->{name} eq 'GAread1CTread2GAgenome'){
      $bt2_options .= ' --norc'; ### ensuring the alignments are only reported in a sensible manner
    }
    else {
      $bt2_options .= ' --nofw';
    }

    warn "Now starting a Bowtie 2 paired-end alignment for $fh->{name} (reading in sequences from $temp_dir$fh->{inputfile_1} and $temp_dir$fh->{inputfile_2}, with the options: $bt2_options))\n";
    open ($fh->{fh},"$path_to_bowtie $bt2_options $fh->{bisulfiteIndex} -1 $temp_dir$fh->{inputfile_1} -2 $temp_dir$fh->{inputfile_2} |") or die "Can't open pipe to bowtie: $!";

    ### Bowtie 2 outputs SAM format, so we need to skip everything until the first sequence
    my $line_1;
    while (1){
      $line_1 = $fh->{fh}->getline();
      last unless ($line_1);          # no alignment output
      last unless ($line_1 =~ /^\@/); # SAM headers start with @
    }

    my $line_2 = $fh->{fh}->getline();

    # if Bowtie 2 produces an alignment we store the first pair of lines of the output
    if ($line_1 and $line_2) {
      chomp $line_1;
      chomp $line_2;
      my $id_1 = (split(/\t/,$line_1))[0]; # this is the first element of the first output line (= the sequence identifier)
      my $id_2 = (split(/\t/,$line_2))[0]; # this is the first element of the second output line

      ### The ID which still ends in /1 belongs to read 1, and is stored as last_seq_id without that tag.
      ### (remember that Bowtie 2 clips off /1 or /2 line endings itself, so we added /1/1 or /2/2 to start with)

      if ($id_1 =~ s/\/1$//){ # removing the read 1 /1 tag if present
	$fh->{last_seq_id} = $id_1;
      }
      elsif ($id_2 =~ s/\/1$//){ # removing the read 1 /1 tag if present
	$fh->{last_seq_id} = $id_2;
      }
      else{
	# NOTE(review): this is only a warning (the Bowtie 1 version of this sub dies here), and
	# last_seq_id is left at whatever value it had before - confirm that this is intended
	warn "Either the first or the second id need to be read 1! ID1 was: $id_1; ID2 was: $id_2\n";
      }

      $fh->{last_line_1} = $line_1; # this contains either read 1 or read 2
      $fh->{last_line_2} = $line_2; # this contains either read 1 or read 2
      warn "Found first alignment:\n$fh->{last_line_1}\n$fh->{last_line_2}\n";
    }
    # otherwise we just initialise last_seq_id and last_lines as undefined
    else {
      warn "Found no alignment, assigning undef to last_seq_id and last_lines\n";
      $fh->{last_seq_id} = undef;
      $fh->{last_line_1} = undef;
      $fh->{last_line_2} = undef;
    }
  }
}
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5129
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5130 ### Bowtie 1 (default) | PAIRED-END | FASTQ
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5131
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
# Bowtie 1 | PAIRED-END | FASTQ
#
# Starts one Bowtie process per wanted entry of the file-level @fhs list and
# primes that entry with the first two lines (one read pair) of Bowtie output.
#
# Arguments: the four converted sequence file names. They are only used for
# the log messages (and only the two *_1 names at that); the files that are
# actually aligned come from $fh->{inputfile_1} / $fh->{inputfile_2}.
#
# Reads the file-level settings $directional, $pbat, $gzip, $genome_folder,
# $bowtie_options, $temp_dir and $path_to_bowtie.
#
# For every entry of @fhs this sets:
#   {fh}          - read pipe from the Bowtie process (skipped entries excepted)
#   {last_seq_id} - ID of the first pair with the trailing /1 removed, or undef
#   {last_line_1} - first output line (chomped), or undef
#   {last_line_2} - second output line (chomped), or undef
#
# Dies if the pipe cannot be opened, or if neither of the first two IDs
# carries the read 1 tag (/1).
sub paired_end_align_fragments_to_bisulfite_genome_fastQ {
    my ($C_to_T_infile_1, $G_to_A_infile_1, $C_to_T_infile_2, $G_to_A_infile_2) = @_;

    if ($directional) {
        warn "Input file is $C_to_T_infile_1 (FastQ)\n";
    }
    elsif ($pbat) {
        warn "Input file is $G_to_A_infile_1 (FastQ; PBAT-Seq)\n";
    }
    else {
        warn "Input files are $C_to_T_infile_1 and $G_to_A_infile_1 (FastQ)\n";
    }

    # In directional and PBAT mode only the entries that have an inputfile_1 are run
    my $two_instances_only = ($directional or $pbat);

    if ($two_instances_only) {
        warn "Now running 2 instances of Bowtie against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n";
    }
    else {
        warn "Now running 4 individual instances of Bowtie against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n";
    }

    foreach my $fh (@fhs) {

        # skipping unwanted filehandles
        if ($two_instances_only and not $fh->{inputfile_1}) {
            @{$fh}{qw(last_seq_id last_line_1 last_line_2)} = (undef, undef, undef);
            next;
        }

        ### ensuring the alignments are only reported in a sensible manner
        my $forward_only = ($fh->{name} eq 'CTread1GAread2CTgenome' or $fh->{name} eq 'GAread1CTread2GAgenome');
        my $bt_options   = $bowtie_options . ($forward_only ? ' --norc' : ' --nofw');

        my $command;
        if ($gzip) {
            warn "Now starting a Bowtie paired-end alignment for $fh->{name} (reading in sequences from ${temp_dir}$fh->{inputfile_1}, with the options: $bt_options)\n";
            # Only inputfile_1 is decompressed and streamed into Bowtie via '--12 -'.
            # NOTE(review): presumably that file holds both mates in Bowtie's --12 format - confirm against the code that writes it.
            $command = "zcat ${temp_dir}$fh->{inputfile_1} | $path_to_bowtie $bt_options $fh->{bisulfiteIndex} --12 -";
        }
        else {
            warn "Now starting a Bowtie paired-end alignment for $fh->{name} (reading in sequences from ${temp_dir}$fh->{inputfile_1} and ${temp_dir}$fh->{inputfile_2}, with the options: $bt_options))\n";
            # NOTE(review): the reason for this 5 second pause is not evident from this block; kept as-is.
            sleep(5);
            $command = "$path_to_bowtie $bt_options $fh->{bisulfiteIndex} -1 $temp_dir$fh->{inputfile_1} -2 $temp_dir$fh->{inputfile_2}";
        }

        # 3-arg open: the mode is fixed to "read from command" and can no longer
        # be influenced by the first/last characters of the interpolated string.
        open($fh->{fh}, '-|', $command) or die "Can't open pipe to bowtie: $!";

        my $line_1 = $fh->{fh}->getline();
        my $line_2 = $fh->{fh}->getline();

        # without a complete first pair we just initialise last_seq_id and last_lines as undefined
        unless ($line_1 and $line_2) {
            warn "Found no alignment, assigning undef to last_seq_id and last_lines\n";
            @{$fh}{qw(last_seq_id last_line_1 last_line_2)} = (undef, undef, undef);
            next;
        }

        chomp $line_1;
        chomp $line_2;

        ### Bowtie always reports the alignment with the smaller chromosomal position first. This can be either sequence 1 or sequence 2.
        ### We will thus identify which sequence was read 1 and store this ID as last_seq_id
        my $id_1 = (split(/\t/, $line_1))[0];    # sequence identifier of the first output line
        my $id_2 = (split(/\t/, $line_2))[0];    # sequence identifier of the second output line

        if ($id_1 =~ s/\/1$//) {                 # removing the read 1 tag if present
            $fh->{last_seq_id} = $id_1;
        }
        elsif ($id_2 =~ s/\/1$//) {              # removing the read 1 tag if present
            $fh->{last_seq_id} = $id_2;
        }
        else {
            die "Either the first or the second id need to be read 1! ID1 was: $id_1; ID2 was: $id_2\n";
        }

        $fh->{last_line_1} = $line_1;            # this contains read 1 or read 2
        $fh->{last_line_2} = $line_2;            # this contains read 1 or read 2
        warn "Found first alignment:\n$fh->{last_line_1}\n$fh->{last_line_2}\n";
    }
}
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5220
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5221 ### Bowtie 2 | PAIRED-END | FASTQ
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5222
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
# Bowtie 2 | PAIRED-END | FASTQ
#
# Starts one Bowtie 2 process per wanted entry of the file-level @fhs list,
# skips the SAM header and primes the entry with the first two alignment
# lines (one read pair).
#
# Arguments: the four converted sequence file names. They are only used for
# the log messages; the files that are actually aligned come from
# $fh->{inputfile_1} and $fh->{inputfile_2}.
#
# Reads the file-level settings $directional, $genome_folder, $bowtie_options,
# $temp_dir and $path_to_bowtie.
#
# For every entry of @fhs this sets:
#   {fh}          - read pipe from the Bowtie 2 process (skipped entries excepted)
#   {last_seq_id} - ID of the first pair with the trailing /1 removed, or undef
#   {last_line_1} - first non-header output line (chomped), or undef
#   {last_line_2} - the line following it (chomped), or undef
#
# Dies if the pipe cannot be opened, or if neither of the first two IDs
# carries the read 1 tag (/1).
#
# The global $_ is deliberately left untouched: the header-skipping loop uses
# a lexical, so a caller iterating with an aliased $_ is not overwritten.
sub paired_end_align_fragments_to_bisulfite_genome_fastQ_bowtie2 {
    my ($C_to_T_infile_1, $G_to_A_infile_1, $C_to_T_infile_2, $G_to_A_infile_2) = @_;

    if ($directional) {
        warn "Input files are $C_to_T_infile_1 and $G_to_A_infile_2 (FastQ)\n";
        warn "Now running 2 instances of Bowtie 2 against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n";
    }
    else {
        warn "Input files are $C_to_T_infile_1 and $G_to_A_infile_1 and $C_to_T_infile_2 and $G_to_A_infile_2 (FastQ)\n";
        warn "Now running 4 individual instances of Bowtie 2 against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n";
    }

    foreach my $fh (@fhs) {

        # in directional mode, entries without an inputfile_1 are not run
        if ($directional and not $fh->{inputfile_1}) {
            @{$fh}{qw(last_seq_id last_line_1 last_line_2)} = (undef, undef, undef);
            next;
        }

        ### ensuring the alignments are only reported in a sensible manner
        my $forward_only = ($fh->{name} eq 'CTread1GAread2CTgenome' or $fh->{name} eq 'GAread1CTread2GAgenome');
        my $bt2_options  = $bowtie_options . ($forward_only ? ' --norc' : ' --nofw');

        warn "Now starting a Bowtie 2 paired-end alignment for $fh->{name} (reading in sequences from $temp_dir$fh->{inputfile_1} and $temp_dir$fh->{inputfile_2}, with the options: $bt2_options))\n";

        # 3-arg open: the mode is fixed to "read from command" and can no longer
        # be influenced by the first/last characters of the interpolated string.
        my $command = "$path_to_bowtie $bt2_options $fh->{bisulfiteIndex} -1 $temp_dir$fh->{inputfile_1} -2 $temp_dir$fh->{inputfile_2}";
        open($fh->{fh}, '-|', $command) or die "Can't open pipe to bowtie: $!";

        ### Bowtie 2 outputs SAM format, so we need to skip everything until the first sequence.
        ### The loop ends on the first line that is not a header (SAM headers start with @),
        ### or with a false value if there is no alignment output at all.
        my $line_1;
        while ($line_1 = $fh->{fh}->getline()) {
            last unless $line_1 =~ /^\@/;
        }
        my $line_2 = $fh->{fh}->getline();

        # without a complete first pair we just initialise last_seq_id and last_lines as undefined
        unless ($line_1 and $line_2) {
            warn "Found no alignment, assigning undef to last_seq_id and last_lines\n";
            @{$fh}{qw(last_seq_id last_line_1 last_line_2)} = (undef, undef, undef);
            next;
        }

        chomp $line_1;
        chomp $line_2;

        ### Bowtie always reports the alignment with the smaller chromosomal position first. This can be either sequence 1 or sequence 2.
        ### We will thus identify which sequence was read 1 and store this ID as last_seq_id
        my $id_1 = (split(/\t/, $line_1))[0];    # sequence identifier of the first output line
        my $id_2 = (split(/\t/, $line_2))[0];    # sequence identifier of the second output line

        # removing the read 1 tag if present (remember that Bowtie2 clips off /1 or /2
        # line endings itself, so we added /1/1 or /2/2 to start with)
        if ($id_1 =~ s/\/1$//) {
            $fh->{last_seq_id} = $id_1;
        }
        elsif ($id_2 =~ s/\/1$//) {
            $fh->{last_seq_id} = $id_2;
        }
        else {
            die "Either the first or the second id need to be read 1! ID1 was: $id_1; ID2 was: $id_2\n";
        }

        $fh->{last_line_1} = $line_1;            # this contains read 1 or read 2
        $fh->{last_line_2} = $line_2;            # this contains read 1 or read 2
        warn "Found first alignment:\n$fh->{last_line_1}\n$fh->{last_line_2}\n";
    }
}
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5311
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5312 #####################################################################################################################################################
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5313
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5314 ### Bowtie 1 (default) | SINGLE-END | FASTA
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
# Bowtie 1 | SINGLE-END | FASTA
#
# Starts one Bowtie process for every entry of the file-level @fhs list and
# primes the entry with the first line of Bowtie output.
#
# Arguments: the two converted sequence file names. They are only used for
# the log messages; the file that is actually aligned is $fh->{inputfile}.
#
# Reads the file-level settings $directional, $gzip, $genome_folder,
# $bowtie_options, $temp_dir and $path_to_bowtie.
#
# For every entry of @fhs this sets:
#   {fh}          - read pipe from the Bowtie process
#   {last_seq_id} - first tab-separated field of the first output line, or undef
#   {last_line}   - first output line (chomped), or undef
#
# Dies if the pipe cannot be opened.
#
# The global $_ is deliberately left untouched: the first line is held in a
# lexical, so a caller iterating with an aliased $_ is not overwritten.
sub single_end_align_fragments_to_bisulfite_genome_fastA {
    my ($C_to_T_infile, $G_to_A_infile) = @_;

    if ($directional) {
        warn "Input file is $C_to_T_infile (FastA)\n";
        warn "Now running 2 instances of Bowtie against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n";
    }
    else {
        warn "Input files are $C_to_T_infile and $G_to_A_infile (FastA)\n";
        warn "Now running 4 individual instances of Bowtie against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n";
    }

    foreach my $fh (@fhs) {

        ### ensuring the alignments are only reported in a sensible manner
        my $forward_only = ($fh->{name} eq 'CTreadCTgenome' or $fh->{name} eq 'GAreadGAgenome');
        my $bt_options   = $bowtie_options . ($forward_only ? ' --norc' : ' --nofw');

        warn "Now starting the Bowtie aligner for $fh->{name} (reading in sequences from $temp_dir$fh->{inputfile} with options: $bt_options)\n";

        # gzipped input is decompressed with zcat and streamed to Bowtie on STDIN ('-');
        # uncompressed input is handed over as a file name
        my $command = $gzip
            ? "zcat $temp_dir$fh->{inputfile} | $path_to_bowtie $bt_options $fh->{bisulfiteIndex} -"
            : "$path_to_bowtie $bt_options $fh->{bisulfiteIndex} $temp_dir$fh->{inputfile}";

        # 3-arg open: the mode is fixed to "read from command" and can no longer
        # be influenced by the first/last characters of the interpolated string.
        open($fh->{fh}, '-|', $command) or die "Can't open pipe to bowtie: $!";

        # if Bowtie produces an alignment we store the first line of the output
        my $first_line = $fh->{fh}->getline();
        if ($first_line) {
            chomp $first_line;
            my $id = (split(/\t/, $first_line))[0];    # first element of the bowtie output (= the sequence identifier)
            $fh->{last_seq_id} = $id;
            $fh->{last_line}   = $first_line;
            warn "Found first alignment:\t$fh->{last_line}\n";
        }
        # otherwise we just initialise last_seq_id and last_line as undefined
        else {
            warn "Found no alignment, assigning undef to last_seq_id and last_line\n";
            $fh->{last_seq_id} = undef;
            $fh->{last_line}   = undef;
        }
    }
}
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5368
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5369 ### Bowtie 2 | SINGLE-END | FASTA
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
# Bowtie 2 | SINGLE-END | FASTA
#
# Starts one Bowtie 2 process for every entry of the file-level @fhs list,
# skips the SAM header and primes the entry with the first result line.
#
# Arguments: the two converted sequence file names. They are only used for
# the log messages; the file that is actually aligned is $fh->{inputfile}.
#
# Reads the file-level settings $directional, $genome_folder, $bowtie_options,
# $temp_dir and $path_to_bowtie.
#
# For every entry of @fhs this sets:
#   {fh}          - read pipe from the Bowtie 2 process
#   {last_seq_id} - first tab-separated field of the first non-header line, or undef
#   {last_line}   - first non-header output line (chomped), or undef
#
# Dies if the pipe cannot be opened.
#
# The global $_ is deliberately left untouched: the header-skipping loop uses
# a lexical, so a caller iterating with an aliased $_ is not overwritten.
sub single_end_align_fragments_to_bisulfite_genome_fastA_bowtie2 {
    my ($C_to_T_infile, $G_to_A_infile) = @_;

    if ($directional) {
        warn "Input file is $C_to_T_infile (FastA)\n";
        warn "Now running 2 instances of Bowtie 2 against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n";
    }
    else {
        warn "Input files are $C_to_T_infile and $G_to_A_infile (FastA)\n";
        warn "Now running 4 individual instances of Bowtie 2 against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n";
    }

    foreach my $fh (@fhs) {

        ### ensuring the alignments are only reported in a sensible manner
        my $forward_only = ($fh->{name} eq 'CTreadCTgenome' or $fh->{name} eq 'GAreadGAgenome');
        my $bt2_options  = $bowtie_options . ($forward_only ? ' --norc' : ' --nofw');

        warn "Now starting the Bowtie 2 aligner for $fh->{name} (reading in sequences from $temp_dir$fh->{inputfile} with options: $bt2_options)\n";

        # 3-arg open: the mode is fixed to "read from command" and can no longer
        # be influenced by the first/last characters of the interpolated string.
        my $command = "$path_to_bowtie $bt2_options $fh->{bisulfiteIndex} -U $temp_dir$fh->{inputfile}";
        open($fh->{fh}, '-|', $command) or die "Can't open pipe to bowtie: $!";

        ### Bowtie 2 outputs SAM format, so we need to skip everything until the first sequence.
        ### The loop ends on the first line that is not a header (SAM headers start with @),
        ### or with a false value if there is no output at all.
        my $first_line;
        while ($first_line = $fh->{fh}->getline()) {
            last unless $first_line =~ /^\@/;
        }

        # Bowtie 2 outputs a result line even for sequences without any alignments. We thus store the first line of the output
        if ($first_line) {
            chomp $first_line;
            my $id = (split(/\t/, $first_line))[0];    # first element of the Bowtie output (= the sequence identifier)
            $fh->{last_seq_id} = $id;
            $fh->{last_line}   = $first_line;
            warn "Found first alignment:\t$fh->{last_line}\n";
        }
        # otherwise we just initialise last_seq_id and last_line as undefined. This should only happen at the end of a file for Bowtie 2 output
        else {
            warn "Found no alignment, assigning undef to last_seq_id and last_line\n";
            $fh->{last_seq_id} = undef;
            $fh->{last_line}   = undef;
        }
    }
}
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5428
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5429
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5430 ### Bowtie 1 (default) | SINGLE-END | FASTQ
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
## Starts one Bowtie 1 process for every entry of @fhs on the bisulfite converted single-end
## FastQ input and primes each entry with the first line of that process' output.
##
## Arguments:
##   $C_to_T_infile, $G_to_A_infile - names of the converted read files. They are only used for
##                                    the progress messages; the file an instance actually reads
##                                    is $temp_dir followed by $fh->{inputfile}, which has to be
##                                    set on each @fhs entry before this sub is called.
## Side effects, for every entry of @fhs:
##   {fh}          - read handle on the output pipe of the Bowtie process
##   {last_line}   - first output line (chomped), or undef if Bowtie printed nothing
##   {last_seq_id} - first tab-separated field of that line, or undef
## Dies if the pipe to Bowtie cannot be opened.
sub single_end_align_fragments_to_bisulfite_genome_fastQ {
  my ($C_to_T_infile,$G_to_A_infile) = @_;

  if ($directional){
    warn "Input file is $C_to_T_infile (FastQ)\n";
  }
  elsif ($pbat){
    warn "Input file is $G_to_A_infile (FastQ)\n";
  }
  else{
    warn "Input files are $C_to_T_infile and $G_to_A_infile (FastQ)\n";
  }

  ## Now starting up to 4 instances of Bowtie feeding in the converted sequence files, reading in the
  ## first line of the Bowtie output and storing it in the @fhs data structure
  if ($directional or $pbat){
    warn "Now running 2 instances of Bowtie against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n";
  }
  else{
    warn "Now running 4 individual instances of Bowtie against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n";
  }

  foreach my $fh (@fhs) {

    ### Per-instance options: restrict Bowtie to the one read orientation that is sensible for this
    ### read conversion/genome conversion combination
    my $bt_options = $bowtie_options;
    if ($fh->{name} eq 'CTreadCTgenome' or $fh->{name} eq 'GAreadGAgenome'){
      $bt_options .= ' --norc';
    }
    else {
      $bt_options .= ' --nofw';
    }

    warn "Now starting the Bowtie aligner for $fh->{name} (reading in sequences from $temp_dir$fh->{inputfile} with options: $bt_options)\n";
    sleep (5);

    ### The command has to be built from $bt_options (and not from the bare $bowtie_options),
    ### otherwise the --norc/--nofw restriction announced above never reaches Bowtie
    my $input_file = "$temp_dir$fh->{inputfile}";
    my $command;
    if ($gzip){
      # compressed input is decompressed on the fly and handed to Bowtie on STDIN ('-')
      $command = "zcat $input_file | $path_to_bowtie $bt_options $fh->{bisulfiteIndex} -";
    }
    else{
      # command for uncompressed data
      $command = "$path_to_bowtie $bt_options $fh->{bisulfiteIndex} $input_file";
    }
    open ($fh->{fh},'-|',$command) or die "Can't open pipe to bowtie: $!";

    # A lexical is used for the line so that the caller's $_ is left untouched
    my $first_line = $fh->{fh}->getline();

    # if Bowtie produces an alignment we store the first line of the output
    if (defined $first_line) {
      chomp $first_line;
      my $id = (split(/\t/,$first_line))[0]; # this is the first element of the Bowtie output (= the sequence identifier)
      $fh->{last_seq_id} = $id;
      $fh->{last_line} = $first_line;
      warn "Found first alignment:\t$fh->{last_line}\n";
    }
    # otherwise we just initialise last_seq_id and last_line as undefined
    else {
      warn "Found no alignment, assigning undef to last_seq_id and last_line\n";
      $fh->{last_seq_id} = undef;
      $fh->{last_line} = undef;
    }
  }
}
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5489
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5490 ### Bowtie 2 | SINGLE-END | FASTQ
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
## Starts one Bowtie 2 process for every entry of @fhs on the bisulfite converted single-end
## FastQ input, skips the SAM header of each output stream and primes the entry with the first
## record line.
##
## Arguments:
##   $C_to_T_infile, $G_to_A_infile - names of the converted read files (used for the progress
##                                    messages only; each instance reads $temp_dir$fh->{inputfile})
## Side effects, for every entry of @fhs:
##   {fh}          - read handle on the output pipe of the Bowtie 2 process
##   {last_line}   - first non-header output line (chomped), or undef if there is none
##   {last_seq_id} - first tab-separated field of that line, or undef
## The global $_ is used as the line buffer and is left holding the last line read.
## Dies if the pipe to Bowtie 2 cannot be opened.
sub single_end_align_fragments_to_bisulfite_genome_fastQ_bowtie2 {

  my ($C_to_T_infile,$G_to_A_infile) = @_;

  warn $directional
    ? "Input file is $C_to_T_infile (FastQ)\n\n"
    : "Input files are $C_to_T_infile and $G_to_A_infile (FastQ)\n\n";

  ## Up to 4 instances of Bowtie 2 are started below, each one fed with a converted sequence file
  my $instances = $directional ? '2 instances' : '4 individual instances';
  warn "Now running $instances of Bowtie 2 against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n";

  foreach my $fh (@fhs) {

    ### reads converted the same way as the genome may only align forward, all others only reverse
    my $same_conversion = ($fh->{name} eq 'CTreadCTgenome' || $fh->{name} eq 'GAreadGAgenome');
    my $bt2_options = $bowtie_options;
    $bt2_options .= $same_conversion ? ' --norc' : ' --nofw';

    warn "Now starting the Bowtie 2 aligner for $fh->{name} (reading in sequences from $temp_dir$fh->{inputfile} with options $bt2_options)\n";
    warn "Using Bowtie 2 index: $fh->{bisulfiteIndex}\n\n";

    my $command = "$path_to_bowtie $bt2_options $fh->{bisulfiteIndex} -U $temp_dir$fh->{inputfile} |";
    open ($fh->{fh},$command) or die "Can't open pipe to bowtie: $!";

    ### Bowtie 2 writes SAM, so header lines (starting with @) are read and discarded until the
    ### first record line, or the end of the output, is reached
    do {
      $_ = $fh->{fh}->getline();
    } while ($_ and $_ =~ /^\@/);

    # No record line at all. For Bowtie 2 output this should only happen at the end of a file
    unless ($_) {
      warn "Found no alignment, assigning undef to last_seq_id and last_line\n";
      $fh->{last_seq_id} = undef;
      $fh->{last_line} = undef;
      next;
    }

    # Bowtie 2 prints a result line even for sequences without any alignment, so the first
    # record line is stored in any case
    chomp;
    my ($read_id) = split(/\t/); # first column of the Bowtie 2 output (= the sequence identifier)
    $fh->{last_seq_id} = $read_id;
    $fh->{last_line} = $_;
    warn "Found first alignment:\t$fh->{last_line}\n";
  }
}
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5551
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5552 ###########################################################################################################################################
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5553
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
## Resets the global bookkeeping before a new input file is processed:
##   %counting - every counter is set back to 0
##   @fhs      - rebuilt with one fresh hash per aligner instance that will be run
##
## Argument:
##   $filename - name of the input; a comma in it marks paired-end files. It is only looked at
##               when $directional or $pbat is set.
##
## Instances selected for @fhs:
##   $directional, single-end : CTreadCTgenome, CTreadGAgenome
##   $pbat, single-end        : GAreadCTgenome, GAreadGAgenome
##   everything else          : all four instances
sub reset_counters_and_fhs{
  my $filename = shift;

  my @counter_names = (
    'total_meCHH_count',
    'total_meCHG_count',
    'total_meCpG_count',
    'total_unmethylated_CHH_count',
    'total_unmethylated_CHG_count',
    'total_unmethylated_CpG_count',
    'sequences_count',
    'no_single_alignment_found',
    'unsuitable_sequence_count',
    'genomic_sequence_could_not_be_extracted_count',
    'unique_best_alignment_count',
    'low_complexity_alignments_overruled_count',
    'CT_CT_count',               # CT read/CT genome, original top strand
    'CT_GA_count',               # CT read/GA genome, original bottom strand
    'GA_CT_count',               # GA read/CT genome, complementary to original top strand
    'GA_GA_count',               # GA read/GA genome, complementary to original bottom strand
    'CT_GA_CT_count',            # CT read1/GA read2/CT genome, original top strand
    'GA_CT_GA_count',            # GA read1/CT read2/GA genome, complementary to original bottom strand
    'GA_CT_CT_count',            # GA read1/CT read2/CT genome, complementary to original top strand
    'CT_GA_GA_count',            # CT read1/GA read2/GA genome, original bottom strand
    'alignments_rejected_count', # only relevant if --directional was specified
  );
  %counting = map { ($_ => 0) } @counter_names;

  ### strand identity and bisulfite index of each of the four possible aligner instances
  my %instance = (
    CTreadCTgenome => { strand_identity => 'con ori forward',       bisulfiteIndex => $CT_index_basename },
    CTreadGAgenome => { strand_identity => 'con ori reverse',       bisulfiteIndex => $GA_index_basename },
    GAreadCTgenome => { strand_identity => 'compl ori con forward', bisulfiteIndex => $CT_index_basename },
    GAreadGAgenome => { strand_identity => 'compl ori con reverse', bisulfiteIndex => $GA_index_basename },
  );

  ### all four instances are run unless a single-end file is aligned in directional or PBAT mode
  my @selected = ('CTreadCTgenome','CTreadGAgenome','GAreadCTgenome','GAreadGAgenome');
  if ($directional){
    unless ($filename =~ ','){ # single-end files
      @selected = ('CTreadCTgenome','CTreadGAgenome');
    }
  }
  elsif ($pbat){
    unless ($filename =~ ','){ # single-end files
      @selected = ('GAreadCTgenome','GAreadGAgenome');
    }
  }

  @fhs = map {
    +{
      name            => $_,
      strand_identity => $instance{$_}{strand_identity},
      bisulfiteIndex  => $instance{$_}{bisulfiteIndex},
      seen            => 0,
      wrong_strand    => 0,
    }
  } @selected;
}
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5701
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5702
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5703 sub process_command_line{
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5704 my @bowtie_options;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5705 my $help;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5706 my $mates1;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5707 my $mates2;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5708 my $path_to_bowtie;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5709 my $fastq;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5710 my $fasta;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5711 my $skip;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5712 my $qupto;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5713 my $phred64;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5714 my $phred33;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5715 my $solexa;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5716 my $mismatches;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5717 my $seed_length;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5718 my $best;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5719 my $sequence_format;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5720 my $version;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5721 my $quiet;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5722 my $chunk;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5723 my $non_directional;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5724 my $ceiling;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5725 my $maxins;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5726 my $minins;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5727 my $unmapped;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5728 my $multi_map;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5729 my $output_dir;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5730 my $bowtie2;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5731 my $vanilla;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5732 my $sam_no_hd;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5733 my $seed_extension_fails;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5734 my $reseed_repetitive_seeds;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5735 my $most_valid_alignments;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5736 my $score_min;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5737 my $parallel;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5738 my $temp_dir;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5739 my $rdg;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5740 my $rfg;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5741 my $non_bs_mm;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5742 my $samtools_path;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5743 my $bam;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5744 my $gzip;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5745 my $pbat;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5746
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5747 my $command_line = GetOptions ('help|man' => \$help,
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5748 '1=s' => \$mates1,
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5749 '2=s' => \$mates2,
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5750 'path_to_bowtie=s' => \$path_to_bowtie,
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5751 'f|fasta' => \$fasta,
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5752 'q|fastq' => \$fastq,
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5753 's|skip=i' => \$skip,
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5754 'u|upto=i' => \$qupto,
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5755 'phred33-quals' => \$phred33,
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5756 'phred64-quals|solexa1' => \$phred64,
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5757 'solexa-quals' => \$solexa,
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5758 'n|seedmms=i' => \$mismatches,
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5759 'l|seedlen=i' => \$seed_length,
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5760 'no_best' => \$best,
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5761 'version' => \$version,
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5762 'quiet' => \$quiet,
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5763 'chunkmbs=i' => \$chunk,
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5764 'non_directional' => \$non_directional,
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5765 'I|minins=i' => \$minins,
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5766 'X|maxins=i' => \$maxins,
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5767 'e|maqerr=i' => \$ceiling,
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5768 'un|unmapped' => \$unmapped,
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5769 'ambiguous' => \$multi_map,
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5770 'o|output_dir=s' => \$output_dir,
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5771 'bowtie2' => \$bowtie2,
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5772 'vanilla' => \$vanilla,
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5773 'sam-no-hd' => \$sam_no_hd,
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5774 'D=i' => \$seed_extension_fails,
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5775 'R=i' => \$reseed_repetitive_seeds,
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5776 'score_min=s' => \$score_min,
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5777 'most_valid_alignments=i' => \$most_valid_alignments,
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5778 'p=i' => \$parallel,
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5779 'temp_dir=s' => \$temp_dir,
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5780 'rdg=s' => \$rdg,
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5781 'rfg=s' => \$rfg,
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5782 'non_bs_mm' => \$non_bs_mm,
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5783 'samtools_path=s' => \$samtools_path,
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5784 'bam' => \$bam,
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5785 'gzip' => \$gzip,
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5786 'pbat' => \$pbat,
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5787 );
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5788
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5789
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5790 ### EXIT ON ERROR if there were errors with any of the supplied options
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5791 unless ($command_line){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5792 die "Please respecify command line options\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5793 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5794 ### HELPFILE
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5795 if ($help){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5796 print_helpfile();
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5797 exit;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5798 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5799 if ($version){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5800 print << "VERSION";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5801
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5802
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5803 Bismark - Bisulfite Mapper and Methylation Caller.
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5804
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5805 Bismark Version: $bismark_version
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5806 Copyright 2010-13 Felix Krueger, Babraham Bioinformatics
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5807 www.bioinformatics.babraham.ac.uk/projects/
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5808
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5809
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5810 VERSION
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5811 exit;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5812 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5813
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5814
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5815 ##########################
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5816 ### PROCESSING OPTIONS ###
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5817 ##########################
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5818
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5819 unless ($bowtie2){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5820 $bowtie2 = 0;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5821 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5822 unless ($sam_no_hd){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5823 $sam_no_hd =0;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5824 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5825
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5826 ### PATH TO BOWTIE
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5827 ### if a special path to Bowtie 1/2 was specified we will use that one, otherwise it is assumed that Bowtie 1/2 is in the PATH
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5828 if ($path_to_bowtie){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5829 unless ($path_to_bowtie =~ /\/$/){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5830 $path_to_bowtie =~ s/$/\//;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5831 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5832 if (-d $path_to_bowtie){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5833 if ($bowtie2){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5834 $path_to_bowtie = "${path_to_bowtie}bowtie2";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5835 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5836 else{
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5837 $path_to_bowtie = "${path_to_bowtie}bowtie";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5838 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5839 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5840 else{
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5841 die "The path to bowtie provided ($path_to_bowtie) is invalid (not a directory)!\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5842 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5843 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5844 else{
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5845 if ($bowtie2){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5846 $path_to_bowtie = 'bowtie2';
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5847 warn "Path to Bowtie 2 specified as: $path_to_bowtie\n"; }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5848 else{
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5849 $path_to_bowtie = 'bowtie';
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5850 warn "Path to Bowtie specified as: $path_to_bowtie\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5851 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5852 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5853
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5854 ### OUTPUT REQUESTED AS BAM FILE
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5855 if ($bam){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5856 if ($vanilla){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5857 die "Specifying BAM output is not compatible with \"--vanilla\" format. Please respecify\n\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5858 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5859
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5860 ### PATH TO SAMTOOLS
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5861 if (defined $samtools_path){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5862 # if Samtools was specified as full command
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5863 if ($samtools_path =~ /samtools$/){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5864 if (-e $samtools_path){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5865 # Samtools executable found
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5866 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5867 else{
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5868 die "Could not find an installation of Samtools at the location $samtools_path. Please respecify\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5869 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5870 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5871 else{
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5872 unless ($samtools_path =~ /\/$/){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5873 $samtools_path =~ s/$/\//;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5874 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5875 $samtools_path .= 'samtools';
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5876 if (-e $samtools_path){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5877 # Samtools executable found
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5878 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5879 else{
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5880 die "Could not find an installation of Samtools at the location $samtools_path. Please respecify\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5881 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5882 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5883
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5884 warn "Alignments will be written out in BAM format. Samtools path provided as: '$samtools_path'\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5885 $bam = 1;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5886 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5887 # Check whether Samtools is in the PATH if no path was supplied by the user
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5888 else{
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5889 if (!system "which samtools >/dev/null 2>&1"){ # STDOUT is binned, STDERR is redirected to STDOUT. Returns 0 if samtools is in the PATH
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5890 $samtools_path = `which samtools`;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5891 chomp $samtools_path;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5892 warn "Alignments will be written out in BAM format. Samtools found here: '$samtools_path'\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5893 $bam = 1;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5894 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5895 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5896
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5897 unless (defined $samtools_path){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5898 $bam = 2;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5899 warn "Did not find Samtools on the system. Alignments will be compressed with GZIP instead (.sam.gz)\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5900 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5901 sleep (1);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5902 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5903
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5904
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5905 ####################################
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5906 ### PROCESSING ARGUMENTS
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5907
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5908 ### GENOME FOLDER
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5909 my $genome_folder = shift @ARGV; # mandatory
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5910 unless ($genome_folder){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5911 warn "Genome folder was not specified!\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5912 print_helpfile();
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5913 exit;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5914 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5915
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5916 ### checking that the genome folder, all subfolders and the required bowtie index files exist
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5917 unless ($genome_folder =~/\/$/){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5918 $genome_folder =~ s/$/\//;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5919 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5920
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5921 if (chdir $genome_folder){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5922 my $absolute_genome_folder = getcwd; ## making the genome folder path absolute
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5923 unless ($absolute_genome_folder =~/\/$/){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5924 $absolute_genome_folder =~ s/$/\//;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5925 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5926 warn "Reference genome folder provided is $genome_folder\t(absolute path is '$absolute_genome_folder)'\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5927 $genome_folder = $absolute_genome_folder;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5928 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5929 else{
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5930 die "Failed to move to $genome_folder: $!\nUSAGE: bismark [options] <genome_folder> {-1 <mates1> -2 <mates2> | <singles>} [<hits>] (--help for more details)\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5931 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5932
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5933 my $CT_dir = "${genome_folder}Bisulfite_Genome/CT_conversion/";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5934 my $GA_dir = "${genome_folder}Bisulfite_Genome/GA_conversion/";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5935
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5936 if ($bowtie2){ ### Bowtie 2 (new)
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5937 ### checking the integrity of $CT_dir
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5938 chdir $CT_dir or die "Failed to move to directory $CT_dir: $!\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5939 my @CT_bowtie_index = ('BS_CT.1.bt2','BS_CT.2.bt2','BS_CT.3.bt2','BS_CT.4.bt2','BS_CT.rev.1.bt2','BS_CT.rev.2.bt2');
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5940 foreach my $file(@CT_bowtie_index){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5941 unless (-f $file){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5942 die "The Bowtie 2 index of the C->T converted genome seems to be faulty ($file). Please run the bismark_genome_preparation before running Bismark.\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5943 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5944 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5945 ### checking the integrity of $GA_dir
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5946 chdir $GA_dir or die "Failed to move to directory $GA_dir: $!\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5947 my @GA_bowtie_index = ('BS_GA.1.bt2','BS_GA.2.bt2','BS_GA.3.bt2','BS_GA.4.bt2','BS_GA.rev.1.bt2','BS_GA.rev.2.bt2');
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5948 foreach my $file(@GA_bowtie_index){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5949 unless (-f $file){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5950 die "The Bowtie 2 index of the G->A converted genome seems to be faulty ($file). Please run bismark_genome_preparation before running Bismark.\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5951 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5952 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5953 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5954
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5955 else{ ### Bowtie 1 (default)
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5956 ### checking the integrity of $CT_dir
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5957 chdir $CT_dir or die "Failed to move to directory $CT_dir: $!\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5958 my @CT_bowtie_index = ('BS_CT.1.ebwt','BS_CT.2.ebwt','BS_CT.3.ebwt','BS_CT.4.ebwt','BS_CT.rev.1.ebwt','BS_CT.rev.2.ebwt');
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5959 foreach my $file(@CT_bowtie_index){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5960 unless (-f $file){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5961 die "The Bowtie index of the C->T converted genome seems to be faulty ($file). Please run bismark_genome_preparation before running Bismark.\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5962 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5963 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5964 ### checking the integrity of $GA_dir
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5965 chdir $GA_dir or die "Failed to move to directory $GA_dir: $!\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5966 my @GA_bowtie_index = ('BS_GA.1.ebwt','BS_GA.2.ebwt','BS_GA.3.ebwt','BS_GA.4.ebwt','BS_GA.rev.1.ebwt','BS_GA.rev.2.ebwt');
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5967 foreach my $file(@GA_bowtie_index){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5968 unless (-f $file){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5969 die "The Bowtie index of the G->A converted genome seems to be faulty ($file). Please run bismark_genome_preparation before running Bismark.\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5970 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5971 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5972 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5973
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5974 my $CT_index_basename = "${CT_dir}BS_CT";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5975 my $GA_index_basename = "${GA_dir}BS_GA";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5976
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5977 ### INPUT OPTIONS
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5978
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5979 ### SEQUENCE FILE FORMAT
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5980 ### exits if both fastA and FastQ were specified
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5981 if ($fasta and $fastq){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5982 die "Only one sequence filetype can be specified (fastA or fastQ)\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5983 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5984
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5985 ### unless fastA is specified explicitly, fastQ sequence format is expected by default
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5986 if ($fasta){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5987 print "FastA format specified\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5988 $sequence_format = 'FASTA';
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5989 push @bowtie_options, '-f';
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5990 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5991 elsif ($fastq){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5992 print "FastQ format specified\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5993 $sequence_format = 'FASTQ';
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5994 push @bowtie_options, '-q';
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5995 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5996 else{
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5997 $fastq = 1;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5998 print "FastQ format assumed (by default)\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5999 $sequence_format = 'FASTQ';
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6000 push @bowtie_options, '-q';
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6001 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6002
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6003 ### SKIP
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6004 if ($skip){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6005 warn "Skipping the first $skip reads from the input file\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6006 # push @bowtie_options,"-s $skip";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6007 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6008
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6009 ### UPTO
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6010 if ($qupto){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6011 warn "Processing sequences up to read no. $qupto from the input file\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6012 if ($bowtie2){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6013 # push @bowtie_options,"--upto $qupto"; ## slightly changed for Bowtie 2
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6014 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6015 else{
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6016 # push @bowtie_options,"--qupto $qupto";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6017 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6018 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6019
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6020 ### QUALITY VALUES
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6021 if (($phred33 and $phred64) or ($phred33 and $solexa) or ($phred64 and $solexa)){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6022 die "You can only specify one type of quality value at a time! (--phred33-quals or --phred64-quals or --solexa-quals)";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6023 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6024 if ($phred33){ ## if nothing else is specified $phred33 will be used as default by both Bowtie 1 and 2.
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6025 # Phred quality values work only when -q is specified
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6026 unless ($fastq){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6027 die "Phred quality values works only when -q (FASTQ) is specified\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6028 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6029 if ($bowtie2){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6030 push @bowtie_options,"--phred33";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6031 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6032 else{
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6033 push @bowtie_options,"--phred33-quals";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6034 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6035 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6036 if ($phred64){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6037 # Phred quality values work only when -q is specified
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6038 unless ($fastq){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6039 die "Phred quality values work only when -q (FASTQ) is specified\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6040 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6041 if ($bowtie2){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6042 push @bowtie_options,"--phred64";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6043 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6044 else{
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6045 push @bowtie_options,"--phred64-quals";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6046 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6047 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6048 else{
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6049 $phred64 = 0;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6050 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6051
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6052 if ($solexa){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6053 if ($bowtie2){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6054 die "The option '--solexa-quals' is not compatible with Bowtie 2. Please respecify!\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6055 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6056 # Solexa to Phred value conversion works only when -q is specified
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6057 unless ($fastq){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6058 die "Conversion from Solexa to Phred quality values works only when -q (FASTQ) is specified\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6059 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6060 push @bowtie_options,"--solexa-quals";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6061 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6062 else{
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6063 $solexa = 0;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6064 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6065
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6066 ### ALIGNMENT OPTIONS
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6067
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6068 ### MISMATCHES
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6069 if (defined $mismatches){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6070 if ($bowtie2){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6071 if ($mismatches == 0 or $mismatches == 1){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6072 push @bowtie_options,"-N $mismatches";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6073 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6074 else{
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6075 die "Please set the number of multiseed mismatches for Bowtie 2 with '-N <int>' (where <int> can be 0 or 1)\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6076 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6077 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6078 else{
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6079 if ($mismatches >= 0 and $mismatches <= 3){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6080 push @bowtie_options,"-n $mismatches";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6081 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6082 else{
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6083 die "Please set the number of seed mismatches for Bowtie 1 with '-n <int>' (where <int> can be 0,1,2 or 3)\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6084 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6085 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6086 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6087 else{
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6088 unless ($bowtie2){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6089 push @bowtie_options,"-n 1"; # setting -n to 1 by default (for use with Bowtie only) because it is much quicker than the default mode of -n 2
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6090 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6091 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6092
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6093 ### SEED LENGTH
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6094 if (defined $seed_length){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6095 if ($bowtie2){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6096 push @bowtie_options,"-L $seed_length";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6097 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6098 else{
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6099 push @bowtie_options,"-l $seed_length";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6100 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6101 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6102
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6103 ### MISMATCH CEILING
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6104 if (defined $ceiling){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6105 die "The option '-e' is not compatible with Bowtie 2. Please respecify options\n" if ($bowtie2);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6106 push @bowtie_options,"-e $ceiling";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6107 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6108
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6109
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6110 ### BOWTIE 2 EFFORT OPTIONS
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6111
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6112 ### CONSECUTIVE SEED EXTENSION FAILS
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6113 if (defined $seed_extension_fails){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6114 die "The option '-D <int>' is only available when using Bowtie 2\n\n" unless ($bowtie2);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6115 push @bowtie_options,"-D $seed_extension_fails";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6116 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6117
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6118 ### RE-SEEDING REPETITIVE SEEDS
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6119 if (defined $reseed_repetitive_seeds){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6120 die "The option '-R <int>' is only available when using Bowtie 2\n\n" unless ($bowtie2);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6121 push @bowtie_options,"-R $reseed_repetitive_seeds";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6122 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6123
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6124
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6125 ### BOWTIE 2 SCORING OPTIONS
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6126 if ($score_min){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6127 die "The option '--score_min <func>' is only available when using Bowtie 2\n\n" unless ($bowtie2);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6128 unless ($score_min =~ /^L,.+,.+$/){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6129 die "The option '--score_min <func>' needs to be in the format <L,value,value> . Please consult \"setting up functions\" in the Bowtie 2 manual for further information\n\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6130 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6131 push @bowtie_options,"--score-min $score_min";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6132 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6133 else{
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6134 if ($bowtie2){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6135 push @bowtie_options,"--score-min L,0,-0.2"; # default setting, more stringent than normal Bowtie2
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6136 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6137 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6138
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6139 ### BOWTIE 2 READ GAP OPTIONS
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6140 my ($insertion_open,$insertion_extend,$deletion_open,$deletion_extend);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6141
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6142 if ($rdg){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6143 die "The option '--rdg <int1>,<int2>' is only available when using Bowtie 2\n\n" unless ($bowtie2);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6144 if ($rdg =~ /^(\d+),(\d+)$/){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6145 $deletion_open = $1;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6146 $deletion_extend = $2;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6147 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6148 else{
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6149 die "The option '--rdg <int1>,<int2>' needs to be in the format <integer,integer> . Please consult \"setting up functions\" in the Bowtie 2 manual for further information\n\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6150 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6151 push @bowtie_options,"--rdg $rdg";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6152 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6153 else{
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6154 $deletion_open = 5;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6155 $deletion_extend = 3;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6156 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6157
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6158 ### BOWTIE 2 REFERENCE GAP OPTIONS
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6159 if ($rfg){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6160 die "The option '--rfg <int1>,<int2>' is only available when using Bowtie 2\n\n" unless ($bowtie2);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6161 if ($rfg =~ /^(\d+),(\d+)$/){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6162 $insertion_open = $1;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6163 $insertion_extend = $2;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6164 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6165 else{
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6166 die "The option '--rfg <int1>,<int2>' needs to be in the format <integer,integer> . Please consult \"setting up functions\" in the Bowtie 2 manual for further information\n\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6167 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6168 push @bowtie_options,"--rfg $rfg";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6169 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6170 else{
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6171 $insertion_open = 5;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6172 $insertion_extend = 3;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6173 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6174
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6175
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6176 ### BOWTIE 2 PARALLELIZATION OPTIONS
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6177 if (defined $parallel){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6178 die "The parallelization switch '-p' only works for Bowtie 2. Please respecify!" unless ($bowtie2);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6179 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6180 if ($bowtie2){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6181 if ($parallel){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6182 die "Please select a value for -p of 2 or more!\n" unless ($parallel > 1);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6183 push @bowtie_options,"-p $parallel";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6184 push @bowtie_options,'--reorder'; ## re-orders the bowtie 2 output so that it does match the input files. This is abolutely required for parallelization to work.
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6185 print "Each Bowtie 2 instance is going to be run with $parallel threads. Please monitor performance closely and tune down if needed!\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6186 sleep (2);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6187 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6188 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6189
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6190 ### REPORTING OPTIONS
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6191
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6192 if ($bowtie2){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6193 push @bowtie_options,'--ignore-quals'; ## All mismatches will receive penalty for mismatches as if they were of high quality, which is 6 by default
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6194
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6195 ### Option -M is deprecated since Bowtie 2 version 2.0.0 beta7. I'll leave this option commented out for a while
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6196 if(defined $most_valid_alignments){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6197
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6198 warn "\nThe option -M is now deprecated (as of Bowtie 2 version 2.0.0 beta7). What used to be called -M mode is still the default mode. Use the -D and -R options to adjust the effort expended to find valid alignments.\n\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6199 # push @bowtie_options,"-M $most_valid_alignments";sleep (5);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6200 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6201 # else{
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6202 # push @bowtie_options,'-M 10'; # the default behavior for Bowtie 2 is to report (and sort) up to 500 alignments for a given sequence
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6203 # }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6204 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6205 else{ # Because of the way Bismark works we will always use the reporting option -k 2 (report up to 2 valid alignments) for Bowtie 1
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6206 push @bowtie_options,'-k 2';
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6207 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6208
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6209 ### --BEST
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6210 if ($bowtie2){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6211 if ($best){ # Bowtie 2 does away with the concept of --best, so one can also not select --no-best when Bowtie 2 is to be used
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6212 die "The option '--no-best' is not compatible with Bowtie 2. Please respecify options\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6213 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6214 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6215 else{
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6216 # --best is the default option for Bowtie 1, specifying --no-best can turn it off (e.g. to speed up alignment process)
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6217 unless ($best){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6218 push @bowtie_options,'--best';
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6219 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6220 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6221
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6222 ### VANILLA BISMARK (BOWTIE 1) OUTPUT
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6223 if ($vanilla){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6224 if ($bowtie2){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6225 die "The options --bowtie2 and the --vanilla are not compatible. Please respecify!\n\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6226 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6227 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6228 else{
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6229 $vanilla = 0;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6230 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6231
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6232 ### PAIRED-END MAPPING
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6233 if ($mates1){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6234 my @mates1 = (split (/,/,$mates1));
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6235 die "Paired-end mapping requires the format: -1 <mates1> -2 <mates2>, please respecify!\n" unless ($mates2);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6236 my @mates2 = (split(/,/,$mates2));
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6237 unless (scalar @mates1 == scalar @mates2){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6238 die "Paired-end mapping requires the same amounnt of mate1 and mate2 files, please respecify! (format: -1 <mates1> -2 <mates2>)\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6239 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6240 while (1){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6241 my $mate1 = shift @mates1;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6242 my $mate2 = shift @mates2;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6243 last unless ($mate1 and $mate2);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6244 push @filenames,"$mate1,$mate2";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6245 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6246 if ($bowtie2){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6247 push @bowtie_options,'--no-mixed'; ## By default Bowtie 2 is not looking for single-end alignments if it can't find concordant or discordant alignments
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6248 push @bowtie_options,'--no-discordant';## By default Bowtie 2 is not looking for discordant alignments if it can't find concordant ones
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6249 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6250 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6251 elsif ($mates2){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6252 die "Paired-end mapping requires the format: -1 <mates1> -2 <mates2>, please respecify!\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6253 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6254
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6255 ### SINGLE-END MAPPING
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6256 # Single-end mapping will be performed if no mate pairs for paired-end mapping have been specified
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6257 my $singles;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6258 unless ($mates1 and $mates2){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6259 $singles = join (',',@ARGV);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6260 unless ($singles){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6261 die "\nNo filename supplied! Please specify one or more files for single-end Bismark mapping!\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6262 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6263 $singles =~ s/\s/,/g;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6264 @filenames = (split(/,/,$singles));
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6265 warn "\nFiles to be analysed:\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6266 warn "@filenames\n\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6267 sleep (3);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6268 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6269
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6270 ### MINIMUM INSERT SIZE (PAIRED-END ONLY)
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6271 if (defined $minins){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6272 die "-I/--minins can only be used for paired-end mapping!\n\n" if ($singles);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6273 push @bowtie_options,"--minins $minins";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6274 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6275
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6276 ### MAXIMUM INSERT SIZE (PAIRED-END ONLY)
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6277 if (defined $maxins){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6278 die "-X/--maxins can only be used for paired-end mapping!\n\n" if ($singles);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6279 push @bowtie_options,"--maxins $maxins";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6280 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6281 else{
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6282 unless ($singles){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6283 push @bowtie_options,'--maxins 500';
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6284 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6285 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6286
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6287 ### QUIET prints nothing besides alignments (suppresses warnings)
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6288 if ($quiet){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6289 push @bowtie_options,'--quiet';
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6290 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6291
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6292 ### CHUNKMBS needed to be increased to avoid memory exhaustion warnings for Bowtie 1, particularly for --best (and paired-end) alignments
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6293 unless ($bowtie2){ # Bowtie 2 does not have a chunkmbs option
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6294 if (defined $chunk){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6295 push @bowtie_options,"--chunkmbs $chunk";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6296 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6297 else{
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6298 push @bowtie_options,'--chunkmbs 512'; ## setting the default to 512MB (up from 64 default)
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6299 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6300 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6301
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6302
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6303 ### SUMMARY OF ALL BOWTIE OPTIONS
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6304 my $bowtie_options = join (' ',@bowtie_options);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6305
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6306
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6307 ### STRAND-SPECIFIC LIBRARIES
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6308 my $directional;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6309 if ($non_directional){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6310 die "A library can only be specified to be either non-directional or a PBAT-Seq library. Please respecify!\n\n" if ($pbat);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6311 warn "Library was specified to be not strand-specific (non-directional), therefore alignments to all four possible bisulfite strands (OT, CTOT, OB and CTOB) will be reported\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6312 sleep (3);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6313 $directional = 0;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6314 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6315 elsif($pbat){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6316 die "The option --pbat is currently not compatible with --gzip. Please run alignments with uncompressed temporary files, i.e. lose the option --gzip\n" if ($gzip);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6317 die "The option --pbat is currently not working for Bowtie 2. Please run alignments in default (i.e. Bowtie 1) mode!\n" if ($bowtie2);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6318 die "The option --pbat is currently only working with FastQ files. Please respecify (i.e. lose the option -f)!\n" if ($fasta);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6319
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6320 warn "Library was specified as PBAT-Seq (Post-Bisulfite Adapter Tagging), only performing alignments to the complementary strands (CTOT and CTOB)\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6321 sleep (3);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6322 $directional = 0;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6323 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6324 else{
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6325 warn "Library is assumed to be strand-specific (directional), alignments to strands complementary to the original top or bottom strands will be ignored (i.e. not performed!)\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6326 sleep (3);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6327 $directional = 1; # default behaviour
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6328 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6329
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6330 ### UNMAPPED SEQUENCE OUTPUT
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6331 $unmapped = 0 unless ($unmapped);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6332
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6333 ### AMBIGUOUS ALIGNMENT SEQUENCE OUTPUT
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6334 $multi_map = 0 unless ($multi_map);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6335
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6336
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6337 ### OUTPUT DIRECTORY
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6338
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6339 chdir $parent_dir or die "Failed to move back to current working directory\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6340 if ($output_dir){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6341 unless ($output_dir =~ /\/$/){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6342 $output_dir =~ s/$/\//;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6343 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6344
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6345 if (chdir $output_dir){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6346 $output_dir = getcwd; # making the path absolute
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6347 unless ($output_dir =~ /\/$/){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6348 $output_dir =~ s/$/\//;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6349 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6350 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6351 else{
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6352 mkdir $output_dir or die "Unable to create directory $output_dir $!\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6353 warn "Created output directory $output_dir!\n\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6354 chdir $output_dir or die "Failed to move to $output_dir\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6355 $output_dir = getcwd; # making the path absolute
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6356 unless ($output_dir =~ /\/$/){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6357 $output_dir =~ s/$/\//;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6358 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6359 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6360 warn "Output will be written into the directory: $output_dir\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6361 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6362 else{
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6363 $output_dir = '';
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6364 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6365
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6366 ### TEMPORARY DIRECTORY for C->T and G->A transcribed files
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6367
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6368 chdir $parent_dir or die "Failed to move back to current working directory\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6369 if ($temp_dir){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6370 warn "\nUsing temp directory: $temp_dir\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6371 unless ($temp_dir =~ /\/$/){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6372 $temp_dir =~ s/$/\//;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6373 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6374
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6375 if (chdir $temp_dir){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6376 $temp_dir = getcwd; # making the path absolute
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6377 unless ($temp_dir =~ /\/$/){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6378 $temp_dir =~ s/$/\//;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6379 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6380 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6381 else{
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6382 mkdir $temp_dir or die "Unable to create directory $temp_dir $!\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6383 warn "Created temporary directory $temp_dir!\n\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6384 chdir $temp_dir or die "Failed to move to $temp_dir\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6385 $temp_dir = getcwd; # making the path absolute
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6386 unless ($temp_dir =~ /\/$/){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6387 $temp_dir =~ s/$/\//;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6388 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6389 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6390 warn "Temporary files will be written into the directory: $temp_dir\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6391 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6392 else{
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6393 $temp_dir = '';
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6394 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6395
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6396 ### OPTIONAL NON-BS MISMATCH OUTPUT AS EXTRA COLUMN IN SAM FILE
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6397 if ($non_bs_mm){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6398 if ($vanilla){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6399 die "Option '--non_bs_mm' may only be specified for output in SAM format. Please respecify!\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6400 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6401 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6402
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6403 return ($genome_folder,$CT_index_basename,$GA_index_basename,$path_to_bowtie,$sequence_format,$bowtie_options,$directional,$unmapped,$multi_map,$phred64,$solexa,$output_dir,$bowtie2,$vanilla,$sam_no_hd,$skip,$qupto,$temp_dir,$non_bs_mm,$insertion_open,$insertion_extend,$deletion_open,$deletion_extend,$gzip,$bam,$samtools_path,$pbat);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6404 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6405
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6406
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6407
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
# Writes the SAM header section to the OUT filehandle: one @HD line, one @SQ
# line for every key of %chromosomes, and a final @PG line.
#
# Takes no arguments. Reads %chromosomes, $bismark_version and $command_line,
# all of which are defined outside this sub, and prints to OUT, which is
# expected to be open for writing when this is called.
sub generate_SAM_header{
  print OUT "\@HD\tVN:1.0\tSO:unsorted\n"; # @HD = header, VN = version, SO = sort order

  # Perl does not guarantee any particular hash key order, and it may differ
  # from one run to the next. Sorting the names keeps the @SQ section identical
  # across runs for the same set of sequences.
  foreach my $chr (sort keys %chromosomes){
    my $length = length ($chromosomes{$chr}); # LN is the string length of the value stored under this name
    print OUT "\@SQ\tSN:$chr\tLN:$length\n"; # @SQ = sequence, SN = seq name, LN = length
  }

  print OUT "\@PG\tID:Bismark\tVN:$bismark_version\tCL:\"bismark $command_line\"\n"; # @PG = program, ID = unique identifier, VN = program version, CL = command line
}
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6416
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6417 ### I would like to thank the following individuals for their valuable contributions to the Bismark SAM output format:
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6418 ### O. Tam (Sep 2010), C. Whelan (2011), E. Vidal (2011), T. McBryan (2011), P. Hickey (2011)
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6419
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6420 sub single_end_SAM_output{
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6421 my ($id,$actual_seq,$methylation_call_params,$qual) = @_;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6422 my $strand = $methylation_call_params->{$id}->{alignment_strand};
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6423 my $chr = $methylation_call_params->{$id}->{chromosome};
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6424 my $start = $methylation_call_params->{$id}->{position};
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6425 my $stop = $methylation_call_params->{$id}->{end_position};
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6426 my $ref_seq = $methylation_call_params->{$id}->{unmodified_genomic_sequence};
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6427 my $methcall = $methylation_call_params->{$id}->{methylation_call};
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6428 my $read_conversion = $methylation_call_params->{$id}->{read_conversion};
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6429 my $genome_conversion = $methylation_call_params->{$id}->{genome_conversion};
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6430 my $number_of_mismatches;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6431 if ($bowtie2){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6432 $number_of_mismatches= $methylation_call_params->{$id}->{alignment_score};
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6433 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6434 else{
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6435 $number_of_mismatches= $methylation_call_params->{$id}->{number_of_mismatches};
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6436 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6437
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6438 ### This is a description of the bitwise FLAG field which needs to be set for the SAM file taken from: "The SAM Format Specification (v1.4-r985), September 7, 2011"
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6439 ## FLAG: bitwise FLAG. Each bit is explained in the following table:
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6440 ## Bit Description Comment Value
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6441 ## 0x1 template has multiple segments in sequencing 0: single-end 1: paired end value: 2**0 ( 1)
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6442 ## 0x2 each segment properly aligned according to the aligner true only for paired-end alignments value: 2**1 ( 2)
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6443 ## 0x4 segment unmapped --- ---
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6444 ## 0x8 next segment in the template unmapped --- ---
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6445 ## 0x10 SEQ being reverse complemented value: 2**4 ( 16)
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6446 ## 0x20 SEQ of the next segment in the template being reversed value: 2**5 ( 32)
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6447 ## 0x40 the first segment in the template read 1 value: 2**6 ( 64)
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6448 ## 0x80 the last segment in the template read 2 value: 2**7 (128)
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6449 ## 0x100 secondary alignment --- ---
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6450 ## 0x200 not passing quality controls --- ---
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6451 ## 0x400 PCR or optical duplicate --- ---
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6452
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6453 #####
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6454
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6455 my $flag; # FLAG variable used for SAM format.
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6456 if ($strand eq "+"){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6457 if ($read_conversion eq 'CT' and $genome_conversion eq 'CT'){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6458 $flag = 0; # 0 for "+" strand (OT)
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6459 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6460 elsif ($read_conversion eq 'GA' and $genome_conversion eq 'GA'){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6461 $flag = 16; # 16 for "-" strand (CTOB, yields information for the original bottom strand)
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6462 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6463 else{
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6464 die "Unexpected strand and read/genome conversion: strand: $strand, read conversion: $read_conversion, genome_conversion: $genome_conversion\n\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6465 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6466 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6467 elsif ($strand eq "-"){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6468 if ($read_conversion eq 'CT' and $genome_conversion eq 'GA'){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6469 $flag = 16; # 16 for "-" strand (OB)
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6470 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6471 elsif ($read_conversion eq 'GA' and $genome_conversion eq 'CT'){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6472 $flag = 0; # 0 for "+" strand (CTOT, yields information for the original top strand)
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6473 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6474 else{
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6475 die "Unexpected strand and read/genome conversion: strand: $strand, read conversion: $read_conversion, genome_conversion: $genome_conversion\n\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6476 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6477 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6478 else{
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6479 die "Unexpected strand information: $strand\n\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6480 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6481
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6482 #####
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6483
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6484 my $mapq = 255; # Assume mapping quality is unavailable
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6485
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6486 #####
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6487
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6488 my $cigar;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6489 if ($bowtie2){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6490 $cigar = $methylation_call_params->{$id}->{CIGAR}; # Actual CIGAR string reported by Bowtie 2
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6491 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6492 else{
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6493 $cigar = length($actual_seq) . "M"; # Bowtie 1 output does not contain indels (only matches and mismatches)
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6494 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6495
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6496 #####
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6497
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6498 my $rnext = "*"; # Paired-end variable
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6499
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6500 #####
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6501
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6502 my $pnext = 0; # Paired-end variable
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6503
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6504 #####
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6505
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6506 my $tlen = 0; # Paired-end variable
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6507
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6508 #####
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6509
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6510 if ($read_conversion eq 'CT'){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6511 $ref_seq = substr($ref_seq, 0, length($ref_seq) - 2); # Removes additional nucleotides from the 3' end. This only works for the original top or bottom strands
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6512 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6513 else{
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6514 $ref_seq = substr($ref_seq, 2, length($ref_seq) - 2); # Removes additional nucleotides from the 5' end. This works for the complementary strands in non-directional libraries
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6515 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6516
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6517 if ($strand eq '-'){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6518 $actual_seq = revcomp($actual_seq); # Sequence represented on the forward genomic strand
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6519 $ref_seq = revcomp($ref_seq); # Required for comparison with actual sequence
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6520 $qual = reverse $qual; # if the sequence was reverse-complemented the quality string needs to be reversed as well
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6521 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6522
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6523 #####
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6524
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6525 my $hemming_dist = hemming_dist($actual_seq,$ref_seq); # Edit distance to the reference, i.e. minimal number of one-nucleotide edits needed to transform the read string
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6526 # into the reference string. hemming_dist()
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6527 if ($bowtie2){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6528 $hemming_dist += $methylation_call_params->{$id}->{indels}; # Adding the number of inserted/deleted bases which we parsed while getting the genomic sequence
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6529 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6530
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6531 my $NM_tag = "NM:i:$hemming_dist"; # Optional tag NM: edit distance based on nucleotide differences
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6532
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6533 #####
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6534
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6535 my $XX_tag = make_mismatch_string($actual_seq, $ref_seq); # Optional tag XX: string providing mismatched reference bases in the alignment (NO indel information!)
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6536
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6537 #####
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6538
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6539 my $XM_tag; # Optional tag XM: Methylation Call String
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6540 if ($strand eq '+'){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6541 $XM_tag = "XM:Z:$methcall";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6542 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6543 elsif ($strand eq '-'){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6544 $XM_tag = 'XM:Z:'.reverse $methcall; # if the sequence was reverse-complemented the methylation call string needs to be reversed as well
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6545 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6546
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6547 #####
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6548
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6549 my $XR_tag = "XR:Z:$read_conversion"; # Optional tag XR: Read Conversion
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6550
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6551 #####
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6552
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6553 my $XG_tag = "XG:Z:$genome_conversion"; # Optional tag XG: Genome Conversion
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6554
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6555 #####
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6556
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6557 # Optionally calculating number of mismatches for Bowtie 2 alignments
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6558
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6559 if ($non_bs_mm) {
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6560 if ($bowtie2) {
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6561
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6562 $number_of_mismatches =~ s/-//; # removing the minus sign
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6563
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6564 ### if Bowtie 2 was used we need to analyse the CIGAR string whether the read contained any indels to determine the number of mismatches
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6565 if ($cigar =~ /(D|I)/) {
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6566 # warn "$cigar\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6567
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6568 # parsing CIGAR string
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6569 my @len = split (/\D+/,$cigar); # storing the length per operation
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6570 my @ops = split (/\d+/,$cigar); # storing the operation
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6571 shift @ops; # remove the empty first element
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6572 die "CIGAR string contained a non-matching number of lengths and operations\n" unless (scalar @len == scalar @ops);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6573
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6574 foreach (0..$#len) {
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6575 if ($ops[$_] eq 'M') {
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6576 # warn "skipping\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6577 next; # irrelevant
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6578 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6579 elsif ($ops[$_] eq 'I') { # insertion in the read sequence
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6580 $number_of_mismatches -= $insertion_open;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6581 $number_of_mismatches -= $len[$_] * $insertion_extend;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6582 # warn "Insertion: Subtracting $ops[$_], length $len[$_], open: $insertion_open, extend: $insertion_extend\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6583 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6584 elsif ($ops[$_] eq 'D') { # deletion in the read sequence
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6585 $number_of_mismatches -= $deletion_open;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6586 $number_of_mismatches -= $len[$_] * $deletion_extend;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6587 # warn "Deletion: Subtracting $ops[$_], length $len[$_], open: $deletion_open, extend: $deletion_extend\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6588 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6589 elsif ($cigar =~ tr/[NSHPX=]//) { # if these (for standard mapping) illegal characters exist we die
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6590 die "The CIGAR string contained illegal CIGAR operations in addition to 'M', 'I' and 'D': $cigar\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6591 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6592 else {
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6593 die "The CIGAR string contained undefined CIGAR operations in addition to 'M', 'I' and 'D': $cigar\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6594 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6595 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6596 # warn "Alignment score $number_of_mismatches\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6597 # print "Mismatches $number_of_mismatches\n\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6598 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6599 ### Now we have InDel corrected alignment scores
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6600
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6601 ### if the actual sequence contained Ns we need to adjust the number of mismatches. Ns receive a penalty of -1, but normal mismatches receive -6. This might still break if the
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6602 ### sequence contained more than 5 Ns, but this should occur close to never
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6603
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6604 my $seq_N_count = $number_of_mismatches % 6; # modulo 6 will return the integer rest after the division
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6605 # warn "N count: $seq_N_count\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6606 $number_of_mismatches = int ($number_of_mismatches / 6) + $seq_N_count;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6607 # warn "MM $number_of_mismatches\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6608 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6609 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6610
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6611 ####
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6612
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6613 my $XA_tag = "XA:Z:$number_of_mismatches";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6614
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6615 #####
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6616
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6617 # SAM format: QNAME, FLAG, RNAME, 1-based POS, MAPQ, CIGAR, RNEXT, PNEXT, TLEN, SEQ, QUAL, optional fields
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6618 ### optionally print number of non-bisulfite mismatches
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6619 if ($non_bs_mm){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6620 print OUT join("\t",($id,$flag,$chr,$start,$mapq,$cigar,$rnext,$pnext,$tlen,$actual_seq,$qual,$NM_tag,$XX_tag,$XM_tag,$XR_tag,$XG_tag,$XA_tag)),"\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6621 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6622 else{ # default
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6623 # SAM format: QNAME, FLAG, RNAME, 1-based POS, MAPQ, CIGAR, RNEXT, PNEXT, TLEN, SEQ, QUAL, optional fields
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6624 print OUT join("\t",($id,$flag,$chr,$start,$mapq,$cigar,$rnext,$pnext,$tlen,$actual_seq,$qual,$NM_tag,$XX_tag,$XM_tag,$XR_tag,$XG_tag)),"\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6625 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6626 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6627
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6628 sub paired_end_SAM_output{
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6629 my ($id,$actual_seq_1,$actual_seq_2,$methylation_call_params,$qual_1,$qual_2) = @_;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6630 my $strand_1 = $methylation_call_params->{$id}->{alignment_read_1}; # Bowtie 1 only reports the read 1 alignment strand
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6631 my $strand_2 = $methylation_call_params->{$id}->{alignment_read_2};
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6632 my $chr = $methylation_call_params->{$id}->{chromosome};
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6633 my $ref_seq_1 = $methylation_call_params->{$id}->{unmodified_genomic_sequence_1};
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6634 my $ref_seq_2 = $methylation_call_params->{$id}->{unmodified_genomic_sequence_2};
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6635 my $methcall_1 = $methylation_call_params->{$id}->{methylation_call_1};
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6636 my $methcall_2 = $methylation_call_params->{$id}->{methylation_call_2};
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6637 my $read_conversion_1 = $methylation_call_params->{$id}->{read_conversion_1};
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6638 my $read_conversion_2 = $methylation_call_params->{$id}->{read_conversion_2};
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6639 my $genome_conversion = $methylation_call_params->{$id}->{genome_conversion};
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6640
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6641 my $id_1 = $id.'/1';
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6642 my $id_2 = $id.'/2';
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6643
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6644 # Allows all degenerate nucleotide sequences in reference genome
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6645 die "Reference sequence ($ref_seq_1) contains invalid nucleotides!\n" if $ref_seq_1 =~ /[^ACTGNRYMKSWBDHV]/i;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6646 die "Reference sequence ($ref_seq_2) contains invalid nucleotides!\n" if $ref_seq_2 =~ /[^ACTGNRYMKSWBDHV]/i;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6647
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6648 my $index; # used to store the strand origin of the alignment in a less convoluted way
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6649
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6650 if ($read_conversion_1 eq 'CT' and $genome_conversion eq 'CT'){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6651 $index = 0; ## this is OT (original top strand)
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6652 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6653 elsif ($read_conversion_1 eq 'GA' and $genome_conversion eq 'GA'){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6654 $index = 1; ## this is CTOB (complementary to OB)
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6655 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6656 elsif ($read_conversion_1 eq 'GA' and $genome_conversion eq 'CT'){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6657 $index = 2; ## this is CTOT (complementary to OT)
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6658 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6659 elsif ($read_conversion_1 eq 'CT' and $genome_conversion eq 'GA'){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6660 $index = 3; ## this is OB (original bottom)
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6661 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6662 else {
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6663 die "Unexpected combination of read 1 and genome conversion: $read_conversion_1 / $genome_conversion\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6664 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6665
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6666 my $number_of_mismatches_1;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6667 my $number_of_mismatches_2;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6668
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6669 if ($bowtie2){ # Bowtie 2 reports always as read 1 then read 2, so this is fine
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6670 $number_of_mismatches_1 = $methylation_call_params->{$id}->{alignment_score_1}; # only needed for custom allele-specific output, not the default!
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6671 $number_of_mismatches_2 = $methylation_call_params->{$id}->{alignment_score_2};
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6672 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6673 else{ # Bowtie 1 reports always the leftmost read first. That means we have to reverse the strings if the first read aligned in reverse orientation
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6674 if ($index == 2 or $index == 3){ # CTOT or OB
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6675 $number_of_mismatches_1 = $methylation_call_params->{$id}->{number_of_mismatches_2}; # only needed for custom allele-specific output, not the default!
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6676 $number_of_mismatches_2 = $methylation_call_params->{$id}->{number_of_mismatches_1};
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6677 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6678 else{ # if the first read aligned in forward direction it is like for Bowtie 2
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6679 $number_of_mismatches_1 = $methylation_call_params->{$id}->{number_of_mismatches_1}; # only needed for custom allele-specific output, not the default!
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6680 $number_of_mismatches_2 = $methylation_call_params->{$id}->{number_of_mismatches_2};
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6681 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6682 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6683
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6684
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6685
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6686 ### we need to remove 2 bp of the genomic sequence as we were extracting read + 2bp long fragments to make a methylation call at the
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6687 ### first or last position.
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6688
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6689 if ($index == 0 or $index == 3){ # OT or OB
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6690 $ref_seq_1 = substr($ref_seq_1,0,length($ref_seq_1)-2);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6691 $ref_seq_2 = substr($ref_seq_2,2,length($ref_seq_2)-2);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6692 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6693 else{ # CTOT or CTOB
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6694 $ref_seq_1 = substr($ref_seq_1,2,length($ref_seq_1)-2);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6695 $ref_seq_2 = substr($ref_seq_2,0,length($ref_seq_2)-2);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6696 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6697
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6698 #####
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6699
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6700 my $start_read_1;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6701 my $start_read_2;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6702 # adjusting start positions
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6703
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6704 if ($bowtie2){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6705 $start_read_1 = $methylation_call_params->{$id}->{position_1};
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6706 $start_read_2 = $methylation_call_params->{$id}->{position_2};
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6707 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6708 else{ # Bowtie 1 output. $strand_1 stores the alignment of Read 1
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6709 if ($strand_1 eq '+'){ # Read 1 aligns to the + strand
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6710 $start_read_1 = $methylation_call_params->{$id}->{start_seq_1};
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6711 $start_read_2 = $methylation_call_params->{$id}->{alignment_end} - length ($actual_seq_2) + 1;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6712 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6713 else{ # read 1 is on the - strand
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6714 $start_read_1 = $methylation_call_params->{$id}->{alignment_end} - length ($actual_seq_1) + 1;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6715 $start_read_2 = $methylation_call_params->{$id}->{start_seq_1};
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6716 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6717 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6718
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6719 #####
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6720
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6721 my $end_read_1;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6722 my $end_read_2;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6723 # adjusting end positions
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6724
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6725 if ($bowtie2){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6726 $end_read_1 = $methylation_call_params->{$id}->{end_position_1};
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6727 $end_read_2 = $methylation_call_params->{$id}->{end_position_2};
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6728 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6729 else{ # Bowtie 1 output. $strand_1 stores the alignment of Read 1
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6730 if ($strand_1 eq '+'){ # Read 1 aligns to the + strand
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6731 $end_read_1 = $methylation_call_params->{$id}->{start_seq_1} + length ($actual_seq_1)-1;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6732 $end_read_2 = $methylation_call_params->{$id}->{alignment_end};
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6733 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6734 else{
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6735 $end_read_1 = $methylation_call_params->{$id}->{alignment_end};
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6736 $end_read_2 = $methylation_call_params->{$id}->{start_seq_1} + length ($actual_seq_2)-1;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6737 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6738 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6739
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6740 #####
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6741
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6742 ### This is a description of the bitwise FLAG field which needs to be set for the SAM file taken from: "The SAM Format Specification (v1.4-r985), September 7, 2011"
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6743 ## FLAG: bitwise FLAG. Each bit is explained in the following table:
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6744 ## Bit Description Comment Value
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6745 ## 0x1 template having multiple segments in sequencing 0: single-end 1: paired end value: 2^^0 ( 1)
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6746 ## 0x2 each segment properly aligned according to the aligner true only for paired-end alignments value: 2^^1 ( 2)
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6747 ## 0x4 segment unmapped --- ---
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6748 ## 0x8 next segment in the template unmapped --- ---
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6749 ## 0x10 SEQ being reverse complemented - strand alignment value: 2^^4 ( 16)
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6750 ## 0x20 SEQ of the next segment in the template being reversed + strand alignment value: 2^^5 ( 32)
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6751 ## 0x40 the first segment in the template read 1 value: 2^^6 ( 64)
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6752 ## 0x80 the last segment in the template read 2 value: 2^^7 (128)
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6753 ## 0x100 secondary alignment --- ---
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6754 ## 0x200 not passing quality controls --- ---
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6755 ## 0x400 PCR or optical duplicate --- ---
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6756
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6757 ### As the FLAG values do not consider that there might be 4 different bisulfite strands of DNA, we are trying to make FLAG values which take the strand identity into account
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6758
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6759 # strands OT and CTOT will be treated as aligning to the top strand (both sequences are scored as aligning to the top strand)
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6760 # strands OB and CTOB will be treated as aligning to the bottom strand (both sequences are scored as reverse complemented sequences)
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6761
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6762 my $flag_1; # FLAG variable used for SAM format
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6763 my $flag_2;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6764
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6765 if ($index == 0){ # OT
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6766 $flag_1 = 67; # Read 1 is on the + strand (1+2+64) (Read 2 is technically reverse-complemented, but we do not score it)
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6767 $flag_2 = 131; # Read 2 is on - strand but informative for the OT (1+2+128)
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6768 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6769 elsif ($index == 1){ # CTOB
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6770 $flag_1 = 115; # Read 1 is on the + strand, we score for OB (1+2+16+32+64)
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6771 $flag_2 = 179; # Read 2 is on the - strand (1+2+16+32+128)
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6772 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6773 elsif ($index == 2){ # CTOT
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6774 $flag_1 = 67; # Read 1 is on the - strand (CTOT) strand, but we score it for OT (1+2+64)
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6775 $flag_2 = 131; # Read 2 is on the + strand, score it for OT (1+2+128)
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6776 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6777 elsif ($index == 3){ # OB
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6778 $flag_1 = 115; # Read 1 is on the - strand, we score for OB (1+2+16+32+64)
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6779 $flag_2 = 179; # Read 2 is on the + strand (1+2+16+32+128)
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6780 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6781
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6782 #####
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6783
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6784 my $mapq = 255; # Mapping quality is unavailable
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6785
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6786 #####
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6787
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6788 my $cigar_1;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6789 my $cigar_2;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6790
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6791 if ($bowtie2){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6792 $cigar_1 = $methylation_call_params->{$id}->{CIGAR_1}; # Actual CIGAR string reported by Bowtie 2
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6793 $cigar_2 = $methylation_call_params->{$id}->{CIGAR_2};
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6794 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6795 else{
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6796 $cigar_1 = length($actual_seq_1) . "M"; # Assume no indels for Bowtie 1 mapping (only matches and mismatches)
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6797 $cigar_2 = length($actual_seq_2) . "M";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6798 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6799
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6800 #####
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6801
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6802 my $rnext = '='; # Chromosome of mate; applies to both reads
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6803
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6804 #####
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6805
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6806 my $pnext_1 = $start_read_2; # Leftmost position of mate
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6807 my $pnext_2 = $start_read_1;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6808
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6809 #####
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6810
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6811 my $tlen_1; # signed observed Template LENgth (or inferred fragment size)
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6812 my $tlen_2;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6813
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6814 if ($bowtie2){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6815
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6816 if ($start_read_1 <= $start_read_2){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6817
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6818 # Read 1 alignment is leftmost
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6819
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6820 if ($end_read_2 >= $end_read_1){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6821
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6822 # -------------------------> read 1 reads overlapping
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6823 # <------------------------- read 2
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6824 #
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6825 # or
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6826 #
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6827 # -------------------------> read 1
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6828 # <----------------------- read 2 read 2 contained within read 1
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6829 #
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6830 # or
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6831 #
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6832 # -------------------------> read 1 reads 1 and 2 exactly overlapping
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6833 # <------------------------- read 2
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6834 #
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6835
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6836 # dovetailing of reads is not enabled for Bowtie 2 alignments
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6837
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6838 $tlen_1 = $end_read_2 - $start_read_1 + 1; # Leftmost read has a + sign,
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6839 $tlen_2 = $start_read_1 - $end_read_2 - 1; # Rightmost read has a - sign
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6840 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6841 elsif ($end_read_2 < $end_read_1){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6842
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6843 # -------------------------> read 1
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6844 # <----------- read 2 read 2 contained within read 1
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6845 #
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6846 # or
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6847 #
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6848 # -------------------------> read 1
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6849 # <----------- read 2 read 2 contained within read 1
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6850
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6851 # start and end of read 2 are fully contained within read 1
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6852 $tlen_1 = 0; # Set as 0 when the information is unavailable
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6853 $tlen_2 = 0; # Set as 0 when the information is unavailable
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6854 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6855
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6856 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6857
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6858 elsif ($start_read_2 < $start_read_1){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6859
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6860 if ($end_read_1 >= $end_read_2){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6861
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6862 # Read 2 alignment is leftmost
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6863
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6864 # -------------------------> read 2 reads overlapping
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6865 # <------------------------- read 1
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6866 #
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6867 # or
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6868 #
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6869 # -------------------------> read 2
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6870 # <----------------------- read 1 read 1 contained within read 2
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6871 #
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6872 #
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6873
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6874 $tlen_2 = $end_read_1 - $start_read_2 + 1; # Leftmost read has a + sign,
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6875 $tlen_1 = $start_read_2 - $end_read_1 - 1; # Rightmost read has a - sign
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6876 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6877 elsif ($end_read_1 < $end_read_2){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6878
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6879 # -------------------------> read 2
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6880 # <----------- read 1 read 1 contained within read 2
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6881 #
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6882 # or
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6883 #
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6884 # -------------------------> read 2
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6885 # <----------- read 1 read 1 contained within read 2
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6886
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6887 # start and end of read 1 are fully contained within read 2
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6888 $tlen_1 = 0; # Set as 0 when the information is unavailable
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6889 $tlen_2 = 0; # Set as 0 when the information is unavailable
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6890 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6891 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6892 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6893
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6894 else{ # Bowtie 1
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6895
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6896 if ($end_read_2 >= $end_read_1){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6897 # Read 1 alignment is leftmost
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6898 # -------------------------> read 1
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6899 # <------------------------- read 2
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6900 # this is the most extreme case for Bowtie 1 alignments, reads do not contain each other, also no dovetailing
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6901
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6902 $tlen_1 = $end_read_2 - $start_read_1 + 1; # Leftmost read has a + sign,
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6903 $tlen_2 = $start_read_1 - $end_read_2 - 1; # Rightmost read has a - sign
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6904 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6905 else{
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6906 # Read 2 alignment is leftmost
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6907 # -------------------------> read 2
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6908 # <------------------------- read 1
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6909 # this is the most extreme case for Bowtie 1 alignments, reads do not contain each other, also no dovetailing
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6910
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6911 $tlen_2 = $end_read_1 - $start_read_2 + 1; # Leftmost read has a + sign,
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6912 $tlen_1 = $start_read_2 - $end_read_1 - 1; # Rightmost read has a - sign
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6913 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6914 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6915
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6916 #####
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6917
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6918 # adjusting the strand of the sequence before we use them to generate mismatch strings
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6919 if ($strand_1 eq '-'){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6920 $actual_seq_1 = revcomp($actual_seq_1); # Sequence represented on the forward genomic strand
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6921 $ref_seq_1 = revcomp($ref_seq_1); # Required for comparison with actual sequence
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6922 $qual_1 = reverse $qual_1; # we need to reverse the quality string as well
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6923 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6924 if ($strand_2 eq '-'){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6925 $actual_seq_2 = revcomp($actual_seq_2); # Mate sequence represented on the forward genomic strand
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6926 $ref_seq_2 = revcomp($ref_seq_2); # Required for comparison with actual sequence
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6927 $qual_2 = reverse $qual_2; # If the sequence gets reverse complemented we reverse the quality string as well
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6928 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6929
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6930 # print "$actual_seq_1\n$ref_seq_1\n\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6931 # print "$actual_seq_2\n$ref_seq_2\n\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6932
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6933 #####
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6934
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6935 my $hemming_dist_1 = hemming_dist($actual_seq_1,$ref_seq_1); # Minimal number of one-nucleotide edits needed to transform the read string into the reference sequence
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6936 my $hemming_dist_2 = hemming_dist($actual_seq_2,$ref_seq_2);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6937 if ($bowtie2){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6938 $hemming_dist_1 += $methylation_call_params->{$id}->{indels_1}; # Adding the number of inserted/deleted bases which we parsed while getting the genomic sequence
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6939 $hemming_dist_2 += $methylation_call_params->{$id}->{indels_2}; # Adding the number of inserted/deleted bases which we parsed while getting the genomic sequence
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6940 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6941 my $NM_tag_1 = "NM:i:$hemming_dist_1"; # Optional tag NM: edit distance based on nucleotide differences
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6942 my $NM_tag_2 = "NM:i:$hemming_dist_2"; # Optional tag NM: edit distance based on nucleotide differences
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6943
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6944 #####
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6945
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6946 my $XX_tag_1 = make_mismatch_string($actual_seq_1,$ref_seq_1); # Optional tag XX: String providing mismatched reference bases in the alignment (NO indel information!)
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6947 my $XX_tag_2 = make_mismatch_string($actual_seq_2,$ref_seq_2);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6948
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6949 #####
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6950
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6951 my $XM_tag_1; # Optional tag XM: Methylation call string
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6952 my $XM_tag_2;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6953
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6954 if ($strand_1 eq '-'){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6955 $XM_tag_1 = 'XM:Z:'.reverse $methcall_1; # Needs to be reversed if the sequence was reverse complemented
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6956 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6957 else{
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6958 $XM_tag_1 = "XM:Z:$methcall_1";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6959 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6960
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6961 if ($strand_2 eq '-'){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6962 $XM_tag_2 = 'XM:Z:'.reverse $methcall_2; # Needs to be reversed if the sequence was reverse complemented
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6963 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6964 else{
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6965 $XM_tag_2 = "XM:Z:$methcall_2";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6966 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6967
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6968 #####
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6969
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6970 my $XR_tag_1 = "XR:Z:$read_conversion_1"; # Optional tag XR: Read 1 conversion state
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6971 my $XR_tag_2 = "XR:Z:$read_conversion_2"; # Optional tag XR: Read 2 conversion state
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6972
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6973 #####
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6974
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6975 my $XG_tag = "XG:Z:$genome_conversion"; # Optional tag XG: Genome Conversion state; valid for both reads
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6976
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6977 #####
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6978
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6979 # Optionally calculating number of mismatches for Bowtie 2 alignments
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6980
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6981 if ($non_bs_mm) {
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6982 if ($bowtie2) {
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6983
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6984 $number_of_mismatches_1 =~ s/-//; # removing the minus sign
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6985 $number_of_mismatches_2 =~ s/-//;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6986
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6987 ### if Bowtie 2 was used we need to analyse the CIGAR strings whether the reads contained any indels to determine the number of mismatches
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6988
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6989 ### CIGAR 1
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6990 if ($cigar_1 =~ /(D|I)/) {
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6991 # warn "$cigar_1\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6992
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6993 # parsing CIGAR string
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6994 my @len = split (/\D+/,$cigar_1); # storing the length per operation
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6995 my @ops = split (/\d+/,$cigar_1); # storing the operation
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6996 shift @ops; # remove the empty first element
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6997 die "CIGAR string '$cigar_1' contained a non-matching number of lengths and operations\n" unless (scalar @len == scalar @ops);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6998
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6999 foreach (0..$#len) {
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7000 if ($ops[$_] eq 'M') {
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7001 # warn "skipping\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7002 next; # irrelevant
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7003 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7004 elsif ($ops[$_] eq 'I') { # insertion in the read sequence
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7005 $number_of_mismatches_1 -= $insertion_open;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7006 $number_of_mismatches_1 -= $len[$_] * $insertion_extend;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7007 # warn "Insertion: Subtracting $ops[$_], length $len[$_], open: $insertion_open, extend: $insertion_extend\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7008 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7009 elsif ($ops[$_] eq 'D') { # deletion in the read sequence
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7010 $number_of_mismatches_1 -= $deletion_open;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7011 $number_of_mismatches_1 -= $len[$_] * $deletion_extend;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7012 # warn "Deletion: Subtracting $ops[$_], length $len[$_], open: $deletion_open, extend: $deletion_extend\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7013 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7014 elsif ($cigar_1 =~ tr/[NSHPX=]//) { # if these (for standard mapping) illegal characters exist we die
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7015 die "The CIGAR string contained illegal CIGAR operations in addition to 'M', 'I' and 'D': $cigar_1\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7016 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7017 else {
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7018 die "The CIGAR string contained undefined CIGAR operations in addition to 'M', 'I' and 'D': $cigar_1\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7019 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7020 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7021
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7022 # warn "Alignment score $number_of_mismatches_1\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7023 # print "Mismatches $number_of_mismatches_1\n\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7024 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7025
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7026 ### CIGAR 2
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7027 if ($cigar_2 =~ /(D|I)/) {
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7028 # warn "$cigar_2\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7029
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7030 # parsing CIGAR string
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7031 my @len = split (/\D+/,$cigar_2); # storing the length per operation
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7032 my @ops = split (/\d+/,$cigar_2); # storing the operation
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7033 shift @ops; # remove the empty first element
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7034 die "CIGAR string '$cigar_2' contained a non-matching number of lengths and operations\n" unless (scalar @len == scalar @ops);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7035
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7036 foreach (0..$#len) {
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7037 if ($ops[$_] eq 'M') {
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7038 # warn "skipping\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7039 next; #irrelevant
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7040 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7041 elsif ($ops[$_] eq 'I') { # insertion in the read sequence
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7042 $number_of_mismatches_2 -= $insertion_open;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7043 $number_of_mismatches_2 -= $len[$_] * $insertion_extend;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7044 # warn "Insertion: Subtracting $ops[$_], length $len[$_], open: $insertion_open, extend: $insertion_extend\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7045 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7046 elsif ($ops[$_] eq 'D') { # deletion in the read sequence
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7047 $number_of_mismatches_2 -= $deletion_open;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7048 $number_of_mismatches_2 -= $len[$_] * $deletion_extend;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7049 # warn "Deletion: Subtracting $ops[$_], length $len[$_], open: $deletion_open, extend: $deletion_extend\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7050 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7051 elsif ($cigar_2 =~ tr/[NSHPX=]//) { # if these (for standard mapping) illegal characters exist we die
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7052 die "The CIGAR string contained illegal CIGAR operations in addition to 'M', 'I' and 'D': $cigar_2\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7053 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7054 else {
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7055 die "The CIGAR string contained undefined CIGAR operations in addition to 'M', 'I' and 'D': $cigar_2\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7056 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7057 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7058 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7059
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7060 ### Now we have InDel corrected Alignment scores
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7061
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7062 ### if the actual sequence contained Ns we need to adjust the number of mismatches. Ns receive a penalty of -1, but normal mismatches receive -6. This might still break if the
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7063 ### sequence contained more than 5 Ns, but this should occur close to never
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7064
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7065 my $seq_1_N_count = $number_of_mismatches_1 % 6; # modulo 6 will return the integer rest after the division
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7066 my $seq_2_N_count = $number_of_mismatches_2 % 6;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7067 # warn "N count 1: $seq_1_N_count\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7068 # warn "N count 2: $seq_2_N_count\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7069
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7070 $number_of_mismatches_1 = int ($number_of_mismatches_1 / 6) + $seq_1_N_count;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7071 $number_of_mismatches_2 = int ($number_of_mismatches_2 / 6) + $seq_2_N_count;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7072
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7073 # warn "MM1 $number_of_mismatches_1 \n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7074 # warn "MM2 $number_of_mismatches_2 \n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7075 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7076 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7077
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7078 ####
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7079
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7080 my $XA_tag = "XA:Z:$number_of_mismatches_1";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7081 my $XB_tag = "XB:Z:$number_of_mismatches_2";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7082
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7083
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7084 # SAM format: QNAME, FLAG, RNAME, 1-based POS, MAPQ, CIGAR, RNEXT, PNEXT, TLEN, SEQ, QUAL, optional fields
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7085 ### optionally print number of non-bisulfite mismatches
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7086 if ($non_bs_mm){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7087 print OUT join("\t", ($id_1, $flag_1, $chr, $start_read_1, $mapq, $cigar_1, $rnext, $pnext_1, $tlen_1, $actual_seq_1, $qual_1, $NM_tag_1, $XX_tag_1, $XM_tag_1,$XR_tag_1,$XG_tag,$XA_tag)), "\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7088 print OUT join("\t", ($id_2, $flag_2, $chr, $start_read_2, $mapq, $cigar_2, $rnext, $pnext_2, $tlen_2, $actual_seq_2, $qual_2, $NM_tag_2, $XX_tag_2, $XM_tag_2,$XR_tag_2,$XG_tag,$XB_tag)), "\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7089 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7090 else{ # default
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7091 print OUT join("\t", ($id_1, $flag_1, $chr, $start_read_1, $mapq, $cigar_1, $rnext, $pnext_1, $tlen_1, $actual_seq_1, $qual_1, $NM_tag_1, $XX_tag_1, $XM_tag_1,$XR_tag_1,$XG_tag)), "\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7092 print OUT join("\t", ($id_2, $flag_2, $chr, $start_read_2, $mapq, $cigar_2, $rnext, $pnext_2, $tlen_2, $actual_seq_2, $qual_2, $NM_tag_2, $XX_tag_2, $XM_tag_2,$XR_tag_2,$XG_tag)), "\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7093 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7094 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7095
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
# Return the reverse complement of a nucleotide sequence.
#
# The sequence is reversed and A<->T / C<->G are exchanged. Lower-case a/c/g/t
# are complemented as well and come back upper-cased; any other character
# (e.g. N) is passed through unchanged.
#
# Dies if no sequence, or a false value such as the empty string, is supplied.
sub revcomp {
  my ($sequence) = @_;
  die "Missing seq to reverse complement\n" unless $sequence;

  # Reverse first, then complement the reversed copy in place
  (my $reverse_complement = scalar reverse $sequence) =~ tr/ACTGactg/TGACTGAC/;

  return $reverse_complement;
}
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7102
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
# Count the positions at which a read differs from its reference sequence
# (i.e. the Hamming distance; the historical spelling of the sub name is kept
# so that existing callers continue to work).
#
# Arguments: the read sequence, followed by the reference sequence.
# Returns:   the number of read positions whose character differs from the
#            reference character at the same position. Read positions that lie
#            beyond the end of the reference count as differences; reference
#            positions beyond the end of the read are ignored.
sub hemming_dist {
  my ($actual_seq, $ref_seq) = @_;
  $actual_seq = '' unless defined $actual_seq;
  $ref_seq    = '' unless defined $ref_seq;

  # Only the part of the reference that is covered by the read is compared
  $ref_seq = substr($ref_seq, 0, length $actual_seq);

  # String XOR yields "\0" wherever both sequences agree; a shorter reference is
  # implicitly padded with "\0", so overhanging read bases show up as differences
  # without ever touching an undefined value
  my $difference = "$actual_seq" ^ "$ref_seq";

  # tr with /c and an empty replacement list only counts the non-"\0" bytes
  my $mismatches = ($difference =~ tr/\0//c);
  return $mismatches;
}
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7112
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
# Build the optional XX tag for a SAM record: a string listing the reference
# bases at every position where the read differs from the reference, separated
# by the number of matching positions in between (no indel information).
#
# Arguments: the read sequence, followed by the reference sequence.
# Returns:   "XX:Z:" followed by alternating match counts and mismatched
#            reference bases, e.g. "XX:Z:1G2". Zero-length match runs are
#            omitted, so adjacent mismatches are written back to back.
# Dies if either sequence is missing (or is a false value).
sub make_mismatch_string {
  my $actual_seq = shift or die "Missing actual sequence";
  my $ref_seq    = shift or die "Missing reference sequence";

  my $XX_tag      = "XX:Z:";
  my $tmp         = ($actual_seq ^ $ref_seq);   # Bitwise comparison: "\0" wherever read and reference agree
  my $ref_length  = length($ref_seq);
  my $prev_mm_pos = 0;

  while ($tmp =~ /[^\0]/g) {                    # Where bitwise comparison showed a difference
    my $mm_pos    = pos($tmp);                  # 1-based position of this mismatch
    my $nuc_match = $mm_pos - $prev_mm_pos - 1; # Number of nucleotides that matched since the last mismatch

    # Reference nucleotide that differed from the read. It stays undefined when
    # the read extends beyond the reference. The declaration is kept separate
    # from the conditional assignment because 'my ... if COND' has undefined
    # behaviour in Perl.
    my $nuc_mm;
    $nuc_mm = substr($ref_seq, $mm_pos - 1, 1) if $mm_pos <= $ref_length;

    $XX_tag .= "$nuc_match" if $nuc_match > 0;  # Ignore if mismatches are adjacent to each other
    $XX_tag .= "$nuc_mm" if defined $nuc_mm;    # Nothing to report past the end of the reference
    $prev_mm_pos = $mm_pos;                     # Position of last mismatch
  }

  my $end_matches = $ref_length - $prev_mm_pos; # Number of matches from the last mismatch to the end of the reference
  $XX_tag .= "$end_matches" if $end_matches > 0; # Ignore if the last mismatch is at the end of the sequence
  return $XX_tag;
}
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7130
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7131
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7132
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7133 sub print_helpfile{
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7134 print << "HOW_TO";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7135
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7136
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7137 This program is free software: you can redistribute it and/or modify
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7138 it under the terms of the GNU General Public License as published by
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7139 the Free Software Foundation, either version 3 of the License, or
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7140 (at your option) any later version.
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7141
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7142 This program is distributed in the hope that it will be useful,
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7143 but WITHOUT ANY WARRANTY; without even the implied warranty of
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7144 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7145 GNU General Public License for more details.
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7146 You should have received a copy of the GNU General Public License
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7147 along with this program. If not, see <http://www.gnu.org/licenses/>.
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7148
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7149
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7150
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7151 DESCRIPTION
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7152
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7153
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7154 The following is a brief description of command line options and arguments to control the Bismark
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7155 bisulfite mapper and methylation caller. Bismark takes in FastA or FastQ files and aligns the
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7156 reads to a specified bisulfite genome. Sequence reads are transformed into a bisulfite converted forward strand
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7157 version (C->T conversion) or into a bisulfite treated reverse strand (G->A conversion of the forward strand).
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7158 Each of these reads are then aligned to bisulfite treated forward strand index of a reference genome
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7159 (C->T converted) and a bisulfite treated reverse strand index of the genome (G->A conversion of the
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7160 forward strand, by doing this alignments will produce the same positions). These 4 instances of Bowtie (1 or 2)
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7161 are run in parallel. The sequence file(s) are then read in again sequence by sequence to pull out the original
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7162 sequence from the genome and determine if there were any protected C's present or not.
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7163
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7164 As of version 0.7.0 Bismark will only run 2 alignment threads for OT and OB in parallel, the 4 strand mode can be
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7165 re-enabled by using --non_directional.
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7166
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7167 The final output of Bismark is in SAM format by default. For Bowtie 1 one can also choose to report the old
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7168 'vanilla' output format, which is a single tab delimited file with all sequences that have a unique best
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7169 alignment to any of the 4 possible strands of a bisulfite PCR product. Both formats are described in more detail below.
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7170
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7171
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7172 USAGE: bismark [options] <genome_folder> {-1 <mates1> -2 <mates2> | <singles>}
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7173
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7174
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7175 ARGUMENTS:
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7176
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7177 <genome_folder> The path to the folder containing the unmodified reference genome
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7178 as well as the subfolders created by the Bismark_Genome_Preparation
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7179 script (/Bisulfite_Genome/CT_conversion/ and /Bisulfite_Genome/GA_conversion/).
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7180 Bismark expects one or more fastA files in this folder (file extension: .fa
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7181 or .fasta). The path can be relative or absolute.
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7182
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7183 -1 <mates1> Comma-separated list of files containing the #1 mates (filename usually includes
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7184 "_1"), e.g. flyA_1.fq,flyB_1.fq). Sequences specified with this option must
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7185 correspond file-for-file and read-for-read with those specified in <mates2>.
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7186 Reads may be a mix of different lengths. Bismark will produce one mapping result
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7187 and one report file per paired-end input file pair.
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7188
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7189 -2 <mates2> Comma-separated list of files containing the #2 mates (filename usually includes
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7190 "_2"), e.g. flyA_2.fq,flyB_2.fq). Sequences specified with this option must
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7191 correspond file-for-file and read-for-read with those specified in <mates1>.
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7192 Reads may be a mix of different lengths.
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7193
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7194 <singles> A comma- or space-separated list of files containing the reads to be aligned (e.g.
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7195 lane1.fq,lane2.fq lane3.fq). Reads may be a mix of different lengths. Bismark will
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7196 produce one mapping result and one report file per input file.
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7197
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7198
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7199 OPTIONS:
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7200
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7201
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7202 Input:
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7203
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7204 -q/--fastq The query input files (specified as <mate1>,<mate2> or <singles>) are FASTQ
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7205 files (usually having extension .fq or .fastq). This is the default. See also
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7206 --solexa-quals.
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7207
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7208 -f/--fasta The query input files (specified as <mate1>,<mate2> or <singles>) are FASTA
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7209 files (usually having extension .fa, .mfa, .fna or similar). All quality values
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7210 are assumed to be 40 on the Phred scale. FASTA files are expected to contain both
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7211 the read name and the sequence on a single line (and not spread over several lines).
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7212
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7213 -s/--skip <int> Skip (i.e. do not align) the first <int> reads or read pairs from the input.
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7214
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7215 -u/--upto <int> Only aligns the first <int> reads or read pairs from the input. Default: no limit.
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7216
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7217 --phred33-quals FASTQ qualities are ASCII chars equal to the Phred quality plus 33. Default: on.
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7218
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7219 --phred64-quals FASTQ qualities are ASCII chars equal to the Phred quality plus 64. Default: off.
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7220
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7221 --solexa-quals Convert FASTQ qualities from solexa-scaled (which can be negative) to phred-scaled
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7222 (which can't). The formula for conversion is:
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7223 phred-qual = 10 * log(1 + 10 ** (solexa-qual/10.0)) / log(10). Used with -q. This
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7224 is usually the right option for use with (unconverted) reads emitted by the GA
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7225 Pipeline versions prior to 1.3. Works only for Bowtie 1. Default: off.
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7226
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7227 --solexa1.3-quals Same as --phred64-quals. This is usually the right option for use with (unconverted)
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7228 reads emitted by GA Pipeline version 1.3 or later. Default: off.
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7229
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7230 --path_to_bowtie The full path </../../> to the Bowtie (1 or 2) installation on your system. If not
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7231 specified it is assumed that Bowtie (1 or 2) is in the PATH.
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7232
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7233
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7234 Alignment:
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7235
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7236 -n/--seedmms <int> The maximum number of mismatches permitted in the "seed", i.e. the first L base pairs
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7237 of the read (where L is set with -l/--seedlen). This may be 0, 1, 2 or 3 and the
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7238 default is 1. This option is only available for Bowtie 1 (for Bowtie 2 see -N).
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7239
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7240 -l/--seedlen The "seed length"; i.e., the number of bases of the high quality end of the read to
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7241 which the -n ceiling applies. The default is 28. Bowtie (and thus Bismark) is faster for
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7242 larger values of -l. This option is only available for Bowtie 1 (for Bowtie 2 see -L).
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7243
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7244 -e/--maqerr <int> Maximum permitted total of quality values at all mismatched read positions throughout
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7245 the entire alignment, not just in the "seed". The default is 70. Like Maq, bowtie rounds
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7246 quality values to the nearest 10 and saturates at 30. This value is not relevant for
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7247 Bowtie 2.
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7248
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7249 --chunkmbs <int> The number of megabytes of memory a given thread is given to store path descriptors in
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7250 --best mode. Best-first search must keep track of many paths at once to ensure it is
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7251 always extending the path with the lowest cumulative cost. Bowtie tries to minimize the
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7252 memory impact of the descriptors, but they can still grow very large in some cases. If
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7253 you receive an error message saying that chunk memory has been exhausted in --best mode,
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7254 try adjusting this parameter up to dedicate more memory to the descriptors. This value
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7255 is not relevant for Bowtie 2. Default: 512.
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7256
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7257 -I/--minins <int> The minimum insert size for valid paired-end alignments. E.g. if -I 60 is specified and
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7258 a paired-end alignment consists of two 20-bp alignments in the appropriate orientation
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7259 with a 20-bp gap between them, that alignment is considered valid (as long as -X is also
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7260 satisfied). A 19-bp gap would not be valid in that case. Default: 0.
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7261
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7262 -X/--maxins <int> The maximum insert size for valid paired-end alignments. E.g. if -X 100 is specified and
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7263 a paired-end alignment consists of two 20-bp alignments in the proper orientation with a
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7264 60-bp gap between them, that alignment is considered valid (as long as -I is also satisfied).
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7265 A 61-bp gap would not be valid in that case. Default: 500.
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7266
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7267
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7268 Bowtie 1 Reporting:
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7269
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7270 -k <2> Due to the way Bismark works Bowtie will report up to 2 valid alignments. This option
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7271 will be used by default.
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7272
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7273 --best Make Bowtie guarantee that reported singleton alignments are "best" in terms of stratum
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7274 (i.e. number of mismatches, or mismatches in the seed in the case of -n mode) and in
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7275 terms of the quality; e.g. a 1-mismatch alignment where the mismatch position has Phred
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7276 quality 40 is preferred over a 2-mismatch alignment where the mismatched positions both
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7277 have Phred quality 10. When --best is not specified, Bowtie may report alignments that
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7278 are sub-optimal in terms of stratum and/or quality (though an effort is made to report
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7279 the best alignment). --best mode also removes all strand bias. Note that --best does not
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7280 affect which alignments are considered "valid" by Bowtie, only which valid alignments
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7281 are reported by Bowtie. Bowtie is about 1-2.5 times slower when --best is specified.
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7282 Default: on.
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7283
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7284 --no_best Disables the --best option which is on by default. This can speed up the alignment process,
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7285 e.g. for testing purposes, but for credible results it is not recommended to disable --best.
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7286
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7287
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7288 Output:
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7289
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7290 --non_directional The sequencing library was constructed in a non strand-specific manner, alignments to all four
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7291 bisulfite strands will be reported. Default: OFF.
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7292
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7293 (The current Illumina protocol for BS-Seq is directional, in which case the strands complementary
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7294 to the original strands are merely theoretical and should not exist in reality. Specifying directional
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7295 alignments (which is the default) will only run 2 alignment threads to the original top (OT)
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7296 or bottom (OB) strands in parallel and report these alignments. This is the recommended option
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7297 for strand-specific libraries).
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7298
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7299 --pbat This option may be used for PBAT-Seq libraries (Post-Bisulfite Adapter Tagging; Kobayashi et al.,
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7300 PLoS Genetics, 2012). This is essentially the exact opposite of alignments in 'directional' mode,
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7301 as it will only launch two alignment threads to the CTOT and CTOB strands instead of the normal OT
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7302 and OB ones. Use this option only if you are certain that your libraries were constructed following
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7303 a PBAT protocol (if you don't know what PBAT-Seq is you should not specify this option). The option
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7304 --pbat works only for single-end and paired-end FastQ files for use with Bowtie1 (uncompressed
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7305 temporary files only).
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7306
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7307 --sam-no-hd Suppress SAM header lines (starting with @). This might be useful when very large input files are
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7308 split up into several smaller files to run concurrently and the output files are to be merged.
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7309
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7310 --quiet Print nothing besides alignments.
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7311
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7312 --vanilla Performs bisulfite mapping with Bowtie 1 and prints the 'old' output (as in Bismark 0.5.X) instead
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7313 of SAM format output.
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7314
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7315 -un/--unmapped Write all reads that could not be aligned to a file in the output directory. Written reads will
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7316 appear as they did in the input, without any translation of quality values that may have
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7317 taken place within Bowtie or Bismark. Paired-end reads will be written to two parallel files with _1
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7318 and _2 inserted in their filenames, i.e. _unmapped_reads_1.txt and _unmapped_reads_2.txt. Reads
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7319 with more than one valid alignment with the same number of lowest mismatches (ambiguous mapping)
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7320 are also written to _unmapped_reads.txt unless the option --ambiguous is specified as well.
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7321
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7322 --ambiguous Write all reads which produce more than one valid alignment with the same number of lowest
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7323 mismatches or other reads that fail to align uniquely to a file in the output directory.
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7324 Written reads will appear as they did in the input, without any of the translation of quality
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7325 values that may have taken place within Bowtie or Bismark. Paired-end reads will be written to two
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7326 parallel files with _1 and _2 inserted in their filenames, i.e. _ambiguous_reads_1.txt and
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7327 _ambiguous_reads_2.txt. These reads are not written to the file specified with --un.
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7328
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7329 -o/--output_dir <dir> Write all output files into this directory. By default the output files will be written into
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7330 the same folder as the input file(s). If the specified folder does not exist, Bismark will attempt
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7331 to create it first. The path to the output folder can be either relative or absolute.
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7332
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7333 --temp_dir <dir> Write temporary files to this directory instead of into the same directory as the input files. If
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7334 the specified folder does not exist, Bismark will attempt to create it first. The path to the
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7335 temporary folder can be either relative or absolute.
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7336
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7337 --non_bs_mm Optionally outputs an extra column specifying the number of non-bisulfite mismatches a read had during the
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7338 alignment step. This option is only available for SAM format. In Bowtie 2 context, this value is
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7339 just the number of actual non-bisulfite mismatches and ignores potential insertions or deletions.
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7340 The format for single-end reads and read 1 of paired-end reads is 'XA:Z:number of mismatches'
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7341 and 'XB:Z:number of mismatches' for read 2 of paired-end reads.
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7342
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7343 --gzip Temporary bisulfite conversion files will be written out in a GZIP compressed form to save disk
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7344 space. This option is available for most alignment modes but is not available for paired-end FastA
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7345 files. This option might be somewhat slower than writing out uncompressed files, but this awaits
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7346 further testing.
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7347
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7348 --bam The output will be written out in BAM format instead of the default SAM format. Bismark will
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7349 attempt to use the path to Samtools that was specified with '--samtools_path', or, if it hasn't
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7350 been specified, attempt to find Samtools in the PATH. If no installation of Samtools can be found,
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7351 the SAM output will be compressed with GZIP instead (yielding a .sam.gz output file).
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7352
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7353 --samtools_path The path to your Samtools installation, e.g. /home/user/samtools/. Does not need to be specified
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7354 explicitly if Samtools is in the PATH already.
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7355
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7356
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7357
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7358 Other:
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7359
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7360 -h/--help Displays this help file.
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7361
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7362 -v/--version Displays version information.
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7363
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7364
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7365 BOWTIE 2 SPECIFIC OPTIONS
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7366
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7367 --bowtie2 Uses Bowtie 2 instead of Bowtie 1. Bismark limits Bowtie 2 to only perform end-to-end
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7368 alignments, i.e. searches for alignments involving all read characters (also called
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7369 untrimmed or unclipped alignments). Bismark assumes that raw sequence data is adapter
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7370 and/or quality trimmed where appropriate. Default: off.
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7371
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7372 Bowtie 2 alignment options:
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7373
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7374 -N <int> Sets the number of mismatches allowed in a seed alignment during multiseed alignment.
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7375 Can be set to 0 or 1. Setting this higher makes alignment slower (often much slower)
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7376 but increases sensitivity. Default: 0. This option is only available for Bowtie 2 (for
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7377 Bowtie 1 see -n).
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7378
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7379 -L <int> Sets the length of the seed substrings to align during multiseed alignment. Smaller values
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7380 make alignment slower but more sensitive. Default: the --sensitive preset of Bowtie 2 is
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7381 used by default, which sets -L to 20. This option is only available for Bowtie 2 (for
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7382 Bowtie 1 see -l).
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7383
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7384 --ignore-quals When calculating a mismatch penalty, always consider the quality value at the mismatched
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7385 position to be the highest possible, regardless of the actual value. I.e. input is treated
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7386 as though all quality values are high. This is also the default behavior when the input
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7387 doesn't specify quality values (e.g. in -f mode). This option is invariable and on by default.
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7388
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7389
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7390 Bowtie 2 paired-end options:
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7391
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7392 --no-mixed This option disables Bowtie 2's behavior to try to find alignments for the individual mates if
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7393 it cannot find a concordant or discordant alignment for a pair. This option is invariable and
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7394 on by default.
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7395
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7396 --no-discordant Normally, Bowtie 2 looks for discordant alignments if it cannot find any concordant alignments.
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7397 A discordant alignment is an alignment where both mates align uniquely, but that does not
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7398 satisfy the paired-end constraints (--fr/--rf/--ff, -I, -X). This option disables that behavior
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7399 and it is on by default.
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7400
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7401
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7402 Bowtie 2 effort options:
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7403
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7404 -D <int> Up to <int> consecutive seed extension attempts can "fail" before Bowtie 2 moves on, using
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7405 the alignments found so far. A seed extension "fails" if it does not yield a new best or a
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7406 new second-best alignment. Default: 15.
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7407
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7408 -R <int> <int> is the maximum number of times Bowtie 2 will "re-seed" reads with repetitive seeds.
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7409 When "re-seeding," Bowtie 2 simply chooses a new set of seeds (same length, same number of
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7410 mismatches allowed) at different offsets and searches for more alignments. A read is considered
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7411 to have repetitive seeds if the total number of seed hits divided by the number of seeds
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7412 that aligned at least once is greater than 300. Default: 2.
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7413
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7414 Bowtie 2 parallelization options:
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7415
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7416
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7417 -p NTHREADS Launch NTHREADS parallel search threads (default: 1). Threads will run on separate processors/cores
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7418 and synchronize when parsing reads and outputting alignments. Searching for alignments is highly
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7419 parallel, and speedup is close to linear. Increasing -p increases Bowtie 2's memory footprint.
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7420 E.g. when aligning to a human genome index, increasing -p from 1 to 8 increases the memory footprint
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7421 by a few hundred megabytes. This option is only available if bowtie is linked with the pthreads
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7422 library (i.e. if BOWTIE_PTHREADS=0 is not specified at build time). In addition, this option will
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7423 automatically use the option '--reorder', which guarantees that output SAM records are printed in
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7424 an order corresponding to the order of the reads in the original input file, even when -p is set
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7425 greater than 1 (Bismark requires the Bowtie 2 output to be this way). Specifying --reorder and
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7426 setting -p greater than 1 causes Bowtie 2 to run somewhat slower and use somewhat more memory than
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7427 if --reorder were not specified. Has no effect if -p is set to 1, since output order will naturally
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7428 correspond to input order in that case.
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7429
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7430 Bowtie 2 Scoring options:
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7431
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7432 --score_min <func> Sets a function governing the minimum alignment score needed for an alignment to be considered
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7433 "valid" (i.e. good enough to report). This is a function of read length. For instance, specifying
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7434 L,0,-0.2 sets the minimum-score function f to f(x) = 0 + -0.2 * x, where x is the read length.
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7435 See also: setting function options at http://bowtie-bio.sourceforge.net/bowtie2. The default is
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7436 L,0,-0.2.
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7437
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7438 --rdg <int1>,<int2> Sets the read gap open (<int1>) and extend (<int2>) penalties. A read gap of length N gets a penalty
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7439 of <int1> + N * <int2>. Default: 5, 3.
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7440
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7441 --rfg <int1>,<int2> Sets the reference gap open (<int1>) and extend (<int2>) penalties. A reference gap of length N gets
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7442 a penalty of <int1> + N * <int2>. Default: 5, 3.
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7443
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7444
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7445 Bowtie 2 Reporting options:
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7446
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7447 -most_valid_alignments <int> This used to be the Bowtie 2 parameter -M. As of Bowtie 2 version 2.0.0 beta7 the option -M is
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7448 deprecated. It will be removed in subsequent versions. What used to be called -M mode is still the
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7449 default mode, but adjusting the -M setting is deprecated. Use the -D and -R options to adjust the
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7450 effort expended to find valid alignments.
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7451
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7452 For reference, this used to be the old (now deprecated) description of -M:
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7453 Bowtie 2 searches for at most <int>+1 distinct, valid alignments for each read. The search terminates when it
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7454 can't find more distinct valid alignments, or when it finds <int>+1 distinct alignments, whichever
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7455 happens first. Only the best alignment is reported. Information from the other alignments is used to
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7456 estimate mapping quality and to set SAM optional fields, such as AS:i and XS:i. Increasing -M makes
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7457 Bowtie 2 slower, but increases the likelihood that it will pick the correct alignment for a read that
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7458 aligns many places. For reads that have more than <int>+1 distinct, valid alignments, Bowtie 2 does not
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7459 guarantee that the alignment reported is the best possible in terms of alignment score. -M is
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7460 always used and its default value is set to 10.
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7461
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7462
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7463 'VANILLA' Bismark OUTPUT:
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7464
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7465 Single-end output format (tab-separated):
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7466
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7467 (1) <seq-ID>
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7468 (2) <read alignment strand>
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7469 (3) <chromosome>
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7470 (4) <start position>
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7471 (5) <end position>
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7472 (6) <observed bisulfite sequence>
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7473 (7) <equivalent genomic sequence>
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7474 (8) <methylation call>
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7475 (9) <read conversion>
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7476 (10) <genome conversion>
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7477 (11) <read quality score (Phred33)>
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7478
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7479
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7480 Paired-end output format (tab-separated):
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7481 (1) <seq-ID>
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7482 (2) <read 1 alignment strand>
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7483 (3) <chromosome>
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7484 (4) <start position>
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7485 (5) <end position>
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7486 (6) <observed bisulfite sequence 1>
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7487 (7) <equivalent genomic sequence 1>
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7488 (8) <methylation call 1>
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7489 (9) <observed bisulfite sequence 2>
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7490 (10) <equivalent genomic sequence 2>
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7491 (11) <methylation call 2>
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7492 (12) <read 1 conversion>
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7493 (13) <genome conversion>
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7494 (14) <read 1 quality score (Phred33)>
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7495 (15) <read 2 quality score (Phred33)>
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7496
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7497
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7498 Bismark SAM OUTPUT (default):
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7499
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7500 (1) QNAME (seq-ID)
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7501 (2) FLAG (this flag tries to take the strand a bisulfite read originated from into account (this is different from ordinary DNA alignment flags!))
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7502 (3) RNAME (chromosome)
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7503 (4) POS (start position)
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7504 (5) MAPQ (always 255)
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7505 (6) CIGAR
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7506 (7) RNEXT
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7507 (8) PNEXT
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7508 (9) TLEN
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7509 (10) SEQ
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7510 (11) QUAL (Phred33 scale)
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7511 (12) NM-tag (edit distance to the reference)
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7512 (13) XX-tag (base-by-base mismatches to the reference. This does not include indels)
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7513 (14) XM-tag (methylation call string)
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7514 (15) XR-tag (read conversion state for the alignment)
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7515 (16) XG-tag (genome conversion state for the alignment)
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7516 (17) XA/XB-tag (non-bisulfite mismatches) (optional!)
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7517
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7518 Each read of paired-end alignments is written out in a separate line in the above format.
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7519
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7520
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7521 Last edited on 10 May 2013.
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7522
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7523 HOW_TO
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7524 }