annotate bismark_wrapper/bismark @ 1:183de9d00131 draft

add indices.loc files
author bjoern-gruening
date Tue, 25 Dec 2012 05:52:28 -0500
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
1
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1 #!/usr/bin/perl --
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2 use strict;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3 use warnings;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
4 use IO::Handle;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5 use Cwd;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6 $|++;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
7 use Getopt::Long;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
8
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
9
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
10 ## This program is Copyright (C) 2010-12, Felix Krueger (felix.krueger@babraham.ac.uk)
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
11
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
12 ## This program is free software: you can redistribute it and/or modify
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
13 ## it under the terms of the GNU General Public License as published by
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
14 ## the Free Software Foundation, either version 3 of the License, or
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
15 ## (at your option) any later version.
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
16
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
17 ## This program is distributed in the hope that it will be useful,
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
18 ## but WITHOUT ANY WARRANTY; without even the implied warranty of
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
19 ## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
20 ## GNU General Public License for more details.
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
21
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
22 ## You should have received a copy of the GNU General Public License
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
23 ## along with this program. If not, see <http://www.gnu.org/licenses/>.
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
24
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
25
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
### Remember where we started; the main loop changes back into this directory
### before each input file is processed.
my $parent_dir      = getcwd;
my $bismark_version = 'v0.7.7';

### The command line is captured verbatim here, i.e. before the option rewrite below.
my $command_line    = join (" ",@ARGV);

### Getopt::Long cannot cope with the '.' in --solexa1.3-quals, so that option is
### rewritten in place to --phred64-quals before the command line is processed.
for (@ARGV){
  $_ = '--phred64-quals' if $_ eq '--solexa1.3-quals';
}

my @filenames; # will be populated by processing the command line

my (
    $genome_folder,
    $CT_index_basename,
    $GA_index_basename,
    $path_to_bowtie,
    $sequence_file_format,
    $bowtie_options,
    $directional,
    $unmapped,
    $ambiguous,
    $phred64,
    $solexa,
    $output_dir,
    $bowtie2,
    $vanilla,
    $sam_no_hd,
    $skip,
    $upto,
    $temp_dir,
   ) = process_command_line();

my @fhs;         # stores alignment process names, bisulfite index location, bowtie filehandles and the number of times sequences produced an alignment
my %chromosomes; # stores the chromosome sequences of the reference genome
my %counting;    # counting various events

my $seqID_contains_tabs; # reset to 0 for every input file in the main loop
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
45
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
### Main driver loop: every element of @filenames is one alignment job. An element
### containing a comma ("mate1,mate2") is run as a paired-end job, anything else as a
### single-end job. FastA and FastQ input follow the same steps and only differ in
### which conversion/alignment subroutines are called, so those are selected once via
### code references instead of duplicating each branch.
foreach my $filename (@filenames){

  chdir $parent_dir or die "Unable to move to initial working directory $!\n";
  ### resetting the counting hash and fhs
  reset_counters_and_fhs($filename);
  $seqID_contains_tabs = 0;

  my $is_fastA = ($sequence_file_format eq 'FASTA');

  ### PAIRED-END ALIGNMENTS
  if (index($filename,',') >= 0){
    my ($C_to_T_infile_1,$G_to_A_infile_1); # to be made from mate1 file
    ### additional variables only for paired-end alignments
    my ($C_to_T_infile_2,$G_to_A_infile_2); # to be made from mate2 file

    $fhs[0]->{name} = 'CTread1GAread2CTgenome';
    $fhs[1]->{name} = 'GAread1CTread2GAgenome';
    $fhs[2]->{name} = 'GAread1CTread2CTgenome';
    $fhs[3]->{name} = 'CTread1GAread2GAgenome';

    print "\nPaired-end alignments will be performed\n",'='x39,"\n\n";

    my ($filename_1,$filename_2) = (split (/,/,$filename));
    print "The provided filenames for paired-end alignments are $filename_1 and $filename_2\n";

    ### selecting the format-specific conversion and alignment routines
    my ($transform,$align);
    if ($is_fastA){
      print "Input files are in FastA format\n";
      $transform = \&biTransformFastAFiles_paired_end;
      $align     = $bowtie2 ? \&paired_end_align_fragments_to_bisulfite_genome_fastA_bowtie2
                            : \&paired_end_align_fragments_to_bisulfite_genome_fastA;
    }
    else{
      print "Input files are in FastQ format\n";
      $transform = \&biTransformFastQFiles_paired_end;
      $align     = $bowtie2 ? \&paired_end_align_fragments_to_bisulfite_genome_fastQ_bowtie2
                            : \&paired_end_align_fragments_to_bisulfite_genome_fastQ;
    }

    ### the read number is passed along with the file name
    if ($directional){
      ### only the first returned file name is kept per mate; the first value returned
      ### for mate 2 is stored as its G->A file, exactly as before
      ($C_to_T_infile_1) = $transform->($filename_1,1);
      ($G_to_A_infile_2) = $transform->($filename_2,2);
    }
    else{
      ($C_to_T_infile_1,$G_to_A_infile_1) = $transform->($filename_1,1);
      ($C_to_T_infile_2,$G_to_A_infile_2) = $transform->($filename_2,2);
    }

    ### In directional mode $G_to_A_infile_1 and $C_to_T_infile_2 are still undef at this
    ### point, so instances 1 and 2 receive undef input files (as they did previously).
    $fhs[0]->{inputfile_1} = $C_to_T_infile_1;
    $fhs[0]->{inputfile_2} = $G_to_A_infile_2;
    $fhs[1]->{inputfile_1} = $G_to_A_infile_1;
    $fhs[1]->{inputfile_2} = $C_to_T_infile_2;
    $fhs[2]->{inputfile_1} = $G_to_A_infile_1;
    $fhs[2]->{inputfile_2} = $C_to_T_infile_2;
    $fhs[3]->{inputfile_1} = $C_to_T_infile_1;
    $fhs[3]->{inputfile_2} = $G_to_A_infile_2;

    $align->($C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2);

    start_methylation_call_procedure_paired_ends($filename_1,$filename_2,$C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2);
  }

  ### Else we are performing SINGLE-END ALIGNMENTS
  else{
    print "\nSingle-end alignments will be performed\n",'='x39,"\n\n";
    ### Initialising bisulfite conversion filenames
    my ($C_to_T_infile,$G_to_A_infile);

    ### selecting the format-specific conversion and alignment routines
    my ($transform,$align);
    if ($is_fastA){
      print "Input file is in FastA format\n";
      $transform = \&biTransformFastAFiles;
      $align     = $bowtie2 ? \&single_end_align_fragments_to_bisulfite_genome_fastA_bowtie2
                            : \&single_end_align_fragments_to_bisulfite_genome_fastA;
    }
    else{
      print "Input file is in FastQ format\n";
      $transform = \&biTransformFastQFiles;
      $align     = $bowtie2 ? \&single_end_align_fragments_to_bisulfite_genome_fastQ_bowtie2
                            : \&single_end_align_fragments_to_bisulfite_genome_fastQ;
    }

    if ($directional){
      ### entries 2 and 3 of @fhs are deliberately left untouched in directional mode
      ($C_to_T_infile) = $transform->($filename);
      $fhs[0]->{inputfile} = $fhs[1]->{inputfile} = $C_to_T_infile;
    }
    else{
      ($C_to_T_infile,$G_to_A_infile) = $transform->($filename);
      $fhs[0]->{inputfile} = $fhs[1]->{inputfile} = $C_to_T_infile;
      $fhs[2]->{inputfile} = $fhs[3]->{inputfile} = $G_to_A_infile;
    }

    ### Creating 4 different bowtie filehandles and storing the first entry
    $align->($C_to_T_infile,$G_to_A_infile);

    start_methylation_call_procedure_single_ends($filename,$C_to_T_infile,$G_to_A_infile);
  }
}
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
204
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
### Prepares the output for one single-end input file and starts the methylation call.
###
### Arguments:
###   $sequence_file  - the original sequence file (may include a directory part)
###   $C_to_T_infile  - name of the C->T converted read file
###   $G_to_A_infile  - name of the G->A converted read file (the caller leaves this
###                     undef when it only produced the C->T conversion)
###
### Opens the package-level filehandles OUT (alignments), REPORT (mapping report) and,
### if requested, UNMAPPED and AMBIG; all are created inside $output_dir and named after
### the input file without its directory part. NOTE(review): these handles are presumably
### written to and closed by the methylation-calling subs - confirm before changing them.
### Dies if any of the output files cannot be opened.
sub start_methylation_call_procedure_single_ends {
  my ($sequence_file,$C_to_T_infile,$G_to_A_infile) = @_;

  ### output file names are based on the file name only, i.e. everything after the last '/'
  my $filename = $sequence_file;
  if ($sequence_file =~ /\//){
    ($filename) = $sequence_file =~ m/.*\/(.*)$/;
  }

  ### printing all alignments to a results file
  my $outfile;
  if ($bowtie2){ # SAM format is the default for Bowtie 2
    $outfile = $filename.'_bt2_bismark.sam';
  }
  elsif ($vanilla){ # vanilla custom Bismark output single-end output (like Bismark versions 0.5.X)
    $outfile = $filename.'_bismark.txt';
  }
  else{ # SAM is the default output
    $outfile = $filename.'_bismark.sam';
  }
  print "Writing bisulfite mapping results to $output_dir$outfile\n\n";
  open (OUT,'>',"$output_dir$outfile") or die "Failed to write to $output_dir$outfile: $!\n";
  if ($vanilla){
    print OUT "Bismark version: $bismark_version\n";
  }

  ### printing alignment and methylation call summary to a report file
  my $reportfile = $filename.($bowtie2 ? '_bt2_Bismark_mapping_report.txt' : '_Bismark_mapping_report.txt');

  open (REPORT,'>',"$output_dir$reportfile") or die "Failed to write to $output_dir$reportfile: $!\n";
  print REPORT "Bismark report for: $sequence_file (version: $bismark_version)\n";

  if ($unmapped){
    my $unmapped_file = $filename.'_unmapped_reads.txt';
    open (UNMAPPED,'>',"$output_dir$unmapped_file") or die "Failed to write to $output_dir$unmapped_file: $!\n";
    print "Unmapped sequences will be written to $output_dir$unmapped_file\n";
  }
  if ($ambiguous){
    my $ambiguous_file = $filename.'_ambiguous_reads.txt';
    open (AMBIG,'>',"$output_dir$ambiguous_file") or die "Failed to write to $output_dir$ambiguous_file: $!\n";
    print "Ambiguously mapping sequences will be written to $output_dir$ambiguous_file\n";
  }

  if ($directional){
    print REPORT "Option '--directional' specified: alignments to complementary strands will be ignored (i.e. not performed!)\n";
  }
  print REPORT "Bowtie was run against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n";


  ### if 2 or more files are provided we can hold the genome in memory and don't need to read it in a second time
  unless (%chromosomes){
    my $cwd = getcwd; # storing the path of the current working directory
    print "Current working directory is: $cwd\n\n";
    read_genome_into_memory($cwd);
  }

  ### a SAM header is written unless vanilla output or --sam-no-hd style suppression is active
  unless ($vanilla or $sam_no_hd){
    generate_SAM_header();
  }

  ### Input file is in FastA format
  if ($sequence_file_format eq 'FASTA'){
    process_single_end_fastA_file_for_methylation_call($sequence_file,$C_to_T_infile,$G_to_A_infile);
  }
  ### Input file is in FastQ format
  else{
    process_single_end_fastQ_file_for_methylation_call($sequence_file,$C_to_T_infile,$G_to_A_infile);
  }
}
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
285
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
### Prepares a paired-end methylation calling run:
###  - opens the alignment output file (OUT) and the mapping report (REPORT) in $output_dir,
###  - opens the unmapped (UNMAPPED_1/2) and ambiguous (AMBIG_1/2) read files if requested,
###  - reads the genome into memory unless %chromosomes is already populated,
###  - writes the SAM header unless --vanilla or --sam-no-hd was specified,
###  - and finally hands over to the FastA or FastQ paired-end methylation caller.
###
### Arguments: the two original sequence files, followed by the C->T and G->A converted
### files for read 1 and read 2. The converted file names are passed through unchanged
### to the methylation callers.
sub start_methylation_call_procedure_paired_ends {
  my ($sequence_file_1,$sequence_file_2,$C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2) = @_;

  ### Output files are named after the input files with any leading path removed.
  ### If the path pattern does not match we fall back to the name as it was supplied
  my ($filename_1) = $sequence_file_1 =~ m/.*\/(.*)$/;
  $filename_1 = $sequence_file_1 unless defined $filename_1;

  my ($filename_2) = $sequence_file_2 =~ m/.*\/(.*)$/;
  $filename_2 = $sequence_file_2 unless defined $filename_2;

  ### printing all alignments to a results file
  my $outfile = $filename_1;
  if ($bowtie2){ # SAM format is the default Bowtie 2 output
    $outfile .= '_bismark_bt2_pe.sam';
  }
  elsif ($vanilla){ # vanilla custom Bismark paired-end output (like Bismark versions 0.5.X)
    $outfile .= '_bismark_pe.txt';
  }
  else{ # SAM format is the default Bowtie 1 output
    $outfile .= '_bismark_pe.sam';
  }

  print "Writing bisulfite mapping results to $outfile\n\n";
  ### error messages report the full path that was actually opened, not just the file name
  open (OUT,'>',"$output_dir$outfile") or die "Failed to write to $output_dir$outfile: $!";
  if ($vanilla){
    print OUT "Bismark version: $bismark_version\n";
  }

  ### printing alignment and methylation call summary to a report file
  my $reportfile = $filename_1;
  if ($bowtie2){
    $reportfile .= '_Bismark_bt2_paired-end_mapping_report.txt';
  }
  else{
    $reportfile .= '_Bismark_paired-end_mapping_report.txt';
  }

  open (REPORT,'>',"$output_dir$reportfile") or die "Failed to write to $output_dir$reportfile: $!\n";
  print REPORT "Bismark report for: $sequence_file_1 and $sequence_file_2 (version: $bismark_version)\n";
  print REPORT "Bowtie was run against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n";

  ### Unmapped read output
  if ($unmapped){
    my $unmapped_1 = $filename_1.'_unmapped_reads_1.txt';
    my $unmapped_2 = $filename_2.'_unmapped_reads_2.txt';
    open (UNMAPPED_1,'>',"$output_dir$unmapped_1") or die "Failed to write to $output_dir$unmapped_1: $!\n";
    open (UNMAPPED_2,'>',"$output_dir$unmapped_2") or die "Failed to write to $output_dir$unmapped_2: $!\n";
    print "Unmapped sequences will be written to $unmapped_1 and $unmapped_2\n";
  }

  ### Ambiguously mapping read output
  if ($ambiguous){
    my $amb_1 = $filename_1.'_ambiguous_reads_1.txt';
    my $amb_2 = $filename_2.'_ambiguous_reads_2.txt';
    open (AMBIG_1,'>',"$output_dir$amb_1") or die "Failed to write to $output_dir$amb_1: $!\n";
    open (AMBIG_2,'>',"$output_dir$amb_2") or die "Failed to write to $output_dir$amb_2: $!\n";
    print "Ambiguously mapping sequences will be written to $amb_1 and $amb_2\n";
  }

  if ($directional){
    print REPORT "Option '--directional' specified: alignments to complementary strands will be ignored (i.e. not performed)\n";
  }

  ### if 2 or more files are provided we might still hold the genome in memory and don't need to read it in a second time
  unless (%chromosomes){
    my $cwd = getcwd; # storing the path of the current working directory
    print "Current working directory is: $cwd\n\n";
    read_genome_into_memory($cwd);
  }

  unless ($vanilla or $sam_no_hd){
    generate_SAM_header();
  }

  ### Input files are in FastA format
  if ($sequence_file_format eq 'FASTA'){
    process_fastA_files_for_paired_end_methylation_calls($sequence_file_1,$sequence_file_2,$C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2);
  }
  ### Input files are in FastQ format
  else{
    process_fastQ_files_for_paired_end_methylation_calls($sequence_file_1,$sequence_file_2,$C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2);
  }
}
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
384
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
### Finishes a single-end run: deletes the temporary converted read file(s) in $temp_dir and
### prints the final alignment and cytosine methylation summary to the screen (STDOUT/STDERR)
### as well as to the REPORT filehandle, which is expected to be open for writing already.
###
### Arguments: the names of the temporary C->T and G->A converted read files (relative to $temp_dir).
### All numbers reported are taken from the file-level %counting hash.
sub print_final_analysis_report_single_end{
  my ($C_to_T_infile,$G_to_A_infile) = @_;

  ### All sequences from the original sequence file have been analysed now
  ### deleting temporary C->T or G->A infiles (in directional mode only the C->T file is removed here)
  if ($directional){
    my $deletion_successful = unlink "$temp_dir$C_to_T_infile";
    if ($deletion_successful == 1){
      warn "\nSuccessfully deleted the temporary file $temp_dir$C_to_T_infile\n\n";
    }
    else{
      ### reporting the full path, consistent with the success message above
      warn "Could not delete temporary file $temp_dir$C_to_T_infile properly $!\n";
    }
  }
  else{
    my $deletion_successful = unlink "$temp_dir$C_to_T_infile","$temp_dir$G_to_A_infile";
    if ($deletion_successful == 2){
      warn "\nSuccessfully deleted the temporary files $temp_dir$C_to_T_infile and $temp_dir$G_to_A_infile\n\n";
    }
    else{
      warn "Could not delete temporary files properly $!\n";
    }
  }

  ### printing a final report for the alignment procedure
  my $alignment_heading = "Final Alignment report\n".('=' x 22)."\n";
  print REPORT $alignment_heading;
  print $alignment_heading;

  ### printing a final report for the methylation call procedure
  my $sequences_analysed = "Sequences analysed in total:\t$counting{sequences_count}\n";
  warn $sequences_analysed;
  print REPORT $sequences_analysed;

  ### mapping efficiency is reported as 0 if no sequences were analysed at all (avoids a division by zero)
  my $percent_alignable_sequences = 0;
  if ($counting{sequences_count} != 0){
    $percent_alignable_sequences = sprintf ("%.1f",$counting{unique_best_alignment_count}*100/$counting{sequences_count});
  }

  ### the on-screen version carries one additional trailing newline compared to the report file
  my $mapping_summary = "Number of alignments with a unique best hit from the different alignments:\t$counting{unique_best_alignment_count}\nMapping efficiency:\t${percent_alignable_sequences}%\n";
  warn $mapping_summary,"\n";
  print REPORT $mapping_summary;

  ### alignment statistics, identical on STDOUT and in the report file
  my $alignment_stats = join ('',
    "Sequences with no alignments under any condition:\t$counting{no_single_alignment_found}\n",
    "Sequences did not map uniquely:\t$counting{unsuitable_sequence_count}\n",
    "Sequences which were discarded because genomic sequence could not be extracted:\t$counting{genomic_sequence_could_not_be_extracted_count}\n\n",
    "Number of sequences with unique best (first) alignment came from the bowtie output:\n",
    join ("\n","CT/CT:\t$counting{CT_CT_count}\t((converted) top strand)","CT/GA:\t$counting{CT_GA_count}\t((converted) bottom strand)","GA/CT:\t$counting{GA_CT_count}\t(complementary to (converted) top strand)","GA/GA:\t$counting{GA_GA_count}\t(complementary to (converted) bottom strand)"),
    "\n\n",
  );
  print $alignment_stats;
  print REPORT $alignment_stats;

  if ($directional){
    my $rejected = "Number of alignments to (merely theoretical) complementary strands being rejected in total:\t$counting{alignments_rejected_count}\n\n";
    print $rejected;
    print REPORT $rejected;
  }

  ### detailed information about Cs analysed
  my $methylation_heading = "Final Cytosine Methylation Report\n".('=' x 33)."\n";
  my $total_number_of_C = $counting{total_meCHH_count}+$counting{total_meCHG_count}+$counting{total_meCpG_count}+$counting{total_unmethylated_CHH_count}+$counting{total_unmethylated_CHG_count}+$counting{total_unmethylated_CpG_count};

  warn $methylation_heading;
  warn "Total number of C's analysed:\t$total_number_of_C\n\n";
  warn "Total methylated C's in CpG context:\t$counting{total_meCpG_count}\n";
  warn "Total methylated C's in CHG context:\t$counting{total_meCHG_count}\n";
  warn "Total methylated C's in CHH context:\t$counting{total_meCHH_count}\n\n";
  warn "Total C to T conversions in CpG context:\t$counting{total_unmethylated_CpG_count}\n";
  warn "Total C to T conversions in CHG context:\t$counting{total_unmethylated_CHG_count}\n";
  warn "Total C to T conversions in CHH context:\t$counting{total_unmethylated_CHH_count}\n\n";

  print REPORT $methylation_heading;
  print REPORT "Total number of C's analysed:\t$total_number_of_C\n\n";
  ### NOTE: the blank following the tab in the next line is part of the established report format and is kept as is
  print REPORT "Total methylated C's in CpG context:\t $counting{total_meCpG_count}\n";
  print REPORT "Total methylated C's in CHG context:\t$counting{total_meCHG_count}\n";
  print REPORT "Total methylated C's in CHH context:\t$counting{total_meCHH_count}\n\n";
  print REPORT "Total C to T conversions in CpG context:\t$counting{total_unmethylated_CpG_count}\n";
  print REPORT "Total C to T conversions in CHG context:\t$counting{total_unmethylated_CHG_count}\n";
  print REPORT "Total C to T conversions in CHH context:\t$counting{total_unmethylated_CHH_count}\n\n";

  ### printing the methylation percentage for each cytosine context (CpG, CHG, CHH in this order) if applicable.
  ### A percentage can only be determined if at least one C was seen in the respective context
  foreach my $context (qw(CpG CHG CHH)){
    my $methylated   = $counting{"total_me${context}_count"};
    my $unmethylated = $counting{"total_unmethylated_${context}_count"};

    ### the last context closes the report section with two additional blank lines
    my $line_end = ($context eq 'CHH') ? "\n\n\n" : "\n";

    my $message;
    if (($methylated+$unmethylated) > 0){
      my $percent_methylated = sprintf("%.1f",100*$methylated/($methylated+$unmethylated));
      $message = "C methylated in $context context:\t${percent_methylated}%$line_end";
    }
    else{
      $message = "Can't determine percentage of methylated Cs in $context context if value was 0$line_end";
    }

    warn $message;
    print REPORT $message;
  }

  if ($seqID_contains_tabs){
    my $tab_warning = "The sequence IDs in the provided file contain tab-stops which might prevent sequence alignments. If this happened, please replace all tab characters within the seqID field with spaces before running Bismark.\n\n";
    warn $tab_warning;
    print REPORT $tab_warning;
  }
}
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
528
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
sub print_final_analysis_report_paired_ends{
  my ($C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2) = @_;
  ### Final step of a paired-end run: removes the temporary converted read files from $temp_dir and writes
  ### the alignment summary followed by the cytosine methylation summary.
  ### Arguments: the names (relative to $temp_dir) of the C->T and G->A converted files for read 1 and read 2.
  ### Every figure is written to the REPORT filehandle; on screen, headings and the methylation figures are
  ### emitted with warn, whereas the alignment statistics are emitted with print.
  ### Reads the file-level %counting, $directional and $temp_dir.

  ### All sequences have been analysed by now, so the temporary infiles are no longer needed. In directional
  ### mode only the C->T version of read 1 and the G->A version of read 2 are removed
  if ($directional){
    my $files_removed = unlink ("$temp_dir$C_to_T_infile_1","$temp_dir$G_to_A_infile_2");
    my $cleanup_message = ($files_removed == 2)
      ? "\nSuccessfully deleted the temporary files $temp_dir$C_to_T_infile_1 and $temp_dir$G_to_A_infile_2\n\n"
      : "Could not delete temporary files $temp_dir$C_to_T_infile_1 and $temp_dir$G_to_A_infile_2 properly: $!\n";
    warn $cleanup_message;
  }
  else{
    my $files_removed = unlink ("$temp_dir$C_to_T_infile_1","$temp_dir$G_to_A_infile_1","$temp_dir$C_to_T_infile_2","$temp_dir$G_to_A_infile_2");
    my $cleanup_message = ($files_removed == 4)
      ? "\nSuccessfully deleted the temporary files $temp_dir$C_to_T_infile_1, $temp_dir$G_to_A_infile_1, $temp_dir$C_to_T_infile_2 and $temp_dir$G_to_A_infile_2\n\n"
      : "Could not delete temporary files properly: $!\n";
    warn $cleanup_message;
  }

  ### heading of the alignment part of the report
  my @alignment_heading = ("Final Alignment report\n",'='x22,"\n");
  warn @alignment_heading;
  print REPORT @alignment_heading;
  #  foreach my $index (0..$#fhs){
  #    print "$fhs[$index]->{name}\n";
  #    print "$fhs[$index]->{seen}\talignments on the correct strand in total\n";
  #    print "$fhs[$index]->{wrong_strand}\talignments were discarded (nonsensical alignments)\n\n";
  #  }

  ### number of read pairs that went into the analysis
  my $pairs_analysed = "Sequence pairs analysed in total:\t$counting{sequences_count}\n";
  warn $pairs_analysed;
  print REPORT $pairs_analysed;

  ### mapping efficiency; reported as 0 if no pair was analysed at all (avoids a division by zero)
  my $percent_alignable_sequence_pairs = 0;
  unless ($counting{sequences_count} == 0){
    $percent_alignable_sequence_pairs = sprintf ("%.1f",$counting{unique_best_alignment_count}*100/$counting{sequences_count});
  }
  # the on-screen and the REPORT version of this line end differently, so both are spelled out
  print "Number of paired-end alignments with a unique best hit:\t$counting{unique_best_alignment_count}\nMapping efficiency:\t${percent_alignable_sequence_pairs}%\n\n";
  print REPORT "Number of paired-end alignments with a unique best hit:\t$counting{unique_best_alignment_count}\nMapping efficiency:\t${percent_alignable_sequence_pairs}% \n";

  ### pairs without a usable alignment, and the strand origin of the unique best alignments
  my @alignment_details = (
    "Sequence pairs with no alignments under any condition:\t$counting{no_single_alignment_found}\n",
    "Sequence pairs did not map uniquely:\t$counting{unsuitable_sequence_count}\n",
    "Sequence pairs which were discarded because genomic sequence could not be extracted:\t$counting{genomic_sequence_could_not_be_extracted_count}\n\n",
    "Number of sequence pairs with unique best (first) alignment came from the bowtie output:\n",
  );
  my @strand_origin = (join ("\n","CT/GA/CT:\t$counting{CT_GA_CT_count}\t((converted) top strand)","GA/CT/CT:\t$counting{GA_CT_CT_count}\t(complementary to (converted) top strand)","GA/CT/GA:\t$counting{GA_CT_GA_count}\t(complementary to (converted) bottom strand)","CT/GA/GA:\t$counting{CT_GA_GA_count}\t((converted) bottom strand)"),"\n\n");

  foreach my $detail (@alignment_details){
    print $detail;
  }
  print @strand_origin;

  foreach my $detail (@alignment_details){
    print REPORT $detail;
  }
  print REPORT @strand_origin;

  ### rejected alignments are only reported for directional libraries
  if ($directional){
    my $rejected = "Number of alignments to (merely theoretical) complementary strands being rejected in total:\t$counting{alignments_rejected_count}\n\n";
    print $rejected;
    print REPORT $rejected;
  }

  ### heading of the methylation part of the report
  my @methylation_heading = ("Final Cytosine Methylation Report\n",'='x33,"\n");
  warn @methylation_heading;
  print REPORT @methylation_heading;

  ### detailed information about Cs analysed: every methylated and unmethylated call in all three contexts
  my $total_number_of_C = $counting{total_meCHG_count}+ $counting{total_meCHH_count}+$counting{total_meCpG_count}+$counting{total_unmethylated_CHG_count}+$counting{total_unmethylated_CHH_count}+$counting{total_unmethylated_CpG_count};

  my @cytosine_counts = (
    "Total number of C's analysed:\t$total_number_of_C\n\n",
    "Total methylated C's in CpG context:\t$counting{total_meCpG_count}\n",
    "Total methylated C's in CHG context:\t$counting{total_meCHG_count}\n",
    "Total methylated C's in CHH context:\t$counting{total_meCHH_count}\n\n",
    "Total C to T conversions in CpG context:\t$counting{total_unmethylated_CpG_count}\n",
    "Total C to T conversions in CHG context:\t$counting{total_unmethylated_CHG_count}\n",
    "Total C to T conversions in CHH context:\t$counting{total_unmethylated_CHH_count}\n\n",
  );
  foreach my $count_line (@cytosine_counts){
    warn $count_line;
  }
  foreach my $count_line (@cytosine_counts){
    print REPORT $count_line;
  }

  ### percentage of methylated calls per context; left undefined if the context was never observed
  my $CHG_calls = $counting{total_meCHG_count}+$counting{total_unmethylated_CHG_count};
  my $percent_meCHG = ($CHG_calls > 0) ? sprintf("%.1f",100*$counting{total_meCHG_count}/$CHG_calls) : undef;

  my $CHH_calls = $counting{total_meCHH_count}+$counting{total_unmethylated_CHH_count};
  my $percent_meCHH = ($CHH_calls > 0) ? sprintf("%.1f",100*$counting{total_meCHH_count}/$CHH_calls) : undef;

  my $CpG_calls = $counting{total_meCpG_count}+$counting{total_unmethylated_CpG_count};
  my $percent_meCpG = ($CpG_calls > 0) ? sprintf("%.1f",100*$counting{total_meCpG_count}/$CpG_calls) : undef;

  ### printing methylated CpG percentage if applicable
  my $CpG_summary = $percent_meCpG
    ? "C methylated in CpG context:\t${percent_meCpG}%\n"
    : "Can't determine percentage of methylated Cs in CpG context if value was 0\n";
  warn $CpG_summary;
  print REPORT $CpG_summary;

  ### printing methylated C percentage in CHG context if applicable
  my $CHG_summary = $percent_meCHG
    ? "C methylated in CHG context:\t${percent_meCHG}%\n"
    : "Can't determine percentage of methylated Cs in CHG context if value was 0\n";
  warn $CHG_summary;
  print REPORT $CHG_summary;

  ### printing methylated C percentage in CHH context if applicable
  my $CHH_summary = $percent_meCHH
    ? "C methylated in CHH context:\t${percent_meCHH}%\n\n\n"
    : "Can't determine percentage of methylated Cs in CHH context if value was 0\n\n\n";
  warn $CHH_summary;
  print REPORT $CHH_summary;

}
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
659
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
sub process_single_end_fastA_file_for_methylation_call{
  my ($sequence_file,$C_to_T_infile,$G_to_A_infile) = @_;
  ### this is a FastA sequence file; we need the actual sequence to compare it against the genomic sequence in order to make a methylation call.
  ### Now reading in the sequence file sequence by sequence and see if the current sequence was mapped to one (or both) of the converted genomes in either
  ### the C->T or G->A version
  ###
  ### Arguments: $sequence_file                  - FastA file holding the original reads (plain text, or gzipped if the name ends in .gz)
  ###            $C_to_T_infile, $G_to_A_infile  - names of the temporary converted read files; only handed on to the final report routine
  ### Records are read as pairs of lines (header line, sequence line). Honours the file-level $skip and $upto, counts every
  ### processed read in $counting{sequences_count} and finishes by calling print_final_analysis_report_single_end.

  ### Opening the infile with the mode given explicitly, so that characters in the file name can never alter how it is opened
  my $in_fh;
  if ($sequence_file =~ /\.gz$/){
    ### gzipped version of the infile. The list form hands the file name to zcat as a single argument without involving a shell
    open ($in_fh,'-|','zcat',$sequence_file) or die "Failed to read gzipped sequence file '$sequence_file' with zcat: $!";
  }
  else{
    open ($in_fh,'<',$sequence_file) or die "Failed to open sequence file '$sequence_file': $!";
  }

  my $count = 0;

  warn "\nReading in the sequence file $sequence_file\n";
  while (1) {
    # last if ($counting{sequences_count} > 100);
    my $identifier = <$in_fh>;
    my $sequence = <$in_fh>;
    ### readline returns undef at the end of the file; testing definedness keeps a final line consisting only of '0' from being mistaken for EOF
    last unless (defined $identifier and defined $sequence);

    $identifier = fix_IDs($identifier); # this is to avoid problems with truncated read ID when they contain white spaces

    ++$count;

    ### the first $skip records are read but not processed
    if ($skip){
      next unless ($count > $skip);
    }
    ### processing stops once record number $upto has been handled
    if ($upto){
      last if ($count > $upto);
    }

    $counting{sequences_count}++;
    if ($counting{sequences_count}%100000==0) {
      warn "Processed $counting{sequences_count} sequences so far\n";
    }
    chomp $sequence;
    chomp $identifier;

    $identifier =~ s/^>//; # deletes the > at the beginning of FastA headers

    my $return;
    if ($bowtie2){
      $return = check_bowtie_results_single_end_bowtie2 (uc($sequence),$identifier);
    }
    else{
      $return = check_bowtie_results_single_end(uc($sequence),$identifier); # default Bowtie 1
    }

    ### normalising an undefined or empty return value so that the numeric comparisons below are warning-free
    $return ||= 0;

    # print the sequence to ambiguous.out if --ambiguous was specified
    if ($ambiguous and $return == 2){
      print AMBIG ">$identifier\n";
      print AMBIG "$sequence\n";
    }

    # print the sequence to <unmapped.out> file if --un was specified
    elsif ($unmapped and $return == 1){
      print UNMAPPED ">$identifier\n";
      print UNMAPPED "$sequence\n";
    }
  }

  ### Closing the infile here also reaps the zcat child process. The return value is deliberately not checked: when the loop
  ### was left early because of $upto, zcat is terminated by a broken pipe and close would report a failure
  close $in_fh;

  print "Processed $counting{sequences_count} sequences in total\n\n";

  print_final_analysis_report_single_end($C_to_T_infile,$G_to_A_infile);

}
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
732
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
sub process_single_end_fastQ_file_for_methylation_call{
  my ($sequence_file,$C_to_T_infile,$G_to_A_infile) = @_;
  ### this is the Illumina sequence file; we need the actual sequence to compare it against the genomic sequence in order to make a methylation call.
  ### Now reading in the sequence file sequence by sequence and see if the current sequence was mapped to one (or both) of the converted genomes in either
  ### the C->T or G->A version
  ###
  ### Arguments: $sequence_file                  - FastQ file holding the original reads (plain text, or gzipped if the name ends in .gz)
  ###            $C_to_T_infile, $G_to_A_infile  - names of the temporary converted read files; only handed on to the final report routine
  ### Records are read as groups of four lines (header, sequence, second header, qualities). Honours the file-level $skip and $upto,
  ### counts every processed read in $counting{sequences_count} and finishes by calling print_final_analysis_report_single_end.

  ### Opening the infile with the mode given explicitly, so that characters in the file name can never alter how it is opened
  my $in_fh;
  if ($sequence_file =~ /\.gz$/){
    ### gzipped version of the infile. The list form hands the file name to zcat as a single argument without involving a shell
    open ($in_fh,'-|','zcat',$sequence_file) or die "Failed to read gzipped sequence file '$sequence_file' with zcat: $!";
  }
  else{
    open ($in_fh,'<',$sequence_file) or die "Failed to open sequence file '$sequence_file': $!";
  }

  my $count = 0;

  warn "\nReading in the sequence file $sequence_file\n";
  while (1) {
    my $identifier = <$in_fh>;
    my $sequence = <$in_fh>;
    my $identifier_2 = <$in_fh>;
    my $quality_value = <$in_fh>;
    ### readline returns undef at the end of the file; testing definedness instead of truth keeps a final quality line
    ### consisting only of the character '0' (without a trailing newline) from silently dropping the last read
    last unless (defined $identifier and defined $sequence and defined $identifier_2 and defined $quality_value);

    $identifier = fix_IDs($identifier); # this is to avoid problems with truncated read ID when they contain white spaces

    ++$count;

    ### the first $skip records are read but not processed
    if ($skip){
      next unless ($count > $skip);
    }
    ### processing stops once record number $upto has been handled
    if ($upto){
      last if ($count > $upto);
    }

    $counting{sequences_count}++;

    if ($counting{sequences_count}%1000000==0) {
      warn "Processed $counting{sequences_count} sequences so far\n";
    }
    chomp $sequence;
    chomp $identifier;
    chomp $quality_value;
    ### $identifier_2 is left untouched, it is written back out including its line ending further down

    $identifier =~ s/^\@//; # deletes the @ at the beginning of Illumina FastQ headers

    my $return;
    if ($bowtie2){
      $return = check_bowtie_results_single_end_bowtie2 (uc($sequence),$identifier,$quality_value);
    }
    else{
      $return = check_bowtie_results_single_end(uc($sequence),$identifier,$quality_value); # default Bowtie 1
    }

    ### normalising an undefined or empty return value so that the numeric comparisons below are warning-free
    $return ||= 0;

    # print the sequence to ambiguous.out if --ambiguous was specified
    if ($ambiguous and $return == 2){
      print AMBIG "\@$identifier\n";
      print AMBIG "$sequence\n";
      print AMBIG $identifier_2;
      print AMBIG "$quality_value\n";
    }

    # print the sequence to <unmapped.out> file if --un was specified
    elsif ($unmapped and $return == 1){
      print UNMAPPED "\@$identifier\n";
      print UNMAPPED "$sequence\n";
      print UNMAPPED $identifier_2;
      print UNMAPPED "$quality_value\n";
    }
  }

  ### Closing the infile here also reaps the zcat child process. The return value is deliberately not checked: when the loop
  ### was left early because of $upto, zcat is terminated by a broken pipe and close would report a failure
  close $in_fh;

  print "Processed $counting{sequences_count} sequences in total\n\n";

  print_final_analysis_report_single_end($C_to_T_infile,$G_to_A_infile);

}
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
812
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
### Walks through two paired-end FastA files in lockstep (one 2-line record from each file per iteration) and hands
### every read pair to the Bowtie 1 or Bowtie 2 result checker, depending on $bowtie2.
###
### Arguments: the two FastA sequence files (plain text, or gzipped if the name ends in .gz), followed by the four
###            converted infile names which are passed on unchanged to print_final_analysis_report_paired_ends.
### Reads the file-level settings $skip, $upto, $bowtie2, $ambiguous and $unmapped, and increments $counting{sequences_count}.
### Read pairs are written to AMBIG_1/AMBIG_2 (checker returned 2 and $ambiguous is set) or to UNMAPPED_1/UNMAPPED_2
### (checker returned 1 and $unmapped is set); these handles are presumably opened by the caller - verify there.
sub process_fastA_files_for_paired_end_methylation_calls{
  my ($sequence_file_1,$sequence_file_2,$C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2) = @_;
  ### Processing the two FastA sequence files; we need the actual sequences of both reads to compare them against the genomic sequence in order to
  ### make a methylation call. The sequence identifier per definition needs to be the same for a sequence pair used for paired-end mapping.
  ### Now reading in the sequence files sequence by sequence and see if the current sequences produced an alignment to one (or both) of the
  ### converted genomes (either the C->T or G->A version)

  ### Each infile is opened on its own merits: gzipped files are read through zcat, everything else is read directly.
  ### The list form of the pipe open bypasses the shell, so file names containing spaces or shell metacharacters are safe.
  foreach my $input ([\*IN1,$sequence_file_1],[\*IN2,$sequence_file_2]){
    my ($fh,$file) = @{$input};
    if ($file =~ /\.gz$/){
      open ($fh,'-|','zcat',$file) or die "Failed to open zcat pipe to $file $!\n";
    }
    else{
      open ($fh,'<',$file) or die "Failed to open sequence file $file: $!\n";
    }
  }

  warn "\nReading in the sequence files $sequence_file_1 and $sequence_file_2\n";
  ### Both files are required to have the exact same number of sequences, therefore we can process the sequences jointly one by one

  my $count = 0;

  while (1) {
    # reading from the first input file
    my $identifier_1 = <IN1>;
    my $sequence_1 = <IN1>;
    # reading from the second input file
    my $identifier_2 = <IN2>;
    my $sequence_2 = <IN2>;
    # stop as soon as either file cannot deliver a complete record any more
    last unless ($identifier_1 and $sequence_1 and $identifier_2 and $sequence_2);

    $identifier_1 = fix_IDs($identifier_1); # this is to avoid problems with truncated read ID when they contain white spaces
    $identifier_2 = fix_IDs($identifier_2);

    ++$count;

    # $skip and $upto both refer to the absolute position of the read pair in the input files
    if ($skip){
      next unless ($count > $skip);
    }
    if ($upto){
      last if ($count > $upto);
    }

    $counting{sequences_count}++;
    if ($counting{sequences_count}%100000==0) {
      warn "Processed $counting{sequences_count} sequences so far\n";
    }

    # keeping the headers as returned by fix_IDs (not yet chomped) for the ambiguous/unmapped output
    my $orig_identifier_1 = $identifier_1;
    my $orig_identifier_2 = $identifier_2;

    chomp $sequence_1;
    chomp $identifier_1;
    chomp $sequence_2;
    chomp $identifier_2;

    $identifier_1 =~ s/^>//; # deletes the > at the beginning of FastA headers

    # no quality values are available for FastA input, so only sequences and the read ID are handed over
    my $return;
    if ($bowtie2){
      $return = check_bowtie_results_paired_ends_bowtie2 (uc($sequence_1),uc($sequence_2),$identifier_1);
    }
    else{
      $return = check_bowtie_results_paired_ends (uc($sequence_1),uc($sequence_2),$identifier_1);
    }

    # normalising undef/empty return values so that the numeric comparisons below do not warn
    unless ($return){
      $return = 0;
    }

    # print the sequences to ambiguous_1 and _2 if --ambiguous was specified
    if ($ambiguous and $return == 2){
      print AMBIG_1 $orig_identifier_1;
      print AMBIG_1 "$sequence_1\n";
      print AMBIG_2 $orig_identifier_2;
      print AMBIG_2 "$sequence_2\n";
    }

    # print the sequences to unmapped_1.out and unmapped_2.out if --un was specified
    elsif ($unmapped and $return == 1){
      print UNMAPPED_1 $orig_identifier_1;
      print UNMAPPED_1 "$sequence_1\n";
      print UNMAPPED_2 $orig_identifier_2;
      print UNMAPPED_2 "$sequence_2\n";
    }
  }

  ### Releasing the input handles (and reaping the zcat children, if any). The return value is deliberately ignored:
  ### leaving the loop early (e.g. via $upto) makes zcat terminate with a broken pipe, which is not an error for us.
  close IN1;
  close IN2;

  print "Processed $counting{sequences_count} sequences in total\n\n";

  print_final_analysis_report_paired_ends($C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2);

}
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
904
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
### Walks through two paired-end FastQ files in lockstep (one 4-line record from each file per iteration) and hands
### every read pair, including its quality strings, to the Bowtie 1 or Bowtie 2 result checker, depending on $bowtie2.
###
### Arguments: the two FastQ sequence files (plain text, or gzipped if the name ends in .gz), followed by the four
###            converted infile names which are passed on unchanged to print_final_analysis_report_paired_ends.
### Reads the file-level settings $skip, $upto, $bowtie2, $ambiguous and $unmapped, and increments $counting{sequences_count}.
### Read pairs are written to AMBIG_1/AMBIG_2 (checker returned 2 and $ambiguous is set) or to UNMAPPED_1/UNMAPPED_2
### (checker returned 1 and $unmapped is set); these handles are presumably opened by the caller - verify there.
sub process_fastQ_files_for_paired_end_methylation_calls{
  my ($sequence_file_1,$sequence_file_2,$C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2) = @_;
  ### Processing the two Illumina sequence files; we need the actual sequence of both reads to compare them against the genomic sequence in order to
  ### make a methylation call. The sequence identifier per definition needs to be same for a sequence pair used for paired-end alignments.
  ### Now reading in the sequence files sequence by sequence and see if the current sequences produced a paired-end alignment to one (or both)
  ### of the converted genomes (either C->T or G->A version)

  ### Each infile is opened on its own merits: gzipped files are read through zcat, everything else is read directly.
  ### The list form of the pipe open bypasses the shell, so file names containing spaces or shell metacharacters are safe.
  foreach my $input ([\*IN1,$sequence_file_1],[\*IN2,$sequence_file_2]){
    my ($fh,$file) = @{$input};
    if ($file =~ /\.gz$/){
      open ($fh,'-|','zcat',$file) or die "Failed to open zcat pipe to $file $!\n";
    }
    else{
      open ($fh,'<',$file) or die "Failed to open sequence file $file: $!\n";
    }
  }

  my $count = 0;

  warn "\nReading in the sequence files $sequence_file_1 and $sequence_file_2\n";
  ### Both files are required to have the exact same number of sequences, therefore we can process the sequences jointly one by one
  while (1) {
    # reading from the first input file
    my $identifier_1 = <IN1>;
    my $sequence_1 = <IN1>;
    my $ident_1 = <IN1>;         # third line of the record; only echoed to the ambiguous/unmapped output
    my $quality_value_1 = <IN1>; # handed to the alignment checker and echoed to the ambiguous/unmapped output
    # reading from the second input file
    my $identifier_2 = <IN2>;
    my $sequence_2 = <IN2>;
    my $ident_2 = <IN2>;         # third line of the record; only echoed to the ambiguous/unmapped output
    my $quality_value_2 = <IN2>; # handed to the alignment checker and echoed to the ambiguous/unmapped output
    # stop as soon as either file cannot deliver a complete record any more
    last unless ($identifier_1 and $sequence_1 and $quality_value_1 and $identifier_2 and $sequence_2 and $quality_value_2);

    $identifier_1 = fix_IDs($identifier_1); # this is to avoid problems with truncated read ID when they contain white spaces
    $identifier_2 = fix_IDs($identifier_2);

    ++$count;

    # $skip and $upto both refer to the absolute position of the read pair in the input files
    if ($skip){
      next unless ($count > $skip);
    }
    if ($upto){
      last if ($count > $upto);
    }

    $counting{sequences_count}++;
    if ($counting{sequences_count}%100000==0) {
      warn "Processed $counting{sequences_count} sequences so far\n";
    }

    # keeping the headers as returned by fix_IDs (not yet chomped) for the ambiguous/unmapped output
    my $orig_identifier_1 = $identifier_1;
    my $orig_identifier_2 = $identifier_2;

    chomp $sequence_1;
    chomp $identifier_1;
    chomp $sequence_2;
    chomp $identifier_2;
    chomp $quality_value_1;
    chomp $quality_value_2;

    $identifier_1 =~ s/^\@//; # deletes the @ at the beginning of the FastQ ID

    my $return;
    if ($bowtie2){
      $return = check_bowtie_results_paired_ends_bowtie2 (uc($sequence_1),uc($sequence_2),$identifier_1,$quality_value_1,$quality_value_2);
    }
    else{
      $return = check_bowtie_results_paired_ends (uc($sequence_1),uc($sequence_2),$identifier_1,$quality_value_1,$quality_value_2);
    }

    # normalising undef/empty return values so that the numeric comparisons below do not warn
    unless ($return){
      $return = 0;
    }

    # print the sequences to ambiguous_1 and _2 if --ambiguous was specified
    if ($ambiguous and $return == 2){
      # seq_1
      print AMBIG_1 $orig_identifier_1;
      print AMBIG_1 "$sequence_1\n";
      print AMBIG_1 $ident_1;
      print AMBIG_1 "$quality_value_1\n";
      # seq_2
      print AMBIG_2 $orig_identifier_2;
      print AMBIG_2 "$sequence_2\n";
      print AMBIG_2 $ident_2;
      print AMBIG_2 "$quality_value_2\n";
    }

    # print the sequences to unmapped_1.out and unmapped_2.out if --un was specified
    elsif ($unmapped and $return == 1){
      # seq_1
      print UNMAPPED_1 $orig_identifier_1;
      print UNMAPPED_1 "$sequence_1\n";
      print UNMAPPED_1 $ident_1;
      print UNMAPPED_1 "$quality_value_1\n";
      # seq_2
      print UNMAPPED_2 $orig_identifier_2;
      print UNMAPPED_2 "$sequence_2\n";
      print UNMAPPED_2 $ident_2;
      print UNMAPPED_2 "$quality_value_2\n";
    }
  }

  ### Releasing the input handles (and reaping the zcat children, if any). The return value is deliberately ignored:
  ### leaving the loop early (e.g. via $upto) makes zcat terminate with a broken pipe, which is not an error for us.
  close IN1;
  close IN2;

  print "Processed $counting{sequences_count} sequences in total\n\n";

  print_final_analysis_report_paired_ends($C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2);

}
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1014
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1015 sub check_bowtie_results_single_end{
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1016 my ($sequence,$identifier,$quality_value) = @_;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1017
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1018 unless ($quality_value){ # FastA sequences get assigned a quality value of Phred 40 throughout
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1019 $quality_value = 'I'x(length$sequence);
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1020 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1021
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1022 my %mismatches = ();
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1023 ### reading from the bowtie output files to see if this sequence aligned to a bisulfite converted genome
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1024 foreach my $index (0..$#fhs){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1025
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1026 ### skipping this index if the last alignment has been set to undefined already (i.e. end of bowtie output)
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1027 next unless ($fhs[$index]->{last_line} and defined $fhs[$index]->{last_seq_id});
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1028 ### if the sequence we are currently looking at produced an alignment we are doing various things with it
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1029 if ($fhs[$index]->{last_seq_id} eq $identifier) {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1030 ###############################################################
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1031 ### STEP I Now processing the alignment stored in last_line ###
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1032 ###############################################################
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1033 my $valid_alignment_found_1 = decide_whether_single_end_alignment_is_valid($index,$identifier);
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1034 ### sequences can fail at this point if there was only 1 seq in the wrong orientation, or if there were 2 seqs, both in the wrong orientation
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1035 ### we only continue to extract useful information about this alignment if 1 was returned
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1036 if ($valid_alignment_found_1 == 1){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1037 ### Bowtie outputs which made it this far are in the correct orientation, so we can continue to analyse the alignment itself
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1038 ### need to extract the chromosome number from the bowtie output (which is either XY_cf (complete forward) or XY_cr (complete reverse)
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1039 my ($id,$strand,$mapped_chromosome,$position,$bowtie_sequence,$mismatch_info) = (split (/\t/,$fhs[$index]->{last_line},-1))[0,1,2,3,4,7];
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1040
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1041 unless($mismatch_info){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1042 $mismatch_info = '';
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1043 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1044
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1045 chomp $mismatch_info;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1046 my $chromosome;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1047 if ($mapped_chromosome =~ s/_(CT|GA)_converted$//){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1048 $chromosome = $mapped_chromosome;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1049 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1050 else{
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1051 die "Chromosome number extraction failed for $mapped_chromosome\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1052 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1053 ### Now extracting the number of mismatches to the converted genome
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1054 my $number_of_mismatches;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1055 if ($mismatch_info eq ''){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1056 $number_of_mismatches = 0;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1057 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1058 elsif ($mismatch_info =~ /^\d/){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1059 my @mismatches = split (/,/,$mismatch_info);
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1060 $number_of_mismatches = scalar @mismatches;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1061 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1062 else{
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1063 die "Something weird is going on with the mismatch field:\t>>> $mismatch_info <<<\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1064 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1065 ### creating a composite location variable from $chromosome and $position and storing the alignment information in a temporary hash table
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1066 my $alignment_location = join (":",$chromosome,$position);
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1067 ### If a sequence aligns to exactly the same location twice the sequence does either not contain any C or G, or all the Cs (or Gs on the reverse
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1068 ### strand) were methylated and therefore protected. It is not needed to overwrite the same positional entry with a second entry for the same
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1069 ### location (the genomic sequence extraction and methylation would not be affected by this; the only thing which would change is the index
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1070 ### number for the found alignment)
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1071 unless (exists $mismatches{$number_of_mismatches}->{$alignment_location}){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1072 $mismatches{$number_of_mismatches}->{$alignment_location}->{seq_id}=$id;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1073 $mismatches{$number_of_mismatches}->{$alignment_location}->{bowtie_sequence}=$bowtie_sequence;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1074 $mismatches{$number_of_mismatches}->{$alignment_location}->{index}=$index;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1075 $mismatches{$number_of_mismatches}->{$alignment_location}->{chromosome}=$chromosome;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1076 $mismatches{$number_of_mismatches}->{$alignment_location}->{position}=$position;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1077 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1078 $number_of_mismatches = undef;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1079 ##################################################################################################################################################
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1080 ### STEP II Now reading in the next line from the bowtie filehandle. The next alignment can either be a second alignment of the same sequence or a
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1081 ### new sequence. In either case we will store the next line in @fhs ->{last_line}. In case the alignment is already the next entry, a 0 will
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1082 ### be returned as $valid_alignment_found and it will then be processed in the next round only.
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1083 ##################################################################################################################################################
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1084 my $newline = $fhs[$index]->{fh}-> getline();
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1085 if ($newline){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1086 my ($seq_id) = split (/\t/,$newline);
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1087 $fhs[$index]->{last_seq_id} = $seq_id;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1088 $fhs[$index]->{last_line} = $newline;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1089 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1090 else {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1091 # assigning undef to last_seq_id and last_line and jumping to the next index (end of bowtie output)
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1092 $fhs[$index]->{last_seq_id} = undef;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1093 $fhs[$index]->{last_line} = undef;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1094 next;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1095 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1096 my $valid_alignment_found_2 = decide_whether_single_end_alignment_is_valid($index,$identifier);
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1097 ### we only continue to extract useful information about this second alignment if 1 was returned
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1098 if ($valid_alignment_found_2 == 1){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1099 ### If the second Bowtie output made it this far it is in the correct orientation, so we can continue to analyse the alignment itself
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1100 ### need to extract the chromosome number from the bowtie output (which is either XY_cf (complete forward) or XY_cr (complete reverse)
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1101 my ($id,$strand,$mapped_chromosome,$position,$bowtie_sequence,$mismatch_info) = (split (/\t/,$fhs[$index]->{last_line},-1))[0,1,2,3,4,7];
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1102 unless($mismatch_info){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1103 $mismatch_info = '';
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1104 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1105 chomp $mismatch_info;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1106
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1107 my $chromosome;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1108 if ($mapped_chromosome =~ s/_(CT|GA)_converted$//){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1109 $chromosome = $mapped_chromosome;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1110 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1111 else{
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1112 die "Chromosome number extraction failed for $mapped_chromosome\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1113 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1114
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1115 ### Now extracting the number of mismatches to the converted genome
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1116 my $number_of_mismatches;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1117 if ($mismatch_info eq ''){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1118 $number_of_mismatches = 0;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1119 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1120 elsif ($mismatch_info =~ /^\d/){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1121 my @mismatches = split (/,/,$mismatch_info);
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1122 $number_of_mismatches = scalar @mismatches;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1123 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1124 else{
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1125 die "Something weird is going on with the mismatch field\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1126 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1127 ### creating a composite location variable from $chromosome and $position and storing the alignment information in a temporary hash table
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1128 ### extracting the chromosome number from the bowtie output (see above)
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1129 my $alignment_location = join (":",$chromosome,$position);
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1130 ### In the special case that two differently converted sequences align against differently converted genomes, but to the same position
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1131 ### with the same number of mismatches (or perfect matches), the chromosome, position and number of mismatches are the same. In this
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1132 ### case we are not writing the same entry out a second time.
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1133 unless (exists $mismatches{$number_of_mismatches}->{$alignment_location}){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1134 $mismatches{$number_of_mismatches}->{$alignment_location}->{seq_id}=$id;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1135 $mismatches{$number_of_mismatches}->{$alignment_location}->{bowtie_sequence}=$bowtie_sequence;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1136 $mismatches{$number_of_mismatches}->{$alignment_location}->{index}=$index;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1137 $mismatches{$number_of_mismatches}->{$alignment_location}->{chromosome}=$chromosome;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1138 $mismatches{$number_of_mismatches}->{$alignment_location}->{position}=$position;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1139 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1140 ####################################################################################################################################
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1141 #### STEP III Now reading in one more line which has to be the next alignment to be analysed. Adding it to @fhs ->{last_line} ###
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1142 ####################################################################################################################################
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1143 $newline = $fhs[$index]->{fh}-> getline();
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1144 if ($newline){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1145 my ($seq_id) = split (/\t/,$newline);
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1146 die "The same seq ID occurred more than twice in a row\n" if ($seq_id eq $identifier);
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1147 $fhs[$index]->{last_seq_id} = $seq_id;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1148 $fhs[$index]->{last_line} = $newline;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1149 next;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1150 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1151 else {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1152 # assigning undef to last_seq_id and last_line and jumping to the next index (end of bowtie output)
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1153 $fhs[$index]->{last_seq_id} = undef;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1154 $fhs[$index]->{last_line} = undef;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1155 next;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1156 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1157 ### still within the 2nd sequence in correct orientation found
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1158 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1159 ### still within the 1st sequence in correct orientation found
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1160 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1161 ### still within the if (last_seq_id eq identifier) condition
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1162 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1163 ### still within foreach index loop
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1164 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1165 ### if there was not a single alignment found for a certain sequence we will continue with the next sequence in the sequence file
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1166 unless(%mismatches){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1167 $counting{no_single_alignment_found}++;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1168 if ($unmapped){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1169 return 1; ### We will print this sequence out as unmapped sequence if --un unmapped.out has been specified
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1170 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1171 else{
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1172 return;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1173 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1174 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1175 #######################################################################################################################################################
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1176 #######################################################################################################################################################
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1177 ### We are now looking if there is a unique best alignment for a certain sequence. This means we are sorting in ascending order and look at the ###
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1178 ### sequence with the lowest amount of mismatches. If there is only one single best position we are going to store the alignment information in the ###
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1179 ### meth_call variables, if there are multiple hits with the same amount of (lowest) mismatches we are discarding the sequence altogether ###
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1180 #######################################################################################################################################################
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1181 #######################################################################################################################################################
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1182 ### Going to use the variable $sequence_fails as a 'memory' if a sequence could not be aligned uniquely (set to 1 then)
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1183 my $sequence_fails = 0;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1184 ### Declaring an empty hash reference which will store all information we need for the methylation call
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1185 my $methylation_call_params; # hash reference!
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1186 ### sorting in ascending order
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1187 foreach my $mismatch_number (sort {$a<=>$b} keys %mismatches){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1188
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1189 ### if there is only 1 entry in the hash with the lowest number of mismatches we accept it as the best alignment
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1190 if (scalar keys %{$mismatches{$mismatch_number}} == 1){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1191 for my $unique_best_alignment (keys %{$mismatches{$mismatch_number}}){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1192 $methylation_call_params->{$identifier}->{bowtie_sequence} = $mismatches{$mismatch_number}->{$unique_best_alignment}->{bowtie_sequence};
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1193 $methylation_call_params->{$identifier}->{chromosome} = $mismatches{$mismatch_number}->{$unique_best_alignment}->{chromosome};
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1194 $methylation_call_params->{$identifier}->{position} = $mismatches{$mismatch_number}->{$unique_best_alignment}->{position};
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1195 $methylation_call_params->{$identifier}->{index} = $mismatches{$mismatch_number}->{$unique_best_alignment}->{index};
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1196 $methylation_call_params->{$identifier}->{number_of_mismatches} = $mismatch_number;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1197 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1198 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1199 elsif (scalar keys %{$mismatches{$mismatch_number}} == 3){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1200 ### If there are 3 sequences with the same number of lowest mismatches we can discriminate 2 cases: (i) all 3 alignments are unique best hits and
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1201 ### come from different alignments processes (== indices) or (ii) one sequence alignment (== index) will give a unique best alignment, whereas a
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1202 ### second one will produce 2 (or potentially many) alignments for the same sequence but in a different conversion state or against a different genome
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1203 ### version (or both). This becomes especially relevant for highly converted sequences in which all Cs have been converted to Ts in the bisulfite
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1204 ### reaction. E.g.
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1205 ### CAGTCACGCGCGCGCG will become
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1206 ### TAGTTATGTGTGTGTG in the CT transformed version, which will ideally still give the correct alignment in the CT->CT alignment condition.
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1207 ### If the same read will then become G->A transformed as well however, the resulting sequence will look differently and potentially behave
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1208 ### differently in a GA->GA alignment and this depends on the methylation state of the original sequence!:
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1209 ### G->A conversion:
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1210 ### highly methylated: CAATCACACACACACA
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1211 ### highly converted : TAATTATATATATATA <== this sequence has a reduced complexity (only 2 bases left and not 3), and it is more likely to produce
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1212 ### an alignment with a low complexity genomic region than the one above. This would normally lead to the entire sequence being kicked out as the
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1213 ### result will be 3 alignments with the same number of lowest mismatches!! This in turn means that highly methylated and thereby not converted
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1214 ### sequences are more likely to pass the alignment step, thereby creating a bias for methylated reads compared to their non-methylated counterparts.
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1215 ### We do not want any bias, whatsoever. Therefore if we have 1 sequence producing a unique best alignment and the second and third conditions
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1216 ### producing alignments only after performing an additional (theoretical) conversion we want to keep the best alignment with the lowest number of
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1217 ### additional transliterations performed. Thus we want to have a look at the level of complexity of the sequences producing the alignment.
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1218 ### In the above example the number of transliterations required to transform the actual sequence
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1219 ### to the C->T version would be TAGTTATGTGTGTGTG -> TAGTTATGTGTGTGTG = 0; (assuming this gives the correct alignment)
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1220 ### in the G->A case it would be TAGTTATGTGTGTGTG -> TAATTATATATATATA = 6; (assuming this gives multiple wrong alignments)
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1221 ### if the sequence giving a unique best alignment required a lower number of transliterations than the second best sequence yielding alignments
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1222 ### while requiring a much higher number of transliterations, we are going to accept the unique best alignment with the lowest number of performed
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1223 ### transliterations. As a threshold which does scale we will start with the number of transliterations of the lowest best match x 2 must still be
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1224 ### smaller than the number of transliterations of the second best sequence. Everything else will be flagged with $sequence_fails = 1 and discarded.
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1225 my @three_candidate_seqs;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1226 foreach my $composite_location (keys (%{$mismatches{$mismatch_number}}) ){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1227 my $transliterations_performed;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1228 if ($mismatches{$mismatch_number}->{$composite_location}->{index} == 0 or $mismatches{$mismatch_number}->{$composite_location}->{index} == 1){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1229 $transliterations_performed = determine_number_of_transliterations_performed($sequence,'CT');
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1230 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1231 elsif ($mismatches{$mismatch_number}->{$composite_location}->{index} == 2 or $mismatches{$mismatch_number}->{$composite_location}->{index} == 3){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1232 $transliterations_performed = determine_number_of_transliterations_performed($sequence,'GA');
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1233 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1234 else{
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1235 die "unexpected index number range $!\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1236 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1237 push @three_candidate_seqs,{
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1238 index =>$mismatches{$mismatch_number}->{$composite_location}->{index},
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1239 bowtie_sequence => $mismatches{$mismatch_number}->{$composite_location}->{bowtie_sequence},
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1240 mismatch_number => $mismatch_number,
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1241 chromosome => $mismatches{$mismatch_number}->{$composite_location}->{chromosome},
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1242 position => $mismatches{$mismatch_number}->{$composite_location}->{position},
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1243 seq_id => $mismatches{$mismatch_number}->{$composite_location}->{seq_id},
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1244 transliterations_performed => $transliterations_performed,
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1245 };
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1246 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1247 ### sorting in ascending order for the lowest number of transliterations performed
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1248 @three_candidate_seqs = sort {$a->{transliterations_performed} <=> $b->{transliterations_performed}} @three_candidate_seqs;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1249 my $first_array_element = $three_candidate_seqs[0]->{transliterations_performed};
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1250 my $second_array_element = $three_candidate_seqs[1]->{transliterations_performed};
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1251 my $third_array_element = $three_candidate_seqs[2]->{transliterations_performed};
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1252 # print "$first_array_element\t$second_array_element\t$third_array_element\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1253 if (($first_array_element*2) < $second_array_element){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1254 $counting{low_complexity_alignments_overruled_count}++;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1255 ### taking the index with the unique best hit and over ruling low complexity alignments with 2 hits
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1256 $methylation_call_params->{$identifier}->{bowtie_sequence} = $three_candidate_seqs[0]->{bowtie_sequence};
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1257 $methylation_call_params->{$identifier}->{chromosome} = $three_candidate_seqs[0]->{chromosome};
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1258 $methylation_call_params->{$identifier}->{position} = $three_candidate_seqs[0]->{position};
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1259 $methylation_call_params->{$identifier}->{index} = $three_candidate_seqs[0]->{index};
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1260 $methylation_call_params->{$identifier}->{number_of_mismatches} = $mismatch_number;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1261 # print "Overruled low complexity alignments! Using $first_array_element and disregarding $second_array_element and $third_array_element\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1262 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1263 else{
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1264 $sequence_fails = 1;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1265 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1266 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1267 else{
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1268 $sequence_fails = 1;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1269 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1270 ### after processing the alignment with the lowest number of mismatches we exit
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1271 last;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1272 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1273 ### skipping the sequence completely if there were multiple alignments with the same amount of lowest mismatches found at different positions
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1274 if ($sequence_fails == 1){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1275 $counting{unsuitable_sequence_count}++;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1276 if ($ambiguous){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1277 return 2; # => exits to next sequence, and prints it out to multiple_alignments.out if --ambiguous has been specified
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1278 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1279 if ($unmapped){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1280 return 1; # => exits to next sequence, and prints it out to unmapped.out if --un has been specified
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1281 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1282 else{
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1283 return 0; # => exits to next sequence (default)
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1284 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1285 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1286
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1287 ### --DIRECTIONAL
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1288 ### If the option --directional has been specified the user wants to consider only alignments to the original top strand or the original bottom strand. We will therefore
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1289 ### discard all alignments to strands complementary to the original strands, as they should not exist in reality due to the library preparation protocol
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1290 if ($directional){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1291 if ( ($methylation_call_params->{$identifier}->{index} == 2) or ($methylation_call_params->{$identifier}->{index} == 3) ){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1292 # warn "Alignment rejected! (index was: $methylation_call_params->{$identifier}->{index})\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1293 $counting{alignments_rejected_count}++;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1294 return 0;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1295 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1296 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1297
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1298 ### If the sequence has not been rejected so far it will have a unique best alignment
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1299 $counting{unique_best_alignment_count}++;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1300 extract_corresponding_genomic_sequence_single_end($identifier,$methylation_call_params);
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1301 ### check that the genomic sequence we extracted has the same length as the observed sequence+2; only then do we perform the methylation call
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1302 if (length($methylation_call_params->{$identifier}->{unmodified_genomic_sequence}) != length($sequence)+2){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1303 warn "Chromosomal sequence could not be extracted for\t$identifier\t$methylation_call_params->{$identifier}->{chromosome}\t$methylation_call_params->{$identifier}->{position}\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1304 $counting{genomic_sequence_could_not_be_extracted_count}++;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1305 return 0;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1306 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1307
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1308 ### otherwise we are set to perform the actual methylation call
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1309 $methylation_call_params->{$identifier}->{methylation_call} = methylation_call($identifier,$sequence,$methylation_call_params->{$identifier}->{unmodified_genomic_sequence},$methylation_call_params->{$identifier}->{read_conversion});
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1310
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1311 print_bisulfite_mapping_result_single_end($identifier,$sequence,$methylation_call_params,$quality_value);
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1312 return 0; ## otherwise 1 will be returned by default, which would print the sequence to unmapped.out
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1313 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1314
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1315 sub check_bowtie_results_single_end_bowtie2{
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1316 my ($sequence,$identifier,$quality_value) = @_;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1317
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1318 unless ($quality_value){ # FastA sequences get assigned a quality value of Phred 40 throughout
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1319 $quality_value = 'I'x(length$sequence);
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1320 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1321
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1322 # as of version Bowtie 2 2.0.0 beta7, when input reads are unpaired, Bowtie 2 no longer removes the trailing /1 or /2 from the read name.
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1323 # $identifier =~ s/\/[1234567890]+$//; # some sequencers don't just have /1 or /2 at the end of read IDs
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1324
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1325 my $alignment_ambiguous = 0;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1326
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1327 my %alignments = ();
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1328
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1329 ### reading from the Bowtie 2 output filehandles
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1330 foreach my $index (0..$#fhs){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1331 # print "Index: $index\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1332 # print "$fhs[$index]->{last_line}\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1333 # print "$fhs[$index]->{last_seq_id}\n\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1334
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1335 ### skipping this index if the last alignment has been set to undefined already (i.e. end of bowtie output)
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1336 next unless ($fhs[$index]->{last_line} and defined $fhs[$index]->{last_seq_id});
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1337
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1338 ### if the sequence we are currently looking at produced an alignment we are doing various things with it
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1339 # print "last seq id: $fhs[$index]->{last_seq_id} and identifier: $identifier\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1340
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1341 if ($fhs[$index]->{last_seq_id} eq $identifier) {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1342
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1343 # SAM format specifications for Bowtie 2
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1344 # (1) Name of read that aligned
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1345 # (2) Sum of all applicable flags. Flags relevant to Bowtie are:
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1346 # 1 The read is one of a pair
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1347 # 2 The alignment is one end of a proper paired-end alignment
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1348 # 4 The read has no reported alignments
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1349 # 8 The read is one of a pair and has no reported alignments
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1350 # 16 The alignment is to the reverse reference strand
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1351 # 32 The other mate in the paired-end alignment is aligned to the reverse reference strand
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1352 # 64 The read is mate 1 in a pair
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1353 # 128 The read is mate 2 in a pair
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1354 # 256 The read has multiple mapping states
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1355 # (3) Name of reference sequence where alignment occurs (unmapped reads have a *)
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1356 # (4) 1-based offset into the forward reference strand where leftmost character of the alignment occurs (0 for unmapped reads)
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1357 # (5) Mapping quality (255 means MAPQ is not available)
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1358 # (6) CIGAR string representation of alignment (* if unavailable)
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1359 # (7) Name of reference sequence where mate's alignment occurs. Set to = if the mate's reference sequence is the same as this alignment's, or * if there is no mate.
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1360 # (8) 1-based offset into the forward reference strand where leftmost character of the mate's alignment occurs. Offset is 0 if there is no mate.
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1361 # (9) Inferred fragment size. Size is negative if the mate's alignment occurs upstream of this alignment. Size is 0 if there is no mate.
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1362 # (10) Read sequence (reverse-complemented if aligned to the reverse strand)
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1363 # (11) ASCII-encoded read qualities (reverse-complemented if the read aligned to the reverse strand). The encoded quality values are on the Phred quality scale and the encoding is ASCII-offset by 33 (ASCII char !), similarly to a FASTQ file.
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1364 # (12) Optional fields. Fields are tab-separated. bowtie2 outputs zero or more of these optional fields for each alignment, depending on the type of the alignment:
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1365 # AS:i:<N> Alignment score. Can be negative. Can be greater than 0 in --local mode (but not in --end-to-end mode). Only present if SAM record is for an aligned read.
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1366 # XS:i:<N> Alignment score for second-best alignment. Can be negative. Can be greater than 0 in --local mode (but not in --end-to-end mode). Only present if the SAM record is for an aligned read and more than one alignment was found for the read.
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1367 # YS:i:<N> Alignment score for opposite mate in the paired-end alignment. Only present if the SAM record is for a read that aligned as part of a paired-end alignment.
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1368 # XN:i:<N> The number of ambiguous bases in the reference covering this alignment. Only present if SAM record is for an aligned read.
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1369 # XM:i:<N> The number of mismatches in the alignment. Only present if SAM record is for an aligned read.
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1370 # XO:i:<N> The number of gap opens, for both read and reference gaps, in the alignment. Only present if SAM record is for an aligned read.
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1371 # XG:i:<N> The number of gap extensions, for both read and reference gaps, in the alignment. Only present if SAM record is for an aligned read.
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1372 # NM:i:<N> The edit distance; that is, the minimal number of one-nucleotide edits (substitutions, insertions and deletions) needed to transform the read string into the reference string. Only present if SAM record is for an aligned read.
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1373 # YF:Z:<N> String indicating reason why the read was filtered out. See also: Filtering. Only appears for reads that were filtered out.
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1374 # MD:Z:<S> A string representation of the mismatched reference bases in the alignment. See SAM format specification for details. Only present if SAM record is for an aligned read.
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1375
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1376 my ($id,$flag,$mapped_chromosome,$position,$mapping_quality,$cigar,$bowtie_sequence,$qual) = (split (/\t/,$fhs[$index]->{last_line}))[0,1,2,3,4,5,9,10];
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1377
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1378 ### If a sequence has no reported alignments there will be a single output line with a bit-wise flag value of 4. We can store the next alignment and move on to the next Bowtie 2 instance
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1379 if ($flag == 4){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1380 ## reading in the next alignment, which must be the next sequence
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1381 my $newline = $fhs[$index]->{fh}-> getline();
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1382 if ($newline){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1383 chomp $newline;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1384 my ($seq_id) = split (/\t/,$newline);
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1385 $fhs[$index]->{last_seq_id} = $seq_id;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1386 $fhs[$index]->{last_line} = $newline;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1387 if ($seq_id eq $identifier){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1388 die "Sequence with ID $identifier did not produce any alignment, but next seq-ID was also $fhs[$index]->{last_seq_id}!\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1389 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1390 next; # next instance
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1391 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1392 else{
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1393 # assigning undef to last_seq_id and last_line and jumping to the next index (end of Bowtie 2 output)
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1394 $fhs[$index]->{last_seq_id} = undef;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1395 $fhs[$index]->{last_line} = undef;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1396 next;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1397 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1398 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1399
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1400 # if there are one or more proper alignments we can extract the chromosome number
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1401 my $chromosome;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1402 if ($mapped_chromosome =~ s/_(CT|GA)_converted$//){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1403 $chromosome = $mapped_chromosome;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1404 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1405 else{
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1406 die "Chromosome number extraction failed for $mapped_chromosome\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1407 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1408
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1409 ### We will use the optional field to determine the best alignment. Later on we extract the number of mismatches and/or indels from the CIGAR string
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1410 my ($alignment_score,$second_best,$MD_tag);
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1411 my @fields = split (/\t/,$fhs[$index]->{last_line});
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1412
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1413 foreach (11..$#fields){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1414 if ($fields[$_] =~ /AS:i:(.*)/){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1415 $alignment_score = $1;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1416 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1417 elsif ($fields[$_] =~ /XS:i:(.*)/){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1418 $second_best = $1;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1419 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1420 elsif ($fields[$_] =~ /MD:Z:(.*)/){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1421 $MD_tag = $1;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1422 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1423 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1424
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1425 # warn "First best alignment_score is: '$alignment_score'\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1426 # warn "MD tag is: '$MD_tag'\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1427 die "Failed to extract alignment score ($alignment_score) and MD tag ($MD_tag)!\n" unless (defined $alignment_score and defined $MD_tag);
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1428
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1429 if (defined $second_best){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1430 # warn "second best alignment_score is: '$second_best'\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1431
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1432 # If the first alignment score is the same as the alignment score of the second best hit we are going to boot this sequence altogether
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1433 if ($alignment_score == $second_best){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1434 $alignment_ambiguous = 1;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1435 ## need to read and discard all additional ambiguous reads until we reach the next sequence
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1436 until ($fhs[$index]->{last_seq_id} ne $identifier){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1437 my $newline = $fhs[$index]->{fh}-> getline();
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1438 if ($newline){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1439 chomp $newline;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1440 my ($seq_id) = split (/\t/,$newline);
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1441 $fhs[$index]->{last_seq_id} = $seq_id;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1442 $fhs[$index]->{last_line} = $newline;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1443 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1444 else{
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1445 # assigning undef to last_seq_id and last_line and jumping to the next index (end of Bowtie 2 output)
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1446 $fhs[$index]->{last_seq_id} = undef;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1447 $fhs[$index]->{last_line} = undef;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1448 last; # break free in case we have reached the end of the alignment output
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1449 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1450 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1451 # warn "Index: $index\tThe current Seq-ID is $identifier, skipped all ambiguous sequences until the next ID which is: $fhs[$index]->{last_seq_id}\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1452 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1453 else{ # the next best alignment has a lower alignment score than the current read, so we can safely store the current alignment
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1454
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1455 my $alignment_location = join (":",$chromosome,$position);
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1456
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1457 ### If a sequence aligns to exactly the same location with a perfect match twice the sequence does either not contain any C or G, or all the Cs (or Gs on the reverse
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1458 ### strand) were methylated and therefore protected. Alternatively it will align better in one condition than in the other. In any case, it is not needed to overwrite
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1459 ### the same positional entry with a second entry for the same location, as the genomic sequence extraction and methylation call would not be affected by this. The only
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1460 ### thing which would change is the index number for the found alignment. We will continue to assign these alignments to the first indexes 0 and 1, i.e. OT and OB
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1461
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1462 unless (exists $alignments{$alignment_location}){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1463 $alignments{$alignment_location}->{seq_id} = $id;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1464 $alignments{$alignment_location}->{alignment_score} = $alignment_score;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1465 $alignments{$alignment_location}->{bowtie_sequence} = $bowtie_sequence;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1466 $alignments{$alignment_location}->{index} = $index;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1467 $alignments{$alignment_location}->{chromosome} = $chromosome;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1468 $alignments{$alignment_location}->{position} = $position;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1469 $alignments{$alignment_location}->{CIGAR} = $cigar;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1470 $alignments{$alignment_location}->{MD_tag} = $MD_tag;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1471 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1472
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1473 ### now reading and discarding all (inferior) alignments of this sequencing read until we hit the next sequence
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1474 until ($fhs[$index]->{last_seq_id} ne $identifier){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1475 my $newline = $fhs[$index]->{fh}-> getline();
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1476 if ($newline){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1477 chomp $newline;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1478 my ($seq_id) = split (/\t/,$newline);
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1479 $fhs[$index]->{last_seq_id} = $seq_id;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1480 $fhs[$index]->{last_line} = $newline;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1481 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1482 else{
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1483 # assigning undef to last_seq_id and last_line and jumping to the next index (end of Bowtie 2 output)
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1484 $fhs[$index]->{last_seq_id} = undef;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1485 $fhs[$index]->{last_line} = undef;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1486 last; # break free in case we have reached the end of the alignment output
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1487 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1488 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1489 # warn "Index: $index\tThe current Seq-ID is $identifier, skipped all ambiguous sequences until the next ID which is: $fhs[$index]->{last_seq_id}\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1490 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1491 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1492 else{ # there is no second best hit, so we can just store this one and read in the next sequence
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1493
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1494 my $alignment_location = join (":",$chromosome,$position);
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1495
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1496 ### If a sequence aligns to exactly the same location with a perfect match twice the sequence does either not contain any C or G, or all the Cs (or Gs on the reverse
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1497 ### strand) were methylated and therefore protected. Alternatively it will align better in one condition than in the other. In any case, it is not needed to overwrite
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1498 ### the same positional entry with a second entry for the same location, as the genomic sequence extraction and methylation call would not be affected by this. The only
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1499 ### thing which would change is the index number for the found alignment. We will continue to assign these alignments to the first indexes 0 and 1, i.e. OT and OB
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1500
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1501 unless (exists $alignments{$alignment_location}){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1502 $alignments{$alignment_location}->{seq_id} = $id;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1503 $alignments{$alignment_location}->{alignment_score} = $alignment_score;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1504 $alignments{$alignment_location}->{bowtie_sequence} = $bowtie_sequence;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1505 $alignments{$alignment_location}->{index} = $index;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1506 $alignments{$alignment_location}->{chromosome} = $chromosome;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1507 $alignments{$alignment_location}->{position} = $position;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1508 $alignments{$alignment_location}->{MD_tag} = $MD_tag;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1509 $alignments{$alignment_location}->{CIGAR} = $cigar;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1510 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1511
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1512 my $newline = $fhs[$index]->{fh}-> getline();
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1513 if ($newline){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1514 chomp $newline;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1515 my ($seq_id) = split (/\t/,$newline);
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1516 $fhs[$index]->{last_seq_id} = $seq_id;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1517 $fhs[$index]->{last_line} = $newline;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1518 if ($seq_id eq $identifier){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1519 die "Sequence with ID $identifier did not have a second best alignment, but next seq-ID was also $fhs[$index]->{last_seq_id}!\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1520 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1521 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1522 else{
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1523 # assigning undef to last_seq_id and last_line and jumping to the next index (end of Bowtie 2 output)
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1524 $fhs[$index]->{last_seq_id} = undef;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1525 $fhs[$index]->{last_line} = undef;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1526 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1527 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1528 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1529 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1530
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1531 ### if the read already produced several ambiguous alignments we can return straight away. If --ambiguous or --unmapped was specified the read sequence will be printed out.
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1532 if ($alignment_ambiguous == 1){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1533 $counting{unsuitable_sequence_count}++;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1534 ### report that the sequence has multiple hits with bitwise flag 256. We can print the sequence to the result file straight away and skip everything else
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1535 # my $ambiguous_read_output = join("\t",$identifier,'256','*','0','0','*','*','0','0',$sequence,$quality_value);
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1536 # print "$ambiguous_read_output\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1537
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1538 if ($ambiguous){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1539 return 2; # => exits to next sequence, and prints it out to _ambiguous_reads.txt if '--ambiguous' was specified
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1540 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1541 elsif ($unmapped){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1542 return 1; # => exits to next sequence, and prints it out to _unmapped_reads.txt if '--unmapped' but not '--ambiguous' was specified
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1543 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1544 else{
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1545 return 0;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1546 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1547 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1548
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1549 ### if there was no alignment found for a certain sequence at all we continue with the next sequence in the sequence file
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1550 unless(%alignments){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1551 $counting{no_single_alignment_found}++;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1552 # my $unmapped_read_output = join("\t",$identifier,'4','*','0','0','*','*','0','0',$sequence,$quality_value);
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1553 # print "$unmapped_read_output\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1554 if ($unmapped){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1555 return 1; # => exits to next sequence, and prints it out to _unmapped_reads.txt if '--unmapped' was specified
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1556 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1557 else{
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1558 return 0; # default
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1559 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1560 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1561
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1562 #######################################################################################################################################################
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1563
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1564 ### If the sequence was not rejected so far we are now looking if there is a unique best alignment among all alignment instances. If there is only one
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1565 ### single best position we are going to store the alignment information in the $meth_call variable. If there are multiple hits with the same (highest)
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1566 ### alignment score we are discarding the sequence altogether.
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1567 ### For end-to-end alignments the maximum alignment score can be 0, each mismatch can receive penalties up to 6, and each gap receives penalties for
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1568 ### opening (5) and extending (3 per bp) the gap.
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1569
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1570 #######################################################################################################################################################
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1571
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1572 my $methylation_call_params; # hash reference which will store all information we need for the methylation call
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1573 my $sequence_fails = 0; # Going to use $sequence_fails as a 'memory' if a sequence could not be aligned uniquely (set to 1 then)
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1574
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1575 ### print contents of %alignments for debugging
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1576 # if (scalar keys %alignments > 1){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1577 # print "\n******\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1578 # foreach my $alignment_location (sort {$a cmp $b} keys %alignments){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1579 # print "Loc: $alignment_location\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1580 # print "ID: $alignments{$alignment_location}->{seq_id}\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1581 # print "AS: $alignments{$alignment_location}->{alignment_score}\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1582 # print "Seq: $alignments{$alignment_location}->{bowtie_sequence}\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1583 # print "Index $alignments{$alignment_location}->{index}\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1584 # print "Chr: $alignments{$alignment_location}->{chromosome}\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1585 # print "pos: $alignments{$alignment_location}->{position}\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1586 # print "MD: $alignments{$alignment_location}->{MD_tag}\n\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1587 # }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1588 # print "\n******\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1589 # }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1590
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1591 ### if there is only 1 entry in the hash we accept it as the best alignment
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1592 if (scalar keys %alignments == 1){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1593 for my $unique_best_alignment (keys %alignments){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1594 $methylation_call_params->{$identifier}->{bowtie_sequence} = $alignments{$unique_best_alignment}->{bowtie_sequence};
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1595 $methylation_call_params->{$identifier}->{chromosome} = $alignments{$unique_best_alignment}->{chromosome};
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1596 $methylation_call_params->{$identifier}->{position} = $alignments{$unique_best_alignment}->{position};
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1597 $methylation_call_params->{$identifier}->{index} = $alignments{$unique_best_alignment}->{index};
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1598 $methylation_call_params->{$identifier}->{alignment_score} = $alignments{$unique_best_alignment}->{alignment_score};
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1599 $methylation_call_params->{$identifier}->{MD_tag} = $alignments{$unique_best_alignment}->{MD_tag};
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1600 $methylation_call_params->{$identifier}->{CIGAR} = $alignments{$unique_best_alignment}->{CIGAR};
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1601 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1602 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1603
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1604 ### otherwise we are going to find out if there is a best match among the multiple alignments, or whether there are 2 or more equally good alignments (in which case
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1605 ### we boot the sequence altogether)
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1606 elsif (scalar keys %alignments >= 2 and scalar keys %alignments <= 4){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1607 my $best_alignment_score;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1608 my $best_alignment_location;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1609 foreach my $alignment_location (sort {$alignments{$b}->{alignment_score} <=> $alignments{$a}->{alignment_score}} keys %alignments){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1610 # print "$alignments{$alignment_location}->{alignment_score}\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1611 unless (defined $best_alignment_score){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1612 $best_alignment_score = $alignments{$alignment_location}->{alignment_score};
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1613 $best_alignment_location = $alignment_location;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1614 # print "setting best alignment score: $best_alignment_score\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1615 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1616 else{
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1617 ### if the second best alignment has the same alignment score as the first one, the sequence will get booted
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1618 if ($alignments{$alignment_location}->{alignment_score} == $best_alignment_score){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1619 # warn "Same alignment score, the sequence will get booted!\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1620 $sequence_fails = 1;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1621 last; # exiting after the second alignment since we know that the sequence has ambiguous alignments
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1622 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1623 ### else we are going to store the best alignment for further processing
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1624 else{
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1625 $methylation_call_params->{$identifier}->{bowtie_sequence} = $alignments{$best_alignment_location}->{bowtie_sequence};
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1626 $methylation_call_params->{$identifier}->{chromosome} = $alignments{$best_alignment_location}->{chromosome};
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1627 $methylation_call_params->{$identifier}->{position} = $alignments{$best_alignment_location}->{position};
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1628 $methylation_call_params->{$identifier}->{index} = $alignments{$best_alignment_location}->{index};
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1629 $methylation_call_params->{$identifier}->{alignment_score} = $alignments{$best_alignment_location}->{alignment_score};
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1630 $methylation_call_params->{$identifier}->{MD_tag} = $alignments{$best_alignment_location}->{MD_tag};
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1631 $methylation_call_params->{$identifier}->{CIGAR} = $alignments{$best_alignment_location}->{CIGAR};
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1632 last; # exiting after processing the second alignment since the sequence produced a unique best alignment
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1633 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1634 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1635 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1636 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1637 else{
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1638 die "There are too many potential hits for this sequence (1-4 expected, but found: ",scalar keys %alignments,")\n";;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1639 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1640
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1641 ### skipping the sequence completely if there were multiple alignments with the same best alignment score at different positions
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1642 if ($sequence_fails == 1){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1643 $counting{unsuitable_sequence_count}++;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1644
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1645 ### report that the sequence has multiple hits with bitwise flag 256. We can print the sequence to the result file straight away and skip everything else
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1646 # my $ambiguous_read_output = join("\t",$identifier,'256','*','0','0','*','*','0','0',$sequence,$quality_value);
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1647 # print OUT "$ambiguous_read_output\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1648
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1649 if ($ambiguous){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1650 return 2; # => exits to next sequence, and prints it out (in FastQ format) to _ambiguous_reads.txt if '--ambiguous' was specified
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1651 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1652 elsif ($unmapped){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1653 return 1; # => exits to next sequence, and prints it out (in FastQ format) to _unmapped_reads.txt if '--unmapped' but not '--ambiguous' was specified
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1654 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1655 else{
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1656 return 0; # => exits to next sequence (default)
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1657 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1658 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1659
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1660 ### --DIRECTIONAL
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1661 ### If the option --directional has been specified the user wants to consider only alignments to the original top strand or the original bottom strand. We will therefore
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1662 ### discard all alignments to strands complementary to the original strands, as they should not exist in reality due to the library preparation protocol
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1663 if ($directional){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1664 if ( ($methylation_call_params->{$identifier}->{index} == 2) or ($methylation_call_params->{$identifier}->{index} == 3) ){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1665 # warn "Alignment rejected! (index was: $methylation_call_params->{$identifier}->{index})\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1666 $counting{alignments_rejected_count}++;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1667 return 0;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1668 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1669 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1670
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1671 ### If the sequence has not been rejected so far it has a unique best alignment
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1672 $counting{unique_best_alignment_count}++;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1673
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1674 ### Now we need to extract a genomic sequence that exactly corresponds to the reported alignment. This potentially means that we need to deal with insertions or deletions as well
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1675 extract_corresponding_genomic_sequence_single_end_bowtie2 ($identifier,$methylation_call_params);
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1676
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1677 ### check test to see if the genomic sequence we extracted has the same length as the observed sequence+2, and only then we perform the methylation call
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1678 if (length($methylation_call_params->{$identifier}->{unmodified_genomic_sequence}) != length($sequence)+2){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1679 warn "Chromosomal sequence could not be extracted for\t$identifier\t$methylation_call_params->{$identifier}->{chromosome}\t$methylation_call_params->{$identifier}->{position}\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1680 $counting{genomic_sequence_could_not_be_extracted_count}++;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1681 return 0;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1682 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1683
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1684
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1685 ### otherwise we are set to perform the actual methylation call
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1686 $methylation_call_params->{$identifier}->{methylation_call} = methylation_call($identifier,$sequence,$methylation_call_params->{$identifier}->{unmodified_genomic_sequence},$methylation_call_params->{$identifier}->{read_conversion});
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1687 print_bisulfite_mapping_result_single_end_bowtie2 ($identifier,$sequence,$methylation_call_params,$quality_value);
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1688 return 0; ## if a sequence got this far we do not want to print it to unmapped or ambiguous.out
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1689 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1690
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1691
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
### Counts how many bases of a read are subject to the given read conversion.
###
### Arguments:
###   $sequence        - read sequence; only the local copy is inspected, the caller's value is not changed
###   $read_conversion - conversion mode of the read, either 'CT' or 'GA'
###
### Returns the number of 'C' characters (mode 'CT') or 'G' characters (mode 'GA') found in
### $sequence. The match is case-sensitive, i.e. only upper-case bases are counted.
###
### Dies if $read_conversion is neither 'CT' nor 'GA' (this includes an undefined mode).
sub determine_number_of_transliterations_performed{
  my ($sequence,$read_conversion) = @_;

  # an undefined mode is handled like any other unknown mode, without 'uninitialized value' warnings
  my $mode = defined $read_conversion ? $read_conversion : '';

  # tr/// with an empty replacement list and no modifiers only counts the matching characters
  if ($mode eq 'CT'){
    my $number_of_transliterations = ($sequence =~ tr/C//);
    return $number_of_transliterations;
  }
  if ($mode eq 'GA'){
    my $number_of_transliterations = ($sequence =~ tr/G//);
    return $number_of_transliterations;
  }

  # $! is intentionally not part of the message: no system call failed here, so it would only
  # report an unrelated, stale OS error
  die "Read conversion mode of the read was not specified (expected 'CT' or 'GA', got '$mode')\n";
}
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1706
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
### Decides whether the Bowtie 1 alignment buffered for alignment file number $index is a usable
### alignment of the read $identifier.
###
### Arguments:
###   $index      - position in the file-level array @fhs; $fhs[$index] holds the keys
###                 'fh' (handle providing getline), 'last_line' and 'last_seq_id'
###   $identifier - ID of the read that is currently being analysed
###
### Returns 1 if 'last_line' (possibly after having been replaced by the following line of the
### same read) holds an alignment of $identifier for which
### ensure_sensical_alignment_orientation_single_end() returned 1, and 0 otherwise.
###
### Side effects: may read up to two further lines from $fhs[$index]->{fh} and updates
### 'last_seq_id'/'last_line' accordingly; both are set to undef once the handle is exhausted.
###
### Dies if the orientation check returns anything other than 0 or 1, or if the same read ID
### shows up a third time in a row.
sub decide_whether_single_end_alignment_is_valid{
  my ($index,$identifier) = @_;

  # extracting read ID and strand from the buffered line (Bowtie 1 format)
  my ($id,$strand) = (split (/\t/,$fhs[$index]->{last_line}))[0,1];

  ### the sequence stored in @fhs as last_line is already the next sequence to be analysed -> analyse next round
  if (($id ne $fhs[$index]->{last_seq_id}) or ($id ne $identifier)){
    return 0;
  }

  ### checking the orientation of the alignment; the helper is expected to return 1 (sensible) or 0 (not sensible)
  my $orientation = ensure_sensical_alignment_orientation_single_end ($index,$strand);
  return 1 if ($orientation == 1); ### 1st possibility for a sequence to pass
  die "The orientation of the alignment must be either correct or incorrect\n" unless ($orientation == 0);

  ### the alignment was in the wrong orientation, so we need to read in a new line
  my $newline = $fhs[$index]->{fh}->getline();
  unless (defined $newline){
    # end of bowtie output: nothing left to buffer for this file handle
    $fhs[$index]->{last_seq_id} = undef;
    $fhs[$index]->{last_line} = undef;
    return 0; # the alignment that was stored in last_line was in the wrong orientation
  }

  ($id,$strand) = (split (/\t/,$newline))[0,1];

  ### the line we just read in already belongs to the next sequence -> store it in @fhs for the next round
  if ($id ne $identifier){
    $fhs[$index]->{last_seq_id} = $id;
    $fhs[$index]->{last_line} = $newline;
    return 0;
  }

  ### second alignment of the same read: checking orientation again
  $orientation = ensure_sensical_alignment_orientation_single_end ($index,$strand);
  if ($orientation == 1){
    $fhs[$index]->{last_seq_id} = $id;
    $fhs[$index]->{last_line} = $newline;
    return 1; ### 2nd possibility for a sequence to pass
  }
  die "The orientation of the alignment must be either correct or incorrect\n" unless ($orientation == 0);

  ### wrong orientation again: read yet another line so that @fhs holds the entry for the next round
  $newline = $fhs[$index]->{fh}->getline();
  unless (defined $newline){
    # end of bowtie output
    $fhs[$index]->{last_seq_id} = undef;
    $fhs[$index]->{last_line} = undef;
    return 0;
  }

  my ($seq_id) = split (/\t/,$newline);
  ### the same seq ID must not occur a third time. $! is deliberately not reported here since no
  ### system call failed; without a trailing newline Perl still appends the script line number
  die "Same seq ID 3 or more times in a row!(should be 2 max)" if ($seq_id eq $identifier);

  $fhs[$index]->{last_seq_id} = $seq_id;
  $fhs[$index]->{last_line} = $newline;
  return 0; # not processing anything this round as both alignments of this read were in the wrong orientation
}
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1784 #########################
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1785 ### BOWTIE 1 | PAIRED-END
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1786 #########################
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1787
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1788 sub check_bowtie_results_paired_ends{
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1789 my ($sequence_1,$sequence_2,$identifier,$quality_value_1,$quality_value_2) = @_;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1790
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1791 ### quality values are not given for FastA files, so they are initialised with a Phred quality of 40
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1792 unless ($quality_value_1){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1793 $quality_value_1 = 'I'x(length$sequence_1);
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1794 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1795 unless ($quality_value_2){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1796 $quality_value_2 = 'I'x(length$sequence_2);
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1797 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1798
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1799 # print "$identifier\n$fhs[0]->{last_seq_id}\n$fhs[1]->{last_seq_id}\n$fhs[2]->{last_seq_id}\n$fhs[3]->{last_seq_id}\n\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1800
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1801 my %mismatches = ();
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1802 ### reading from the bowtie output files to see if this sequence pair aligned to a bisulfite converted genome
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1803
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1804
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1805 ### for paired end reads we are reporting alignments to the OT strand first (index 0), then the OB strand (index 3!!), similar to the single end way.
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1806 ### alignments to the complementary strands are reported afterwards (CTOT got index 1, and CTOB got index 2).
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1807 ### This is needed so that alignments which either contain no single C or G or reads which contain only protected Cs are reported to the original strands (OT and OB)
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1808 ### Before the complementary strands. Remember that it does not make any difference for the methylation calls, but it will matter if alignment to the complementary
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1809 ### strands are not being reported by specifying --directional
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1810
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1811 foreach my $index (0,3,1,2){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1812 ### skipping this index if the last alignment has been set to undefined already (i.e. end of bowtie output)
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1813 next unless ($fhs[$index]->{last_line_1} and $fhs[$index]->{last_line_2} and defined $fhs[$index]->{last_seq_id});
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1814 ### if the sequence pair we are currently looking at produced an alignment we are doing various things with it
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1815 if ($fhs[$index]->{last_seq_id} eq $identifier) {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1816 # print "$identifier\n$fhs[$index]->{last_seq_id}\n\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1817
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1818 ##################################################################################
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1819 ### STEP I Processing the entry which is stored in last_line_1 and last_line_2 ###
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1820 ##################################################################################
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1821 my $valid_alignment_found = decide_whether_paired_end_alignment_is_valid($index,$identifier);
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1822 ### sequences can fail at this point if there was only 1 alignment in the wrong orientation, or if there were 2 alignments both in the wrong
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1823 ### orientation. We only continue to extract useful information about this alignment if 1 was returned
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1824 if ($valid_alignment_found == 1){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1825 ### Bowtie outputs which made it this far are in the correct orientation, so we can continue to analyse the alignment itself.
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1826 ### we store the useful information in %mismatches
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1827 my ($id_1,$strand_1,$mapped_chromosome_1,$position_1,$bowtie_sequence_1,$mismatch_info_1) = (split (/\t/,$fhs[$index]->{last_line_1},-1))[0,1,2,3,4,7];
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1828 my ($id_2,$strand_2,$mapped_chromosome_2,$position_2,$bowtie_sequence_2,$mismatch_info_2) = (split (/\t/,$fhs[$index]->{last_line_2},-1))[0,1,2,3,4,7];
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1829 chomp $mismatch_info_1;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1830 chomp $mismatch_info_2;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1831
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1832 ### need to extract the chromosome number from the bowtie output (which is either XY_CT_converted or XY_GA_converted)
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1833 my ($chromosome_1,$chromosome_2);
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1834 if ($mapped_chromosome_1 =~ s/_(CT|GA)_converted$//){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1835 $chromosome_1 = $mapped_chromosome_1;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1836 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1837 else{
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1838 die "Chromosome number extraction failed for $mapped_chromosome_1\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1839 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1840 if ($mapped_chromosome_2 =~ s/_(CT|GA)_converted$//){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1841 $chromosome_2 = $mapped_chromosome_2;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1842 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1843 else{
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1844 die "Chromosome number extraction failed for $mapped_chromosome_2\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1845 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1846
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1847 ### Now extracting the number of mismatches to the converted genome
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1848 my $number_of_mismatches_1;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1849 my $number_of_mismatches_2;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1850 if ($mismatch_info_1 eq ''){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1851 $number_of_mismatches_1 = 0;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1852 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1853 elsif ($mismatch_info_1 =~ /^\d/){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1854 my @mismatches = split (/,/,$mismatch_info_1);
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1855 $number_of_mismatches_1 = scalar @mismatches;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1856 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1857 else{
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1858 die "Something weird is going on with the mismatch field\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1859 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1860 if ($mismatch_info_2 eq ''){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1861 $number_of_mismatches_2 = 0;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1862 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1863 elsif ($mismatch_info_2 =~ /^\d/){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1864 my @mismatches = split (/,/,$mismatch_info_2);
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1865 $number_of_mismatches_2 = scalar @mismatches;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1866 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1867 else{
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1868 die "Something weird is going on with the mismatch field\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1869 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1870 ### To decide whether a sequence pair has a unique best alignment we will look at the lowest sum of mismatches from both alignments
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1871 my $sum_of_mismatches = $number_of_mismatches_1+$number_of_mismatches_2;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1872 ### creating a composite location variable from $chromosome and $position and storing the alignment information in a temporary hash table
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1873 die "Position 1 is higher than position 2" if ($position_1 > $position_2);
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1874 die "Paired-end alignments need to be on the same chromosome\n" unless ($chromosome_1 eq $chromosome_2);
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1875 my $alignment_location = join(":",$chromosome_1,$position_1,$position_2);
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1876 ### If a sequence aligns to exactly the same location twice the sequence does either not contain any C or G, or all the Cs (or Gs on the reverse
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1877 ### strand) were methylated and therefore protected. It is not needed to overwrite the same positional entry with a second entry for the same
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1878 ### location (the genomic sequence extraction and methylation would not be affected by this, the only thing which would change is the index
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1879 ### number for the found alignment)
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1880 unless (exists $mismatches{$sum_of_mismatches}->{$alignment_location}){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1881 $mismatches{$sum_of_mismatches}->{$alignment_location}->{seq_id}=$id_1; # either is fine
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1882 $mismatches{$sum_of_mismatches}->{$alignment_location}->{bowtie_sequence_1}=$bowtie_sequence_1;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1883 $mismatches{$sum_of_mismatches}->{$alignment_location}->{bowtie_sequence_2}=$bowtie_sequence_2;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1884 $mismatches{$sum_of_mismatches}->{$alignment_location}->{index}=$index;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1885 $mismatches{$sum_of_mismatches}->{$alignment_location}->{chromosome}=$chromosome_1; # either is fine
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1886 $mismatches{$sum_of_mismatches}->{$alignment_location}->{start_seq_1}=$position_1;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1887 $mismatches{$sum_of_mismatches}->{$alignment_location}->{start_seq_2}=$position_2;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1888 $mismatches{$sum_of_mismatches}->{$alignment_location}->{number_of_mismatches_1} = $number_of_mismatches_1;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1889 $mismatches{$sum_of_mismatches}->{$alignment_location}->{number_of_mismatches_2} = $number_of_mismatches_2;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1890 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1891 ###################################################################################################################################################
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1892 ### STEP II Now reading in the next 2 lines from the bowtie filehandle. If there are 2 next lines in the alignments filehandle it can either ###
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1893 ### be a second alignment of the same sequence pair or a new sequence pair. In any case we will just add it to last_line_1 and last_line_2.  ###
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1894 ### If it is the alignment of the next sequence pair, 0 will be returned as $valid_alignment_found, so it will not be processed any further in ###
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1895 ### this round ###
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1896 ###################################################################################################################################################
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1897 my $newline_1 = $fhs[$index]->{fh}-> getline();
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1898 my $newline_2 = $fhs[$index]->{fh}-> getline();
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1899
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1900 if ($newline_1 and $newline_2){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1901 my ($seq_id_1) = split (/\t/,$newline_1);
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1902 my ($seq_id_2) = split (/\t/,$newline_2);
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1903
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1904 if ($seq_id_1 =~ s/\/1$//){ # removing the read /1 tag
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1905 $fhs[$index]->{last_seq_id} = $seq_id_1;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1906 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1907 elsif ($seq_id_2 =~ s/\/1$//){ # removing the read /1 tag
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1908 $fhs[$index]->{last_seq_id} = $seq_id_2;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1909 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1910 else{
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1911 die "Either read 1 or read 2 needs to end on '/1'\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1912 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1913
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1914 $fhs[$index]->{last_line_1} = $newline_1;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1915 $fhs[$index]->{last_line_2} = $newline_2;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1916 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1917 else {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1918 # assigning undef to last_seq_id and both last_lines and jumping to the next index (end of bowtie output)
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1919 $fhs[$index]->{last_seq_id} = undef;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1920 $fhs[$index]->{last_line_1} = undef;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1921 $fhs[$index]->{last_line_2} = undef;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1922 next; # jumping to the next index
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1923 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1924 ### Now processing the entry we just stored in last_line_1 and last_line_2
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1925 $valid_alignment_found = decide_whether_paired_end_alignment_is_valid($index,$identifier);
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1926 ### only processing the alignment further if 1 was returned. 0 will be returned either if the alignment is already the next sequence pair to
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1927 ### be analysed or if it was a second alignment of the current sequence pair but in the wrong orientation
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1928 if ($valid_alignment_found == 1){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1929 ### we store the useful information in %mismatches
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1930 ($id_1,$strand_1,$mapped_chromosome_1,$position_1,$bowtie_sequence_1,$mismatch_info_1) = (split (/\t/,$fhs[$index]->{last_line_1}))[0,1,2,3,4,7];
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1931 ($id_2,$strand_2,$mapped_chromosome_2,$position_2,$bowtie_sequence_2,$mismatch_info_2) = (split (/\t/,$fhs[$index]->{last_line_2}))[0,1,2,3,4,7];
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1932 chomp $mismatch_info_1;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1933 chomp $mismatch_info_2;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1934 ### need to extract the chromosome number from the bowtie output (which is either _CT_converted or _GA_converted)
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1935 if ($mapped_chromosome_1 =~ s/_(CT|GA)_converted$//){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1936 $chromosome_1 = $mapped_chromosome_1;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1937 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1938 else{
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1939 die "Chromosome number extraction failed for $mapped_chromosome_1\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1940 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1941 if ($mapped_chromosome_2 =~ s/_(CT|GA)_converted$//){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1942 $chromosome_2 = $mapped_chromosome_2;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1943 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1944 else{
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1945 die "Chromosome number extraction failed for $mapped_chromosome_2\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1946 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1947
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1948 $number_of_mismatches_1='';
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1949 $number_of_mismatches_2='';
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1950 ### Now extracting the number of mismatches to the converted genome
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1951 if ($mismatch_info_1 eq ''){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1952 $number_of_mismatches_1 = 0;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1953 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1954 elsif ($mismatch_info_1 =~ /^\d/){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1955 my @mismatches = split (/,/,$mismatch_info_1);
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1956 $number_of_mismatches_1 = scalar @mismatches;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1957 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1958 else{
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1959 die "Something weird is going on with the mismatch field\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1960 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1961 if ($mismatch_info_2 eq ''){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1962 $number_of_mismatches_2 = 0;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1963 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1964 elsif ($mismatch_info_2 =~ /^\d/){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1965 my @mismatches = split (/,/,$mismatch_info_2);
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1966 $number_of_mismatches_2 = scalar @mismatches;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1967 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1968 else{
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1969 die "Something weird is going on with the mismatch field\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1970 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1971 ### To decide whether a sequence pair has a unique best alignment we will look at the lowest sum of mismatches from both alignments
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1972 $sum_of_mismatches = $number_of_mismatches_1+$number_of_mismatches_2;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1973 ### creating a composite location variable from $chromosome and $position and storing the alignment information in a temporary hash table
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1974 die "position 1 is greater than position 2" if ($position_1 > $position_2);
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1975 die "Paired-end alignments need to be on the same chromosome\n" unless ($chromosome_1 eq $chromosome_2);
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1976 $alignment_location = join(":",$chromosome_1,$position_1,$position_2);
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1977 ### If a sequence aligns to exactly the same location twice the sequence does either not contain any C or G, or all the Cs (or Gs on the reverse
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1978 ### strand) were methylated and therefore protected. It is not needed to overwrite the same positional entry with a second entry for the same
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1979 ### location (the genomic sequence extraction and methylation would not be affected by this, the only thing which would change is the index
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1980 ### number for the found alignment)
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1981 unless (exists $mismatches{$sum_of_mismatches}->{$alignment_location}){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1982 $mismatches{$sum_of_mismatches}->{$alignment_location}->{seq_id}=$id_1; # either is fine
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1983 $mismatches{$sum_of_mismatches}->{$alignment_location}->{bowtie_sequence_1}=$bowtie_sequence_1;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1984 $mismatches{$sum_of_mismatches}->{$alignment_location}->{bowtie_sequence_2}=$bowtie_sequence_2;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1985 $mismatches{$sum_of_mismatches}->{$alignment_location}->{index}=$index;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1986 $mismatches{$sum_of_mismatches}->{$alignment_location}->{chromosome}=$chromosome_1; # either is fine
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1987 $mismatches{$sum_of_mismatches}->{$alignment_location}->{start_seq_1}=$position_1;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1988 $mismatches{$sum_of_mismatches}->{$alignment_location}->{start_seq_2}=$position_2;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1989 $mismatches{$sum_of_mismatches}->{$alignment_location}->{number_of_mismatches_1} = $number_of_mismatches_1;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1990 $mismatches{$sum_of_mismatches}->{$alignment_location}->{number_of_mismatches_2} = $number_of_mismatches_2;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1991 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1992 ###############################################################################################################################################
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1993 ### STEP III Now reading in two more lines. These have to be the next entry and we will just assign them to last_line_1 and last_line_2     ###
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1994 ###############################################################################################################################################
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1995 $newline_1 = $fhs[$index]->{fh}-> getline();
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1996 $newline_2 = $fhs[$index]->{fh}-> getline();
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1997
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1998 if ($newline_1 and $newline_2){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1999 my ($seq_id_1) = split (/\t/,$newline_1);
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2000 my ($seq_id_2) = split (/\t/,$newline_2);
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2001
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2002 if ($seq_id_1 =~ s/\/1$//){ # removing the read /1 tag
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2003 $fhs[$index]->{last_seq_id} = $seq_id_1;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2004 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2005 if ($seq_id_2 =~ s/\/1$//){ # removing the read /1 tag
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2006 $fhs[$index]->{last_seq_id} = $seq_id_2;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2007 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2008 $fhs[$index]->{last_line_1} = $newline_1;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2009 $fhs[$index]->{last_line_2} = $newline_2;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2010 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2011 else {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2012 # assigning undef to last_seq_id and both last_lines and jumping to the next index (end of bowtie output)
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2013 $fhs[$index]->{last_seq_id} = undef;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2014 $fhs[$index]->{last_line_1} = undef;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2015 $fhs[$index]->{last_line_2} = undef;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2016 next; # jumping to the next index
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2017 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2018 ### within the 2nd sequence pair alignment in correct orientation found
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2019 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2020 ### within the 1st sequence pair alignment in correct orientation found
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2021 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2022 ### still within the (last_seq_id eq identifier) condition
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2023 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2024 ### still within foreach index loop
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2025 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2026 ### if there was no single alignment found for a certain sequence we will continue with the next sequence in the sequence file
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2027 unless(%mismatches){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2028 $counting{no_single_alignment_found}++;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2029 return 1; ### We will print this sequence out as unmapped sequence if --un unmapped.out has been specified
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2030 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2031 ### Going to use the variable $sequence_pair_fails as a 'memory' if a sequence could not be aligned uniquely (set to 1 then)
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2032 my $sequence_pair_fails = 0;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2033 ### Declaring an empty hash reference which will store all information we need for the methylation call
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2034 my $methylation_call_params; # hash reference!
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2035 ### We are now looking if there is a unique best alignment for a certain sequence. This means we are sorting in ascending order and look at the
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2036 ### sequence with the lowest amount of mismatches. If there is only one single best position we are going to store the alignment information in the
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2037 ### meth_call variables, if there are multiple hits with the same amount of (lowest) mismatches we are discarding the sequence altogether
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2038 foreach my $mismatch_number (sort {$a<=>$b} keys %mismatches){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2039 #dev print "Number of mismatches: $mismatch_number\t$identifier\t$sequence_1\t$sequence_2\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2040 foreach my $entry (keys (%{$mismatches{$mismatch_number}}) ){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2041 #dev print "$mismatch_number\t$entry\t$mismatches{$mismatch_number}->{$entry}->{index}\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2042 # print join("\t",$mismatch_number,$mismatches{$mismatch_number}->{$entry}->{seq_id},$sequence,$mismatches{$mismatch_number}->{$entry}->{bowtie_sequence},$mismatches{$mismatch_number}->{$entry}->{chromosome},$mismatches{$mismatch_number}->{$entry}->{position},$mismatches{$mismatch_number}->{$entry}->{index}),"\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2043 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2044 if (scalar keys %{$mismatches{$mismatch_number}} == 1){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2045 # print "Unique best alignment for sequence pair $sequence_1\t$sequence_1\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2046 for my $unique_best_alignment (keys %{$mismatches{$mismatch_number}}){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2047 $methylation_call_params->{$identifier}->{seq_id} = $identifier;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2048 $methylation_call_params->{$identifier}->{bowtie_sequence_1} = $mismatches{$mismatch_number}->{$unique_best_alignment}->{bowtie_sequence_1};
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2049 $methylation_call_params->{$identifier}->{bowtie_sequence_2} = $mismatches{$mismatch_number}->{$unique_best_alignment}->{bowtie_sequence_2};
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2050 $methylation_call_params->{$identifier}->{chromosome} = $mismatches{$mismatch_number}->{$unique_best_alignment}->{chromosome};
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2051 $methylation_call_params->{$identifier}->{start_seq_1} = $mismatches{$mismatch_number}->{$unique_best_alignment}->{start_seq_1};
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2052 $methylation_call_params->{$identifier}->{start_seq_2} = $mismatches{$mismatch_number}->{$unique_best_alignment}->{start_seq_2};
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2053 $methylation_call_params->{$identifier}->{alignment_end} = ($mismatches{$mismatch_number}->{$unique_best_alignment}->{start_seq_2}+length($mismatches{$mismatch_number}->{$unique_best_alignment}->{bowtie_sequence_2}));
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2054 $methylation_call_params->{$identifier}->{index} = $mismatches{$mismatch_number}->{$unique_best_alignment}->{index};
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2055 $methylation_call_params->{$identifier}->{number_of_mismatches_1} = $mismatches{$mismatch_number}->{$unique_best_alignment}->{number_of_mismatches_1};
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2056 $methylation_call_params->{$identifier}->{number_of_mismatches_2} = $mismatches{$mismatch_number}->{$unique_best_alignment}->{number_of_mismatches_2};
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2057 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2058 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2059 else{
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2060 $sequence_pair_fails = 1;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2061 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2062 ### after processing the alignment with the lowest number of mismatches we exit
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2063 last;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2064 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2065 ### skipping the sequence completely if there were multiple alignments with the same amount of lowest mismatches found at different positions
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2066 if ($sequence_pair_fails == 1){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2067 $counting{unsuitable_sequence_count}++;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2068 if ($ambiguous){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2069 return 2; # => exits to next sequence pair, and prints both seqs out to multiple_alignments_1 and -2 if --ambiguous has been specified
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2070 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2071 if ($unmapped){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2072 return 1; # => exits to next sequence pair, and prints both seqs out to unmapped_1 and _2 if --un has been specified
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2073 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2074 else{
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2075 return 0; # => exits to next sequence (default)
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2076 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2077 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2078
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2079 ### --DIRECTIONAL
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2080 ### If the option --directional has been specified the user wants to consider only alignments to the original top strand or the original bottom strand. We will therefore
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2081 ### discard all alignments to strands complementary to the original strands, as they should not exist in reality due to the library preparation protocol
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2082 if ($directional){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2083 if ( ($methylation_call_params->{$identifier}->{index} == 1) or ($methylation_call_params->{$identifier}->{index} == 2) ){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2084 # warn "Alignment rejected! (index was: $methylation_call_params->{$identifier}->{index})\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2085 $counting{alignments_rejected_count}++;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2086 return 0;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2087 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2088 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2089
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2090 ### If the sequence has not been rejected so far it does have a unique best alignment
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2091 $counting{unique_best_alignment_count}++;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2092 extract_corresponding_genomic_sequence_paired_ends($identifier,$methylation_call_params);
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2093
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2094 ### sanity check: the genomic sequences we extracted must have the same length as the observed sequences +2; only then do we perform the methylation call
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2095 if (length($methylation_call_params->{$identifier}->{unmodified_genomic_sequence_1}) != length($sequence_1)+2){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2096 warn "Chromosomal sequence could not be extracted for\t$identifier\t$methylation_call_params->{$identifier}->{chromosome}\t$methylation_call_params->{$identifier}->{start_seq_1}\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2097 $counting{genomic_sequence_could_not_be_extracted_count}++;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2098 return 0;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2099 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2100 if (length($methylation_call_params->{$identifier}->{unmodified_genomic_sequence_2}) != length($sequence_2)+2){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2101 warn "Chromosomal sequence could not be extracted for\t$identifier\t$methylation_call_params->{$identifier}->{chromosome}\t$methylation_call_params->{$identifier}->{start_seq_2}\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2102 $counting{genomic_sequence_could_not_be_extracted_count}++;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2103 return 0;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2104 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2105
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2106 ### otherwise we are set to perform the actual methylation call
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2107 $methylation_call_params->{$identifier}->{methylation_call_1} = methylation_call($identifier,$sequence_1,$methylation_call_params->{$identifier}->{unmodified_genomic_sequence_1},$methylation_call_params->{$identifier}->{read_conversion_1});
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2108 $methylation_call_params->{$identifier}->{methylation_call_2} = methylation_call($identifier,$sequence_2,$methylation_call_params->{$identifier}->{unmodified_genomic_sequence_2},$methylation_call_params->{$identifier}->{read_conversion_2});
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2109
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2110 print_bisulfite_mapping_results_paired_ends($identifier,$sequence_1,$sequence_2,$methylation_call_params,$quality_value_1,$quality_value_2);
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2111 return 0; ## otherwise 1 will be returned by default, which would print the sequence pair to unmapped_1 and _2
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2112 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2113
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2114 #########################
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2115 ### BOWTIE 2 | PAIRED-END
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2116 #########################
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2117
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2118 sub check_bowtie_results_paired_ends_bowtie2{
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2119 my ($sequence_1,$sequence_2,$identifier,$quality_value_1,$quality_value_2) = @_;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2120
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2121 ### quality values are not given for FastA files, so they are initialised with a Phred quality of 40
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2122 unless ($quality_value_1){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2123 $quality_value_1 = 'I'x(length$sequence_1);
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2124 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2125
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2126 unless ($quality_value_2){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2127 $quality_value_2 = 'I'x(length$sequence_2);
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2128 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2129
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2130
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2131 # print "$identifier\n$fhs[0]->{last_seq_id}\n$fhs[1]->{last_seq_id}\n$fhs[2]->{last_seq_id}\n$fhs[3]->{last_seq_id}\n\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2132
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2133
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2134 my %alignments;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2135 my $alignment_ambiguous = 0;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2136
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2137 ### reading from the Bowtie 2 output filehandles
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2138
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2139 ### for paired end reads we are reporting alignments to the OT strand first (index 0), then the OB strand (index 3!!), similar to the single end way.
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2140 ### alignments to the complementary strands are reported afterwards (CTOT got index 1, and CTOB got index 2).
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2141 ### This is needed so that alignments which either contain no single C or G or reads which contain only protected Cs are reported to the original strands (OT and OB)
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2142 ### Before the complementary strands. Remember that it does not make any difference for the methylation calls, but it will matter if alignments to the complementary
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2143 ### strands are not being reported when '--directional' is specified
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2144
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2145 foreach my $index (0,3,1,2){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2146 ### skipping this index if the last alignment has been set to undefined already (i.e. end of bowtie output)
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2147 next unless ($fhs[$index]->{last_line_1} and $fhs[$index]->{last_line_2} and defined $fhs[$index]->{last_seq_id});
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2148
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2149 ### if the sequence pair we are currently looking at produced an alignment we are doing various things with it
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2150 if ($fhs[$index]->{last_seq_id} eq $identifier) {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2151
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2152 my ($id_1,$flag_1,$mapped_chromosome_1,$position_1,$mapping_quality_1,$cigar_1,$bowtie_sequence_1,$qual_1) = (split (/\t/,$fhs[$index]->{last_line_1}))[0,1,2,3,4,5,9,10];
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2153 my ($id_2,$flag_2,$mapped_chromosome_2,$position_2,$mapping_quality_2,$cigar_2,$bowtie_sequence_2,$qual_2) = (split (/\t/,$fhs[$index]->{last_line_2}))[0,1,2,3,4,5,9,10];
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2154 # print "Index: $index\t$fhs[$index]->{last_line_1}\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2155 # print "Index: $index\t$fhs[$index]->{last_line_2}\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2156 # print join ("\t",$id_1,$flag_1,$mapped_chromosome_1,$position_1,$mapping_quality_1,$cigar_1,$bowtie_sequence_1,$qual_1),"\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2157 # print join ("\t",$id_2,$flag_2,$mapped_chromosome_2,$position_2,$mapping_quality_2,$cigar_2,$bowtie_sequence_2,$qual_2),"\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2158 $id_1 =~ s/\/1$//;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2159 $id_2 =~ s/\/2$//;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2160
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2161 # SAM format specifications for Bowtie 2
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2162 # (1) Name of read that aligned
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2163 # (2) Sum of all applicable flags. Flags relevant to Bowtie are:
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2164 # 1 The read is one of a pair
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2165 # 2 The alignment is one end of a proper paired-end alignment
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2166 # 4 The read has no reported alignments
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2167 # 8 The read is one of a pair and has no reported alignments
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2168 # 16 The alignment is to the reverse reference strand
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2169 # 32 The other mate in the paired-end alignment is aligned to the reverse reference strand
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2170 # 64 The read is mate 1 in a pair
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2171 # 128 The read is mate 2 in a pair
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2172 # 256 The read has multiple mapping states
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2173 # (3) Name of reference sequence where alignment occurs (unmapped reads have a *)
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2174 # (4) 1-based offset into the forward reference strand where leftmost character of the alignment occurs (0 for unmapped reads)
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2175 # (5) Mapping quality (255 means MAPQ is not available)
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2176 # (6) CIGAR string representation of alignment (* if unavailable)
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2177 # (7) Name of reference sequence where mate's alignment occurs. Set to = if the mate's reference sequence is the same as this alignment's, or * if there is no mate.
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2178 # (8) 1-based offset into the forward reference strand where leftmost character of the mate's alignment occurs. Offset is 0 if there is no mate.
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2179 # (9) Inferred fragment size. Size is negative if the mate's alignment occurs upstream of this alignment. Size is 0 if there is no mate.
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2180 # (10) Read sequence (reverse-complemented if aligned to the reverse strand)
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2181 # (11) ASCII-encoded read qualities (reverse-complemented if the read aligned to the reverse strand). The encoded quality values are on the Phred quality scale and the encoding is ASCII-offset by 33 (ASCII char !), similarly to a FASTQ file.
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2182 # (12) Optional fields. Fields are tab-separated. bowtie2 outputs zero or more of these optional fields for each alignment, depending on the type of the alignment:
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2183 # AS:i:<N> Alignment score. Can be negative. Can be greater than 0 in --local mode (but not in --end-to-end mode). Only present if SAM record is for an aligned read.
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2184 # XS:i:<N> Alignment score for second-best alignment. Can be negative. Can be greater than 0 in --local mode (but not in --end-to-end mode). Only present if the SAM record is for an aligned read and more than one alignment was found for the read.
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2185 # YS:i:<N> Alignment score for opposite mate in the paired-end alignment. Only present if the SAM record is for a read that aligned as part of a paired-end alignment.
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2186 # XN:i:<N> The number of ambiguous bases in the reference covering this alignment. Only present if SAM record is for an aligned read.
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2187 # XM:i:<N> The number of mismatches in the alignment. Only present if SAM record is for an aligned read.
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2188 # XO:i:<N> The number of gap opens, for both read and reference gaps, in the alignment. Only present if SAM record is for an aligned read.
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2189 # XG:i:<N> The number of gap extensions, for both read and reference gaps, in the alignment. Only present if SAM record is for an aligned read.
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2190 # NM:i:<N> The edit distance; that is, the minimal number of one-nucleotide edits (substitutions, insertions and deletions) needed to transform the read string into the reference string. Only present if SAM record is for an aligned read.
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2191 # YF:Z:<N> String indicating reason why the read was filtered out. See also: Filtering. Only appears for reads that were filtered out.
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2192 # MD:Z:<S> A string representation of the mismatched reference bases in the alignment. See SAM format specification for details. Only present if SAM record is for an aligned read.
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2193
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2194 ### If a sequence has no reported alignments there will be a single output line per sequence with a bit-wise flag value of 77 for read 1 (1+4+8+64), or 141 for read 2 (1+4+8+128).
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2195 ### We can store the next alignment and move on to the next Bowtie 2 instance
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2196 if ($flag_1 == 77 and $flag_2 == 141){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2197 ## reading in the next alignment, which must be the next sequence
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2198 my $newline_1 = $fhs[$index]->{fh}-> getline();
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2199 my $newline_2 = $fhs[$index]->{fh}-> getline();
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2200
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2201 if ($newline_1 and $newline_2){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2202 chomp $newline_1;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2203 chomp $newline_2;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2204 my ($seq_id_1) = split (/\t/,$newline_1);
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2205 my ($seq_id_2) = split (/\t/,$newline_2);
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2206 $seq_id_1 =~ s/\/1$//;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2207 $seq_id_2 =~ s/\/2$//;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2208 $fhs[$index]->{last_seq_id} = $seq_id_1;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2209 $fhs[$index]->{last_line_1} = $newline_1;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2210 $fhs[$index]->{last_line_2} = $newline_2;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2211
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2212 # print "current sequence ($identifier) did not map, reading in next sequence\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2213 # print "$index\t$fhs[$index]->{last_seq_id}\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2214 # print "$index\t$fhs[$index]->{last_line_1}\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2215 # print "$index\t$fhs[$index]->{last_line_2}\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2216 next; # next instance
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2217 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2218 else{
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2219 # assigning undef to last_seq_id and last_line and jumping to the next index (end of Bowtie 2 output)
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2220 $fhs[$index]->{last_seq_id} = undef;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2221 $fhs[$index]->{last_line_1} = undef;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2222 $fhs[$index]->{last_line_2} = undef;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2223 next;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2224 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2225 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2226
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2227 ### If there are one or more proper alignments we can extract the chromosome number
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2228 my ($chromosome_1,$chromosome_2);
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2229 if ($mapped_chromosome_1 =~ s/_(CT|GA)_converted$//){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2230 $chromosome_1 = $mapped_chromosome_1;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2231 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2232 else{
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2233 die "Chromosome number extraction failed for $mapped_chromosome_1\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2234 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2235 if ($mapped_chromosome_2 =~ s/_(CT|GA)_converted$//){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2236 $chromosome_2 = $mapped_chromosome_2;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2237 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2238 else{
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2239 die "Chromosome number extraction failed for $mapped_chromosome_2\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2240 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2241
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2242 die "Paired-end alignments need to be on the same chromosome\n" unless ($chromosome_1 eq $chromosome_2);
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2243
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2244 ### We will use the optional fields to determine the best alignments. Later on we extract the number of mismatches and/or indels from the CIGAR string
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2245 my ($alignment_score_1,$alignment_score_2,$second_best_1,$second_best_2,$MD_tag_1,$MD_tag_2);
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2246
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2247 my @fields_1 = split (/\t/,$fhs[$index]->{last_line_1});
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2248 my @fields_2 = split (/\t/,$fhs[$index]->{last_line_2});
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2249
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2250 foreach (11..$#fields_1){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2251 if ($fields_1[$_] =~ /AS:i:(.*)/){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2252 $alignment_score_1 = $1;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2253 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2254 elsif ($fields_1[$_] =~ /XS:i:(.*)/){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2255 $second_best_1 = $1;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2256 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2257 elsif ($fields_1[$_] =~ /MD:Z:(.*)/){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2258 $MD_tag_1 = $1;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2259 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2260 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2261
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2262 foreach (11..$#fields_2){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2263 if ($fields_2[$_] =~ /AS:i:(.*)/){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2264 $alignment_score_2 = $1;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2265 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2266 elsif ($fields_2[$_] =~ /XS:i:(.*)/){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2267 $second_best_2 = $1;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2268 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2269 elsif ($fields_2[$_] =~ /MD:Z:(.*)/){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2270 $MD_tag_2 = $1;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2271 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2272 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2273
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2274 die "Failed to extract alignment score 1 ($alignment_score_1) and MD tag ($MD_tag_1)!\nlast alignment 1: $fhs[$index]->{last_line_1}\nlast alignment 2: $fhs[$index]->{last_line_2}\n" unless (defined $alignment_score_1 and defined $MD_tag_1);
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2275 die "Failed to extract alignment score 2 ($alignment_score_2) and MD tag ($MD_tag_2)!\nlast alignment 1: $fhs[$index]->{last_line_1}\nlast alignment 2: $fhs[$index]->{last_line_2}\n" unless (defined $alignment_score_2 and defined $MD_tag_2);
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2276
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2277 # warn "First read 1 alignment score is: '$alignment_score_1'\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2278 # warn "First read 2 alignment score is: '$alignment_score_2'\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2279 # warn "MD tag 1 is: '$MD_tag_1'\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2280 # warn "MD tag 2 is: '$MD_tag_2'\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2281
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2282 ### To decide whether a sequence pair has a unique best alignment we will look at the highest sum of alignment scores from both alignments
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2283 my $sum_of_alignment_scores_1 = $alignment_score_1 + $alignment_score_2 ;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2284 # print "sum of alignment scores: $sum_of_alignment_scores_1\n\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2285
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2286 if (defined $second_best_1 and defined $second_best_2){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2287 my $sum_of_alignment_scores_second_best = $second_best_1 + $second_best_2;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2288 # warn "Second best alignment_score_1 is: '$second_best_1'\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2289 # warn "Second best alignment_score_2 is: '$second_best_2'\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2290 # warn "Second best alignment sum of alignment scores is: '$sum_of_alignment_scores_second_best'\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2291
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2292 # If the first alignment score for the first read pair is the same as the alignment score of the second best hit we are going to boot this sequence pair altogether
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2293 if ($sum_of_alignment_scores_1 == $sum_of_alignment_scores_second_best){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2294 $alignment_ambiguous = 1;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2295 # print "This read will be chucked (AS==XS detected)!\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2296
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2297 ## need to read and discard all additional ambiguous reads until we reach the next sequence
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2298 until ($fhs[$index]->{last_seq_id} ne $identifier){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2299 my $newline_1 = $fhs[$index]->{fh}-> getline();
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2300 my $newline_2 = $fhs[$index]->{fh}-> getline();
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2301 if ($newline_1 and $newline_2){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2302 chomp $newline_1;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2303 chomp $newline_2;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2304 my ($seq_id_1) = split (/\t/,$newline_1);
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2305 my ($seq_id_2) = split (/\t/,$newline_2);
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2306 $seq_id_1 =~ s/\/1$//;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2307 $seq_id_2 =~ s/\/2$//;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2308 # print "New Seq IDs:\t$seq_id_1\t$seq_id_2\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2309
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2310 $fhs[$index]->{last_seq_id} = $seq_id_1;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2311 $fhs[$index]->{last_line_1} = $newline_1;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2312 $fhs[$index]->{last_line_2} = $newline_2;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2313 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2314 else{
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2315 # assigning undef to last_seq_id and last_line and jumping to the next index (end of Bowtie 2 output)
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2316 $fhs[$index]->{last_seq_id} = undef;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2317 $fhs[$index]->{last_line_1} = undef;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2318 $fhs[$index]->{last_line_2} = undef;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2319 last; # break free if the end of the alignment output was reached
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2320 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2321 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2322 # if ($fhs[$index]->{last_seq_id}){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2323 # warn "Index: $index\tThis Seq-ID is $identifier, skipped all ambiguous sequences until the next ID which is: $fhs[$index]->{last_seq_id}\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2324 # }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2325 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2326 else{ # the next best alignment has a lower alignment score than the current read, so we can safely store the current alignment
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2327
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2328 my $alignment_location;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2329 if ($position_1 <= $position_2){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2330 $alignment_location = join(":",$chromosome_1,$position_1,$position_2);
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2331 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2332 elsif($position_2 < $position_1){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2333 $alignment_location = join(":",$chromosome_1,$position_2,$position_1);
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2334 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2335
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2336 ### If a sequence aligns to exactly the same location twice, the sequence either does not contain any C or G, or all the Cs (or Gs on the reverse
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2337 ### strand) were methylated and therefore protected. Alternatively it will align better in one condition than in the other. In any case, it is not needed to overwrite
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2338 ### the same positional entry with a second entry for the same location, as the genomic sequence extraction and methylation call would not be affected by this. The only
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2339 ### thing which would change is the index number for the found alignment. We will continue to assign these alignments to the first indexes 0 and 3, i.e. OT and OB
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2340
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2341 unless (exists $alignments{$alignment_location}){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2342 $alignments{$alignment_location}->{seq_id} = $id_1;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2343 $alignments{$alignment_location}->{alignment_score_1} = $alignment_score_1;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2344 $alignments{$alignment_location}->{alignment_score_2} = $alignment_score_2;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2345 $alignments{$alignment_location}->{sum_of_alignment_scores} = $sum_of_alignment_scores_1;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2346 $alignments{$alignment_location}->{bowtie_sequence_1} = $bowtie_sequence_1;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2347 $alignments{$alignment_location}->{bowtie_sequence_2} = $bowtie_sequence_2;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2348 $alignments{$alignment_location}->{index} = $index;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2349 $alignments{$alignment_location}->{chromosome} = $chromosome_1; # either is fine
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2350 $alignments{$alignment_location}->{position_1} = $position_1;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2351 $alignments{$alignment_location}->{position_2} = $position_2;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2352 $alignments{$alignment_location}->{mismatch_info_1} = $MD_tag_1;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2353 $alignments{$alignment_location}->{mismatch_info_2} = $MD_tag_2;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2354 $alignments{$alignment_location}->{CIGAR_1} = $cigar_1;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2355 $alignments{$alignment_location}->{CIGAR_2} = $cigar_2;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2356 $alignments{$alignment_location}->{flag_1} = $flag_1;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2357 $alignments{$alignment_location}->{flag_2} = $flag_2;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2358 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2359 # warn "added best of several alignments to \%alignments hash\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2360
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2361 ### now reading and discarding all (inferior) alignments of this read pair until we hit the next sequence
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2362 until ($fhs[$index]->{last_seq_id} ne $identifier){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2363 my $newline_1 = $fhs[$index]->{fh}-> getline();
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2364 my $newline_2 = $fhs[$index]->{fh}-> getline();
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2365 if ($newline_1 and $newline_2){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2366 chomp $newline_1;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2367 chomp $newline_2;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2368 my ($seq_id_1) = split (/\t/,$newline_1);
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2369 my ($seq_id_2) = split (/\t/,$newline_2);
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2370 $seq_id_1 =~ s/\/1$//;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2371 $seq_id_2 =~ s/\/2$//;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2372 # print "New Seq IDs:\t$seq_id_1\t$seq_id_2\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2373
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2374 $fhs[$index]->{last_seq_id} = $seq_id_1;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2375 $fhs[$index]->{last_line_1} = $newline_1;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2376 $fhs[$index]->{last_line_2} = $newline_2;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2377 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2378 else{
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2379 # assigning undef to last_seq_id and last_line_1 and _2 and jumping to the next index (end of Bowtie 2 output)
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2380 $fhs[$index]->{last_seq_id} = undef;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2381 $fhs[$index]->{last_line_1} = undef;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2382 $fhs[$index]->{last_line_2} = undef;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2383 last; # break free if the end of the alignment output was reached
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2384 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2385 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2386 # if($fhs[$index]->{last_seq_id}){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2387 # warn "Index: $index\tThis Seq-ID is $identifier, skipped all other alignments until the next ID was reached which is: $fhs[$index]->{last_seq_id}\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2388 # }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2389 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2390 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2391 else{ # there is no second best hit, so we can just store this one and read in the next sequence
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2392
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2393 my $alignment_location = join(":",$chromosome_1,$position_1,$position_2);
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2394 # print "$alignment_location\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2395 ### If a sequence aligns to exactly the same location with a perfect match twice, the sequence either does not contain any C or G, or all the Cs (or Gs on the reverse
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2396 ### strand) were methylated and therefore protected. Alternatively it will align better in one condition than in the other. In any case, it is not needed to overwrite
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2397 ### the same positional entry with a second entry for the same location, as the genomic sequence extraction and methylation call would not be affected by this. The only
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2398 ### thing which would change is the index number for the found alignment. We will continue to assign these alignments to the first indexes 0 and 3, i.e. OT and OB
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2399
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2400 unless (exists $alignments{$alignment_location}){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2401 $alignments{$alignment_location}->{seq_id} = $id_1;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2402 $alignments{$alignment_location}->{alignment_score_1} = $alignment_score_1;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2403 $alignments{$alignment_location}->{alignment_score_2} = $alignment_score_2;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2404 $alignments{$alignment_location}->{sum_of_alignment_scores} = $sum_of_alignment_scores_1;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2405 $alignments{$alignment_location}->{bowtie_sequence_1} = $bowtie_sequence_1;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2406 $alignments{$alignment_location}->{bowtie_sequence_2} = $bowtie_sequence_2;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2407 $alignments{$alignment_location}->{index} = $index;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2408 $alignments{$alignment_location}->{chromosome} = $chromosome_1; # either is fine
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2409 $alignments{$alignment_location}->{position_1} = $position_1;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2410 $alignments{$alignment_location}->{position_2} = $position_2;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2411 $alignments{$alignment_location}->{mismatch_info_1} = $MD_tag_1;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2412 $alignments{$alignment_location}->{mismatch_info_2} = $MD_tag_2;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2413 $alignments{$alignment_location}->{CIGAR_1} = $cigar_1;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2414 $alignments{$alignment_location}->{CIGAR_2} = $cigar_2;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2415 $alignments{$alignment_location}->{flag_1} = $flag_1;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2416 $alignments{$alignment_location}->{flag_2} = $flag_2;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2417 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2418
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2419 # warn "added unique alignment to \%alignments hash\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2420
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2421 # Now reading and storing the next read pair
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2422 my $newline_1 = $fhs[$index]->{fh}-> getline();
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2423 my $newline_2 = $fhs[$index]->{fh}-> getline();
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2424 if ($newline_1 and $newline_2){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2425 chomp $newline_1;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2426 chomp $newline_2;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2427 # print "$newline_1\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2428 # print "$newline_2\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2429 my ($seq_id_1) = split (/\t/,$newline_1);
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2430 my ($seq_id_2) = split (/\t/,$newline_2);
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2431 $seq_id_1 =~ s/\/1$//;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2432 $seq_id_2 =~ s/\/2$//;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2433 # print "New Seq IDs:\t$seq_id_1\t$seq_id_2\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2434
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2435 $fhs[$index]->{last_seq_id} = $seq_id_1;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2436 $fhs[$index]->{last_line_1} = $newline_1;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2437 $fhs[$index]->{last_line_2} = $newline_2;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2438
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2439 if ($seq_id_1 eq $identifier){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2440 die "Sequence with ID $identifier did not have a second best alignment, but next seq-ID was also $fhs[$index]->{last_seq_id}!\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2441 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2442 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2443 else{
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2444 # assigning undef to last_seq_id and last_line_1 and _2 and jumping to the next index (end of Bowtie 2 output)
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2445 $fhs[$index]->{last_seq_id} = undef;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2446 $fhs[$index]->{last_line_1} = undef;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2447 $fhs[$index]->{last_line_2} = undef;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2448 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2449 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2450 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2451 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2452
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2453 ### if the read produced several ambiguous alignments for a single instance of Bowtie 2 we can return already now. If --ambiguous was specified the read sequence will be printed out in FastQ format
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2454 if ($alignment_ambiguous == 1){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2455 $counting{unsuitable_sequence_count}++;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2456 ### report that the sequence pair has multiple hits with bitwise flag 256. We can print the sequence to the result file straight away and skip everything else
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2457 # my $ambiguous_read_1 = join("\t",$identifier.'/1','256','*','0','0','*','*','0','0',$sequence_1,$quality_value_1);
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2458 # my $ambiguous_read_2 = join("\t",$identifier.'/2','256','*','0','0','*','*','0','0',$sequence_2,$quality_value_2);
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2459 # print "$ambiguous_read_1\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2460 # print "$ambiguous_read_2\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2461
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2462 if ($ambiguous){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2463 return 2; # => exits to next sequence pair, and prints it out to _ambiguous_reads_1.txt and _ambiguous_reads_2.txt if '--ambiguous' was specified
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2464 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2465 elsif ($unmapped){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2466 return 1; # => exits to next sequence pair, and prints it out to _unmapped_reads_1.txt and _unmapped_reads_2.txt if '--unmapped' but not '--ambiguous' was specified
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2467 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2468 else{
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2469 return 0;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2470 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2471 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2472
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2473 ### if no alignment was found for a certain sequence at all we continue with the next sequence in the sequence file
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2474 unless (%alignments){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2475 $counting{no_single_alignment_found}++;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2476
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2477 # my $unmapped_read_1 = join("\t",$identifier.'/1','77','*','0','0','*','*','0','0',$sequence_1,$quality_value_1);
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2478 # my $unmapped_read_2 = join("\t",$identifier.'/2','141','*','0','0','*','*','0','0',$sequence_2,$quality_value_2);
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2479 # print "$unmapped_read_1\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2480 # print "$unmapped_read_2\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2481 if ($unmapped){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2482 return 1; # => exits to next sequence pair, and prints it out to _unmapped_reads_1.txt and _unmapped_reads_2.txt if '--unmapped' was specified
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2483 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2484 else{
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2485 return 0;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2486 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2487 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2488
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2489 #######################################################################################################################################################
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2490
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2491 ### If the sequence pair was not rejected so far we now check whether there is a unique best alignment among all alignment instances. If there is only one
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2492 ### single best position we are going to store the alignment information in the $meth_call variable. If there are multiple hits with the same (highest)
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2493 ### alignment score we are discarding the sequence pair altogether.
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2494 ### For end-to-end alignments the maximum alignment score is 0, each mismatch receives a penalty of 6, and each gap receives penalties for opening (5)
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2495 ### and extending (3 per bp) the gap.
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2496
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2497 #######################################################################################################################################################
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2498
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2499 ### Declaring an empty hash reference which will store all information we need for the methylation call
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2500 my $methylation_call_params; # hash reference
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2501 my $sequence_pair_fails = 0; # using $sequence_pair_fails as a 'memory' if a sequence could not be aligned uniquely (set to 1 then)
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2502
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2503 ### print contents of %alignments for debugging
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2504 ## if (scalar keys %alignments >= 1){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2505 # print "\n******\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2506 # foreach my $alignment_location (sort {$a cmp $b} keys %alignments){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2507 # print "Loc: $alignment_location\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2508 # print "ID: $alignments{$alignment_location}->{seq_id}\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2509 # print "AS_1: $alignments{$alignment_location}->{alignment_score_1}\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2510 # print "AS_2: $alignments{$alignment_location}->{alignment_score_2}\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2511 # print "Seq_1: $alignments{$alignment_location}->{bowtie_sequence_1}\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2512 # print "Seq_2: $alignments{$alignment_location}->{bowtie_sequence_2}\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2513 # print "Index $alignments{$alignment_location}->{index}\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2514 # print "Chr: $alignments{$alignment_location}->{chromosome}\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2515 # print "Pos_1: $alignments{$alignment_location}->{position_1}\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2516 # print "Pos_2: $alignments{$alignment_location}->{position_2}\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2517 # print "CIGAR_1: $alignments{$alignment_location}->{CIGAR_1}\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2518 # print "CIGAR_2: $alignments{$alignment_location}->{CIGAR_2}\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2519 # print "MD_1: $alignments{$alignment_location}->{mismatch_info_1}\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2520 # print "MD_2: $alignments{$alignment_location}->{mismatch_info_2}\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2521 # print "Flag 1: $alignments{$alignment_location}->{flag_1}\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2522 # print "Flag 2: $alignments{$alignment_location}->{flag_2}\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2523 # }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2524 # print "\n******\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2525 # }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2526
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2527 ### if there is only 1 entry in the %alignments hash we accept it as the best alignment
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2528 if (scalar keys %alignments == 1){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2529 for my $unique_best_alignment (keys %alignments){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2530 $methylation_call_params->{$identifier}->{bowtie_sequence_1} = $alignments{$unique_best_alignment}->{bowtie_sequence_1};
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2531 $methylation_call_params->{$identifier}->{bowtie_sequence_2} = $alignments{$unique_best_alignment}->{bowtie_sequence_2};
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2532 $methylation_call_params->{$identifier}->{chromosome} = $alignments{$unique_best_alignment}->{chromosome};
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2533 $methylation_call_params->{$identifier}->{position_1} = $alignments{$unique_best_alignment}->{position_1};
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2534 $methylation_call_params->{$identifier}->{position_2} = $alignments{$unique_best_alignment}->{position_2};
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2535 $methylation_call_params->{$identifier}->{index} = $alignments{$unique_best_alignment}->{index};
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2536 $methylation_call_params->{$identifier}->{alignment_score_1} = $alignments{$unique_best_alignment}->{alignment_score_1};
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2537 $methylation_call_params->{$identifier}->{alignment_score_2} = $alignments{$unique_best_alignment}->{alignment_score_2};
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2538 $methylation_call_params->{$identifier}->{sum_of_alignment_scores} = $alignments{$unique_best_alignment}->{sum_of_alignment_scores};
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2539 $methylation_call_params->{$identifier}->{mismatch_info_1} = $alignments{$unique_best_alignment}->{mismatch_info_1};
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2540 $methylation_call_params->{$identifier}->{mismatch_info_2} = $alignments{$unique_best_alignment}->{mismatch_info_2};
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2541 $methylation_call_params->{$identifier}->{CIGAR_1} = $alignments{$unique_best_alignment}->{CIGAR_1};
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2542 $methylation_call_params->{$identifier}->{CIGAR_2} = $alignments{$unique_best_alignment}->{CIGAR_2};
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2543 $methylation_call_params->{$identifier}->{flag_1} = $alignments{$unique_best_alignment}->{flag_1};
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2544 $methylation_call_params->{$identifier}->{flag_2} = $alignments{$unique_best_alignment}->{flag_2};
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2545 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2546 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2547
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2548 ### otherwise we are going to find out if there is a best match among the multiple alignments, or whether there are 2 or more equally good alignments (in which case
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2549 ### we boot the sequence pair altogether)
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2550 elsif (scalar keys %alignments >= 2 and scalar keys %alignments <= 4){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2551 my $best_sum_of_alignment_scores;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2552 my $best_alignment_location;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2553 foreach my $alignment_location (sort {$alignments{$b}->{sum_of_alignment_scores} <=> $alignments{$a}->{sum_of_alignment_scores}} keys %alignments){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2554 # print "$alignments{$alignment_location}->{sum_of_alignment_scores}\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2555 unless (defined $best_sum_of_alignment_scores){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2556 $best_sum_of_alignment_scores = $alignments{$alignment_location}->{sum_of_alignment_scores};
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2557 $best_alignment_location = $alignment_location;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2558 # print "setting best alignment score to: $best_sum_of_alignment_scores\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2559 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2560 else{
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2561 ### if the second best alignment has the same sum of alignment scores as the first one, the sequence pair will get booted
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2562 if ($alignments{$alignment_location}->{sum_of_alignment_scores} == $best_sum_of_alignment_scores){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2563 # warn "Same sum of alignment scores for 2 different alignments, the sequence pair will get booted!\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2564 $sequence_pair_fails = 1;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2565 last; # exiting since we know that the sequence has ambiguous alignments
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2566 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2567 ### else we are going to store the best alignment for further processing
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2568 else{
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2569 $methylation_call_params->{$identifier}->{bowtie_sequence_1} = $alignments{$best_alignment_location}->{bowtie_sequence_1};
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2570 $methylation_call_params->{$identifier}->{bowtie_sequence_2} = $alignments{$best_alignment_location}->{bowtie_sequence_2};
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2571 $methylation_call_params->{$identifier}->{chromosome} = $alignments{$best_alignment_location}->{chromosome};
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2572 $methylation_call_params->{$identifier}->{position_1} = $alignments{$best_alignment_location}->{position_1};
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2573 $methylation_call_params->{$identifier}->{position_2} = $alignments{$best_alignment_location}->{position_2};
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2574 $methylation_call_params->{$identifier}->{index} = $alignments{$best_alignment_location}->{index};
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2575 $methylation_call_params->{$identifier}->{alignment_score_1} = $alignments{$best_alignment_location}->{alignment_score_1};
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2576 $methylation_call_params->{$identifier}->{alignment_score_2} = $alignments{$best_alignment_location}->{alignment_score_2};
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2577 $methylation_call_params->{$identifier}->{sum_of_alignment_scores} = $alignments{$best_alignment_location}->{sum_of_alignment_scores};
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2578 $methylation_call_params->{$identifier}->{mismatch_info_1} = $alignments{$best_alignment_location}->{mismatch_info_1};
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2579 $methylation_call_params->{$identifier}->{mismatch_info_2} = $alignments{$best_alignment_location}->{mismatch_info_2};
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2580 $methylation_call_params->{$identifier}->{CIGAR_1} = $alignments{$best_alignment_location}->{CIGAR_1};
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2581 $methylation_call_params->{$identifier}->{CIGAR_2} = $alignments{$best_alignment_location}->{CIGAR_2};
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2582 $methylation_call_params->{$identifier}->{flag_1} = $alignments{$best_alignment_location}->{flag_1};
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2583 $methylation_call_params->{$identifier}->{flag_2} = $alignments{$best_alignment_location}->{flag_2};
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2584 last; # exiting since the sequence produced a unique best alignment
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2585 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2586 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2587 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2588 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2589 else{
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2590 die "There are too many potential hits for this sequence pair (1-4 expected, but found: '",scalar keys %alignments,"')\n";;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2591 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2592
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2593 ### skipping the sequence completely if there were multiple alignments with the same best sum of alignment scores at different positions
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2594 if ($sequence_pair_fails == 1){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2595 $counting{unsuitable_sequence_count}++;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2596
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2597 ### report that the sequence has multiple hits with bitwise flag 256. We can print the sequence to the result file straight away and skip everything else
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2598 # my $ambiguous_read_1 = join("\t",$identifier.'/1','256','*','0','0','*','*','0','0',$sequence_1,$quality_value_1);
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2599 # my $ambiguous_read_2 = join("\t",$identifier.'/2','256','*','0','0','*','*','0','0',$sequence_2,$quality_value_2);
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2600 # print "$ambiguous_read_1\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2601 # print "$ambiguous_read_2\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2602
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2603 if ($ambiguous){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2604 return 2; # => exits to next sequence pair, and prints it out (in FastQ format) to _ambiguous_reads_1.txt and _ambiguous_reads_2.txt if '--ambiguous' was specified
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2605 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2606 elsif ($unmapped){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2607 return 1; # => exits to next sequence pair, and prints it out (in FastQ format) to _unmapped_reads_1.txt and _unmapped_reads_2.txt if '--unmapped' but not '--ambiguous' was specified
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2608 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2609 else{
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2610 return 0; # => exits to next sequence pair (default)
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2611 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2612 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2613
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2614 ### --DIRECTIONAL
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2615 ### If the option --directional has been specified the user wants to consider only alignments to the original top strand or the original bottom strand. We will therefore
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2616 ### discard all alignments to strands complementary to the original strands, as they should not exist in reality due to the library preparation protocol
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2617 if ($directional){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2618 if ( ($methylation_call_params->{$identifier}->{index} == 1) or ($methylation_call_params->{$identifier}->{index} == 2) ){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2619 # warn "Alignment rejected! (index was: $methylation_call_params->{$identifier}->{index})\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2620 $counting{alignments_rejected_count}++;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2621 return 0;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2622 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2623 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2624
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2625 ### If the sequence pair has not been rejected so far it does have a unique best alignment
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2626 $counting{unique_best_alignment_count}++;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2627 extract_corresponding_genomic_sequence_paired_ends_bowtie2($identifier,$methylation_call_params);
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2628
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2629 ### check to see if the genomic sequences we extracted have the same length as the observed sequences +2, and only then do we perform the methylation call
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2630 if (length($methylation_call_params->{$identifier}->{unmodified_genomic_sequence_1}) != length($sequence_1)+2){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2631 warn "Chromosomal sequence could not be extracted for\t$identifier\t$methylation_call_params->{$identifier}->{chromosome}\t$methylation_call_params->{$identifier}->{start_seq_1}\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2632 $counting{genomic_sequence_could_not_be_extracted_count}++;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2633 return 0;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2634 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2635 if (length($methylation_call_params->{$identifier}->{unmodified_genomic_sequence_2}) != length($sequence_2)+2){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2636 warn "Chromosomal sequence could not be extracted for\t$identifier\t$methylation_call_params->{$identifier}->{chromosome}\t$methylation_call_params->{$identifier}->{start_seq_2}\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2637 $counting{genomic_sequence_could_not_be_extracted_count}++;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2638 return 0;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2639 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2640
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2641 ### now we are set to perform the actual methylation call
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2642 $methylation_call_params->{$identifier}->{methylation_call_1} = methylation_call($identifier,$sequence_1,$methylation_call_params->{$identifier}->{unmodified_genomic_sequence_1},$methylation_call_params->{$identifier}->{read_conversion_1});
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2643 $methylation_call_params->{$identifier}->{methylation_call_2} = methylation_call($identifier,$sequence_2,$methylation_call_params->{$identifier}->{unmodified_genomic_sequence_2},$methylation_call_params->{$identifier}->{read_conversion_2});
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2644 # print "$methylation_call_params->{$identifier}->{read_conversion_2}\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2645 # print " $sequence_2\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2646 # print "$methylation_call_params->{$identifier}->{unmodified_genomic_sequence_2}\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2647 # print " $methylation_call_params->{$identifier}->{methylation_call_2}\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2648
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2649 print_bisulfite_mapping_results_paired_ends_bowtie2($identifier,$sequence_1,$sequence_2,$methylation_call_params,$quality_value_1,$quality_value_2);
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2650 return 0; ## otherwise 1 will be returned by default, which would print the sequence pair to unmapped_1 and _2
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2651 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2652
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2653 ###
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2654
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2655 sub decide_whether_paired_end_alignment_is_valid{
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2656 my ($index,$identifier) = @_;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2657 my ($id_1,$strand_1,$mapped_chromosome_1,$position_1,$bowtie_sequence_1,$mismatch_info_1) = (split (/\t/,$fhs[$index]->{last_line_1},-1))[0,1,2,3,4,7];
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2658 my ($id_2,$strand_2,$mapped_chromosome_2,$position_2,$bowtie_sequence_2,$mismatch_info_2) = (split (/\t/,$fhs[$index]->{last_line_2},-1))[0,1,2,3,4,7];
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2659 chomp $mismatch_info_1;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2660 chomp $mismatch_info_2;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2661 my $seq_id_1 = $id_1;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2662 my $seq_id_2 = $id_2;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2663 $seq_id_1 =~ s/\/1$//; # removing the read /1
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2664 $seq_id_2 =~ s/\/1$//; # removing the read /1
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2665
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2666 ### ensuring that the current entry is the correct sequence
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2667 if ($seq_id_1 eq $identifier or $seq_id_2 eq $identifier){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2668 ### checking the orientation of the alignment. We need to discriminate between 8 different conditions, however only 4 of them are theoretically
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2669 ### sensible alignments
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2670 my $orientation = ensure_sensical_alignment_orientation_paired_ends ($index,$id_1,$strand_1,$id_2,$strand_2);
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2671 ### If the orientation was correct we can move on
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2672 if ($orientation == 1){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2673 return 1; ### 1st possibility for A SEQUENCE-PAIR TO PASS
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2674 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2675 ### If the alignment was in the wrong orientation we need to read in two new lines
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2676 elsif($orientation == 0){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2677 my $newline_1 = $fhs[$index]->{fh}->getline();
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2678 my $newline_2 = $fhs[$index]->{fh}->getline();
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2679 if ($newline_1 and $newline_2){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2680 ### extract detailed information about the alignment again (from $newline_1 and $newline_2 this time)
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2681 ($id_1,$strand_1) = (split (/\t/,$newline_1))[0,1];
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2682 ($id_2,$strand_2) = (split (/\t/,$newline_2))[0,1];
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2683
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2684 my $seqid;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2685 $seq_id_1 = $id_1;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2686 $seq_id_2 = $id_2;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2687 # we need to capture the first read (ending on /1)
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2688 if ($seq_id_1 =~ s/\/1$//){ # removing the read /1 tag
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2689 $seqid = $seq_id_1;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2690 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2691 elsif ($seq_id_2 =~ s/\/1$//){ # removing the read /1 tag
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2692 $seqid = $seq_id_2;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2693 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2694 else{
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2695 die "One of the two reads needs to end on /1!!";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2696 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2697
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2698 ### ensuring that the next entry is still the correct sequence
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2699 if ($seq_id_1 eq $identifier or $seq_id_2 eq $identifier){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2700 ### checking orientation again
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2701 $orientation = ensure_sensical_alignment_orientation_paired_ends ($index,$id_1,$strand_1,$id_2,$strand_2);
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2702 ### If the orientation was correct we can move on
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2703 if ($orientation == 1){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2704 ### Writing the current sequence to last_line_1 and last_line_2
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2705 $fhs[$index]->{last_seq_id} = $seqid;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2706 $fhs[$index]->{last_line_1} = $newline_1;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2707 $fhs[$index]->{last_line_2} = $newline_2;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2708 return 1; ### 2nd possibility for a SEQUENCE-PAIR TO PASS
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2709 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2710 ### If the alignment was in the wrong orientation again we need to read in yet another 2 new lines and store them in @fhs (this must be
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2711 ### the next entry)
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2712 elsif ($orientation == 0){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2713 $newline_1 = $fhs[$index]->{fh}->getline();
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2714 $newline_2 = $fhs[$index]->{fh}->getline();
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2715 if ($newline_1 and $newline_2){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2716 ($seq_id_1) = split (/\t/,$newline_1);
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2717 ($seq_id_2) = split (/\t/,$newline_2);
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2718
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2719 $seqid = '';
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2720 if ($seq_id_1 =~ s/\/1$//){ # removing the read /1 tag
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2721 $seqid = $seq_id_1;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2722 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2723 elsif ($seq_id_2 =~ s/\/1$//){ # removing the read /1 tag
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2724 $seqid = $seq_id_2;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2725 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2726 else{
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2727 die "One of the two reads needs to end on /1!!";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2728 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2729
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2730 ### check if the next 2 lines still have the same seq ID (must not happen), and if not overwrite the current seq-ID and bowtie output with
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2731 ### the same fields of the just read next entry
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2732 die "Same seq ID 3 or more times in a row!(should be 2 max)" if ($seqid eq $identifier);
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2733 $fhs[$index]->{last_seq_id} = $seqid;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2734 $fhs[$index]->{last_line_1} = $newline_1;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2735 $fhs[$index]->{last_line_2} = $newline_2;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2736 return 0; # not processing anything this round as the alignment currently stored in last_line_1 and _2 was in the wrong orientation
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2737 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2738 else {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2739 ### assigning undef to last_seq_id and last_line (end of bowtie output)
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2740 $fhs[$index]->{last_seq_id} = undef;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2741 $fhs[$index]->{last_line_1} = undef;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2742 $fhs[$index]->{last_line_2} = undef;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2743 return 0; # not processing anything as the alignment currently stored in last_line_1 and _2 was in the wrong orientation
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2744 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2745 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2746 else{
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2747 die "The orientation of the alignment must be either correct or incorrect\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2748 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2749 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2750 ### the sequence pair we just read in is already the next sequence pair to be analysed -> store it in @fhs
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2751 else{
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2752 $fhs[$index]->{last_seq_id} = $seqid;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2753 $fhs[$index]->{last_line_1} = $newline_1;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2754 $fhs[$index]->{last_line_2} = $newline_2;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2755 return 0; # processing the new alignment result only in the next round
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2756 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2757 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2758 else {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2759 # assigning undef to last_seq_id and both last_lines (end of bowtie output)
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2760 $fhs[$index]->{last_seq_id} = undef;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2761 $fhs[$index]->{last_line_1} = undef;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2762 $fhs[$index]->{last_line_2} = undef;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2763 return 0; # not processing anything as the alignment currently stored in last_line_1 and _2 was in the wrong orientation
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2764 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2765 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2766 else{
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2767 die "The orientation of the alignment must be either correct or incorrect\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2768 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2769 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2770 ### the sequence pair stored in @fhs as last_line_1 and last_line_2 is already the next sequence pair to be analysed -> analyse next round
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2771 else{
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2772 return 0;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2773 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2774 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2775
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2776 ### EXTRACT GENOMIC SEQUENCE | BOWTIE 1 | PAIRED-END
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2777
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
sub extract_corresponding_genomic_sequence_paired_ends {
  my ($sequence_identifier,$methylation_call_params) = @_;
  ### Extracts, for both reads of a Bowtie 1 paired-end alignment, the unconverted genomic sequence underneath each read plus 2 flanking
  ### bases, and records it together with the strand/conversion 'memory' that is needed later for the methylation call.
  ###
  ### Arguments:
  ###   $sequence_identifier      key of the read pair inside $methylation_call_params
  ###   $methylation_call_params  hash reference; the entry for $sequence_identifier is read for the keys
  ###                             index, chromosome, start_seq_1, start_seq_2, bowtie_sequence_1 and bowtie_sequence_2
  ###
  ### Results are written back into the same entry under the keys alignment_read_1, alignment_read_2, genome_conversion,
  ### read_conversion_1, read_conversion_2, unmodified_genomic_sequence_1 and unmodified_genomic_sequence_2.
  ### Reads the file-level %chromosomes hash and increments one of the four strand counters in %counting.
  ### A genomic sequence is set to '' whenever a guarded 2 bp flank would fall outside of the chromosome.
  ### Dies if index is not one of 0, 1, 2 or 3.

  ### All information for this read pair lives in one hash. It is created here if it does not exist yet, which is exactly what the
  ### nested hash lookups would do implicitly
  my $params = $methylation_call_params->{$sequence_identifier} ||= {};

  my $index      = $params->{index};
  my $chromosome = $params->{chromosome};
  my $start_1    = $params->{start_seq_1};
  my $start_2    = $params->{start_seq_2};
  my $length_1   = length($params->{bowtie_sequence_1});
  my $length_2   = length($params->{bowtie_sequence_2});

  ### A bisulfite sequence pair for 1 location in the genome can theoretically be on any of the 4 possible converted strands. We are also giving the
  ### sequence a 'memory' of the conversion we are expecting which we will need later for the methylation call
  my $alignment_read_1;
  my $alignment_read_2;
  my $read_conversion_info_1;
  my $read_conversion_info_2;
  my $genome_conversion;

  ### Now extracting the same sequence from the genomic sequence, +2 extra bases at one of the ends so that we can also make a CpG, CHG or CHH methylation call
  ### if the C happens to be at the first or last position of the actually observed sequence
  my $non_bisulfite_sequence_1;
  my $non_bisulfite_sequence_2;

  ### all alignments reported by bowtie have the + alignment first and the - alignment as the second one irrespective of whether read 1 or read 2 was
  ### the + alignment. We however always read in sequences read 1 then read 2, so if read 2 is the + alignment we need to swap the extracted genomic
  ### sequences around!

  ### results from CT converted read 1 plus GA converted read 2 vs. CT converted genome (+/- orientation alignments are reported only)
  if ($index == 0){
    ### [Index 0, sequence originated from (converted) forward strand]
    $counting{CT_GA_CT_count}++;
    $alignment_read_1 = '+';
    $alignment_read_2 = '-';
    $read_conversion_info_1 = 'CT';
    $read_conversion_info_2 = 'GA';
    $genome_conversion = 'CT';

    ### SEQUENCE 1 (this is always the forward hit, in this case it is read 1)
    ### for hits on the forward strand we need to capture 2 extra bases at the 3' end
    $non_bisulfite_sequence_1 = substr ($chromosomes{$chromosome},$start_1,$length_1+2);

    ### SEQUENCE 2 (this will always be on the reverse strand, in this case it is read 2)
    ### As the second conversion is GA we need to capture 2 bases 3', so that they are 5' bases after reverse complementation.
    ### Only possible if the chromosome is long enough to hold the read plus both extra bases
    if (length($chromosomes{$chromosome}) > $start_2+$length_2+1){
      ### the reverse strand sequence needs to be reverse complemented
      $non_bisulfite_sequence_2 = reverse_complement(substr ($chromosomes{$chromosome},$start_2,$length_2+2));
    }
    else{
      $non_bisulfite_sequence_2 = '';
    }
  }

  ### results from GA converted read 1 plus CT converted read 2 vs. GA converted genome (+/- orientation alignments are reported only)
  elsif ($index == 1){
    ### [Index 1, sequence originated from complementary to (converted) reverse strand]
    $counting{GA_CT_GA_count}++;
    $alignment_read_1 = '+';
    $alignment_read_2 = '-';
    $read_conversion_info_1 = 'GA';
    $read_conversion_info_2 = 'CT';
    $genome_conversion = 'GA';

    ### SEQUENCE 1 (this is always the forward hit, in this case it is read 1)
    ### as we need to make the methylation call for the base 5' of the first base (GA conversion!) we need to capture 2 extra bases at the 5' end
    if ($start_1-1 > 0){
      $non_bisulfite_sequence_1 = substr ($chromosomes{$chromosome},$start_1-2,$length_1+2);
    }
    else{
      $non_bisulfite_sequence_1 = '';
    }

    ### SEQUENCE 2 (this will always be on the reverse strand, in this case it is read 2)
    ### As we are doing a CT comparison for the reverse strand we are taking 2 bases extra at the 5' end, so it is a 3' base after reverse complementation.
    ### substr() interprets a negative offset as counting from the END of the string, so a start position of 0 or 1 must not be
    ### used here or we would pick up bases from the far end of the chromosome
    if ($start_2-1 > 0){
      ### the reverse strand sequence needs to be reverse complemented
      $non_bisulfite_sequence_2 = reverse_complement(substr ($chromosomes{$chromosome},$start_2-2,$length_2+2));
    }
    else{
      $non_bisulfite_sequence_2 = '';
    }
  }

  ### results from GA converted read 1 plus CT converted read 2 vs. CT converted genome (-/+ orientation alignments are reported only)
  elsif ($index == 2){
    ### [Index 2, sequence originated from the complementary to (converted) forward strand]
    $counting{GA_CT_CT_count}++;
    $alignment_read_1 = '-';
    $alignment_read_2 = '+';
    $read_conversion_info_1 = 'GA';
    $read_conversion_info_2 = 'CT';
    $genome_conversion = 'CT';

    ### Here we switch the sequence information round!! non_bisulfite_sequence_1 will later correspond to the read 1!!!!
    ### It is taken from the second reported alignment (start_seq_2/bowtie_sequence_2) and reverse complemented.
    ### As read 1 is GA converted we need to capture 2 extra 3' bases which will be 2 extra 5' bases after reverse complementation
    $non_bisulfite_sequence_1 = reverse_complement(substr ($chromosomes{$chromosome},$start_2,$length_2+2));

    ### non_bisulfite_sequence_2 will later correspond to the read 2!!!!
    ### It is taken from the first reported alignment (start_seq_1/bowtie_sequence_1).
    ### Read 2 is CT converted so we need to capture 2 extra 3' bases, which requires the chromosome to be long enough
    if (length($chromosomes{$chromosome}) > $start_1+$length_1+1){
      $non_bisulfite_sequence_2 = substr ($chromosomes{$chromosome},$start_1,$length_1+2);
    }
    else{
      $non_bisulfite_sequence_2 = '';
    }
  }

  ### results from CT converted read 1 plus GA converted read 2 vs. GA converted genome (-/+ orientation alignments are reported only)
  elsif ($index == 3){
    ### [Index 3, sequence originated from the (converted) reverse strand]
    $counting{CT_GA_GA_count}++;
    $alignment_read_1 = '-';
    $alignment_read_2 = '+';
    $read_conversion_info_1 = 'CT';
    $read_conversion_info_2 = 'GA';
    $genome_conversion = 'GA';

    ### Here we switch the sequence information round!! non_bisulfite_sequence_1 will later correspond to the read 1!!!!
    ### It is taken from the second reported alignment (start_seq_2/bowtie_sequence_2) and reverse complemented.
    ### As read 1 is CT converted we need to capture 2 extra 5' bases which will be 2 extra 3' bases after reverse complementation
    if ($start_2-1 > 0){
      $non_bisulfite_sequence_1 = reverse_complement(substr ($chromosomes{$chromosome},$start_2-2,$length_2+2));
    }
    else{
      $non_bisulfite_sequence_1 = '';
    }

    ### non_bisulfite_sequence_2 will later correspond to the read 2!!!!
    ### It is taken from the first reported alignment (start_seq_1/bowtie_sequence_1).
    ### Read 2 is GA converted so we need to capture 2 extra 5' bases. The guard on start_seq_2 above says nothing about start_seq_1,
    ### and a negative substr() offset would wrap around to the end of the chromosome, so this position is checked separately
    if ($start_1-1 > 0){
      $non_bisulfite_sequence_2 = substr ($chromosomes{$chromosome},$start_1-2,$length_1+2);
    }
    else{
      $non_bisulfite_sequence_2 = '';
    }
  }
  else{
    die "Too many bowtie result filehandles\n";
  }

  ### the alignment_strand information is needed to determine which strand of the genomic sequence we are comparing the read against,
  ### the read_conversion information is needed to know whether we are looking for C->T or G->A substitutions
  $params->{alignment_read_1} = $alignment_read_1;
  $params->{alignment_read_2} = $alignment_read_2;
  $params->{genome_conversion} = $genome_conversion;
  $params->{read_conversion_1} = $read_conversion_info_1;
  $params->{read_conversion_2} = $read_conversion_info_2;
  $params->{unmodified_genomic_sequence_1} = $non_bisulfite_sequence_1;
  $params->{unmodified_genomic_sequence_2} = $non_bisulfite_sequence_2;
}
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2918
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2919 ### EXTRACT GENOMIC SEQUENCE BOWTIE 2 | PAIRED-END
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2920
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2921 sub extract_corresponding_genomic_sequence_paired_ends_bowtie2{
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2922 my ($sequence_identifier,$methylation_call_params) = @_;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2923 ### A bisulfite sequence pair for 1 location in the genome can theoretically be on any of the 4 possible converted strands. We are also giving the
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2924 ### sequence a 'memory' of the conversion we are expecting which we will need later for the methylation call
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2925
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2926 my $cigar_1 = $methylation_call_params->{$sequence_identifier}->{CIGAR_1};
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2927 my $cigar_2 = $methylation_call_params->{$sequence_identifier}->{CIGAR_2};
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2928 my $flag_1 = $methylation_call_params->{$sequence_identifier}->{flag_1};
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2929 my $flag_2 = $methylation_call_params->{$sequence_identifier}->{flag_2};
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2930 # print "$cigar_1\t$cigar_2\t$flag_1\t$flag_2\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2931 ### We are now extracting the corresponding genomic sequence, +2 extra bases at the end (or start) so that we can also make a CpG methylation call and
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2932 ### in addition make differential calls for Cs in CHG or CHH context if the C happens to be at the last (or first) position of the actually observed sequence
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2933
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2934 ### the alignment_strand information is needed to determine which strand of the genomic sequence we are comparing the read against,
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2935 ### the read_conversion information is needed to know whether we are looking for C->T or G->A substitutions
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2936 my $alignment_read_1;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2937 my $alignment_read_2;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2938 my $read_conversion_info_1;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2939 my $read_conversion_info_2;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2940 my $genome_conversion;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2941
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2942 ### Now extracting the same sequence from the mouse genomic sequence, +2 extra bases at one of the ends so that we can also make a CpG, CHG or CHH methylation call
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2943 ### if the C happens to be at the last position of the actually observed sequence
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2944 my $non_bisulfite_sequence_1 = '';
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2945 my $non_bisulfite_sequence_2 = '';
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2946
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2947 ### Positions in SAM format are 1 based, so we need to subract 1 when getting substrings
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2948 my $pos_1 = $methylation_call_params->{$sequence_identifier}->{position_1}-1;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2949 my $pos_2 = $methylation_call_params->{$sequence_identifier}->{position_2}-1;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2950
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2951 # parsing CIGAR 1 string
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2952 my @len_1 = split (/\D+/,$cigar_1); # storing the length per operation
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2953 my @ops_1 = split (/\d+/,$cigar_1); # storing the operation
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2954 shift @ops_1; # remove the empty first element
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2955 die "CIGAR 1 string contained a non-matching number of lengths and operations\n" unless (scalar @len_1 == scalar @ops_1);
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2956 # parsing CIGAR 2 string
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2957 my @len_2 = split (/\D+/,$cigar_2); # storing the length per operation
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2958 my @ops_2 = split (/\d+/,$cigar_2); # storing the operation
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2959 shift @ops_2; # remove the empty first element
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2960 die "CIGAR 2 string contained a non-matching number of lengths and operations\n" unless (scalar @len_2 == scalar @ops_2);
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2961
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2962 my $indels_1 = 0; # addiong these to the hemming distance value (needed for the NM field in the final SAM output
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2963 my $indels_2 = 0;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2964
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2965 ### Extracting read 1 genomic sequence ###
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2966
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2967 # extracting 2 additional bp at the 5' end (read 1)
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2968 if ( ($methylation_call_params->{$sequence_identifier}->{index} == 1) or ($methylation_call_params->{$sequence_identifier}->{index} == 3) ){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2969 # checking if the substring will be valid or if we can't extract the sequence because we are right at the edge of a chromosome
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2970 unless ( ($pos_1-2) > 0){# exiting with en empty genomic sequence otherwise
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2971 $methylation_call_params->{$sequence_identifier}->{unmodified_genomic_sequence_1} = $non_bisulfite_sequence_1;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2972 return;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2973 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2974 $non_bisulfite_sequence_1 .= substr ($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}},$pos_1-2,2);
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2975 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2976
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2977 foreach (0..$#len_1){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2978 if ($ops_1[$_] eq 'M'){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2979 # extracting genomic sequence
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2980 $non_bisulfite_sequence_1 .= substr ($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}},$pos_1,$len_1[$_]);
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2981 # warn "$non_bisulfite_sequence_1\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2982 # adjusting position
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2983 $pos_1 += $len_1[$_];
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2984 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2985 elsif ($ops_1[$_] eq 'I'){ # insertion in the read sequence
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2986 # we simply add padding Ns instead of finding genomic sequence. This will not be used to infer methylation calls
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2987 $non_bisulfite_sequence_1 .= 'N' x $len_1[$_];
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2988 # warn "$non_bisulfite_sequence_1\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2989 # position doesn't need adjusting
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2990 $indels_1 += $len_1[$_]; # adding to $indels_1 to determine the hemming distance (= single base mismatches, insertions or deletions) for the SAM output
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2991 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2992 elsif ($ops_1[$_] eq 'D'){ # deletion in the read sequence
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2993 # we do not add any genomic sequence but only adjust the position
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2994 # warn "Just adjusting the position by: ",$len_1[$_],"bp\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2995 $pos_1 += $len_1[$_];
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2996 $indels_1 += $len_1[$_]; # adding to $indels_1 to determine the hemming distance (= single base mismatches, insertions or deletions) for the SAM output
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2997 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2998 elsif($cigar_1 =~ tr/[NSHPX=]//){ # if these (for standard mapping) illegal characters exist we die
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2999 die "The CIGAR 1 string contained illegal CIGAR operations in addition to 'M', 'I' and 'D': $cigar_1\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3000 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3001 else{
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3002 die "The CIGAR 1 string contained undefined CIGAR operations in addition to 'M', 'I' and 'D': $cigar_1\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3003 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3004 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3005
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3006 ### 3' end of read 1
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3007 if ( ($methylation_call_params->{$sequence_identifier}->{index} == 0) or ($methylation_call_params->{$sequence_identifier}->{index} == 2) ){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3008 ## checking if the substring will be valid or if we can't extract the sequence because we are right at the edge of a chromosome
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3009 unless (length($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}}) >= $pos_1+2){# exiting with en empty genomic sequence otherwise
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3010 $methylation_call_params->{$sequence_identifier}->{unmodified_genomic_sequence_1} = $non_bisulfite_sequence_1;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3011 return;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3012 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3013 $non_bisulfite_sequence_1 .= substr ($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}},$pos_1,2);
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3014 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3015
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3016
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3017 ### Extracting read 2 genomic sequence ###
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3018
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3019 ### 5' end of read 2
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3020 if ( ($methylation_call_params->{$sequence_identifier}->{index} == 1) or ($methylation_call_params->{$sequence_identifier}->{index} == 3) ){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3021 ## checking if the substring will be valid or if we can't extract the sequence because we are right at the edge of a chromosome
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3022 unless ( ($pos_2-2) >= 0){# exiting with en empty genomic sequence otherwise
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3023 $methylation_call_params->{$sequence_identifier}->{unmodified_genomic_sequence_2} = $non_bisulfite_sequence_2;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3024 return;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3025 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3026 $non_bisulfite_sequence_2 .= substr ($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}},$pos_2-2,2);
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3027 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3028
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3029 foreach (0..$#len_2){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3030 if ($ops_2[$_] eq 'M'){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3031 # extracting genomic sequence
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3032 $non_bisulfite_sequence_2 .= substr ($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}},$pos_2,$len_2[$_]);
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3033 # warn "$non_bisulfite_sequence_2\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3034 # adjusting position
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3035 $pos_2 += $len_2[$_];
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3036 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3037 elsif ($ops_2[$_] eq 'I'){ # insertion in the read sequence
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3038 # we simply add padding Ns instead of finding genomic sequence. This will not be used to infer methylation calls
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3039 $non_bisulfite_sequence_2 .= 'N' x $len_2[$_];
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3040 # warn "$non_bisulfite_sequence_2\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3041 # position doesn't need adjusting
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3042 $indels_2 += $len_2[$_]; # adding to $indels_1 to determine the hemming distance (= single base mismatches, insertions or deletions) for the SAM output
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3043 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3044 elsif ($ops_2[$_] eq 'D'){ # deletion in the read sequence
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3045 # we do not add any genomic sequence but only adjust the position
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3046 # warn "Just adjusting the position by: ",$len_2[$_],"bp\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3047 $pos_2 += $len_2[$_];
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3048 $indels_2 += $len_2[$_]; # adding to $indels_1 to determine the hemming distance (= single base mismatches, insertions or deletions) for the SAM output
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3049 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3050 elsif($cigar_2 =~ tr/[NSHPX=]//){ # if these (for standard mapping) illegal characters exist we die
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3051 die "The CIGAR 2 string contained illegal CIGAR operations in addition to 'M', 'I' and 'D': $cigar_2\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3052 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3053 else{
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3054 die "The CIGAR 2 string contained undefined CIGAR operations in addition to 'M', 'I' and 'D': $cigar_2\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3055 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3056 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3057
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3058 ### 3' end of read 2
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3059 if ( ($methylation_call_params->{$sequence_identifier}->{index} == 0) or ($methylation_call_params->{$sequence_identifier}->{index} == 2) ){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3060 ## checking if the substring will be valid or if we can't extract the sequence because we are right at the edge of a chromosome
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3061 unless (length($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}}) >= $pos_2+2){# exiting with en empty genomic sequence otherwise
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3062 $methylation_call_params->{$sequence_identifier}->{unmodified_genomic_sequence_2} = $non_bisulfite_sequence_2;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3063 return;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3064 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3065 $non_bisulfite_sequence_2 .= substr ($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}},$pos_2,2);
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3066 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3067
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3068 ### all paired-end alignments reported by Bowtie 2 have the Read 1 alignment first and the Read 2 alignment as the second one irrespective of whether read 1 or read 2 was
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3069 ### the + alignment. We also read in sequences read 1 then read 2 so they should correspond perfectly
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3070
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3071 ### results from CT converted read 1 plus GA converted read 2 vs. CT converted genome (+/- orientation alignments are reported only)
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3072 if ($methylation_call_params->{$sequence_identifier}->{index} == 0){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3073 ### [Index 0, sequence originated from (converted) forward strand]
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3074 $counting{CT_GA_CT_count}++;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3075 $alignment_read_1 = '+';
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3076 $alignment_read_2 = '-';
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3077 $read_conversion_info_1 = 'CT';
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3078 $read_conversion_info_2 = 'GA';
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3079 $genome_conversion = 'CT';
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3080 ### Read 1 is always the forward hit
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3081 ### Read 2 is will always on the reverse strand, so it needs to be reverse complemented
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3082 $non_bisulfite_sequence_2 = reverse_complement($non_bisulfite_sequence_2);
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3083 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3084
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3085 ### results from GA converted read 1 plus CT converted read 2 vs. GA converted genome (+/- orientation alignments are reported only)
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3086 elsif ($methylation_call_params->{$sequence_identifier}->{index} == 1){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3087 ### [Index 1, sequence originated from complementary to (converted) bottom strand]
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3088 $counting{GA_CT_GA_count}++;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3089 $alignment_read_1 = '+';
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3090 $alignment_read_2 = '-';
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3091 $read_conversion_info_1 = 'GA';
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3092 $read_conversion_info_2 = 'CT';
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3093 $genome_conversion = 'GA';
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3094 ### Read 1 is always the forward hit
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3095 ### Read 2 is will always on the reverse strand, so it needs to be reverse complemented
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3096 $non_bisulfite_sequence_2 = reverse_complement($non_bisulfite_sequence_2);
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3097 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3098
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3099 ### results from GA converted read 1 plus CT converted read 2 vs. CT converted genome (-/+ orientation alignments are reported only)
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3100 elsif ($methylation_call_params->{$sequence_identifier}->{index} == 2){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3101 ### [Index 2, sequence originated from the complementary to (converted) top strand]
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3102 $counting{GA_CT_CT_count}++;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3103 $alignment_read_1 = '-';
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3104 $alignment_read_2 = '+';
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3105 $read_conversion_info_1 = 'GA';
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3106 $read_conversion_info_2 = 'CT';
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3107 $genome_conversion = 'CT';
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3108
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3109 ### Read 1 (the reverse strand) genomic sequence needs to be reverse complemented
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3110 $non_bisulfite_sequence_1 = reverse_complement($non_bisulfite_sequence_1);
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3111 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3112
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3113 ### results from CT converted read 1 plus GA converted read 2 vs. GA converted genome (-/+ orientation alignments are reported only)
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3114 elsif ($methylation_call_params->{$sequence_identifier}->{index} == 3){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3115 ### [Index 3, sequence originated from the (converted) reverse strand]
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3116 $counting{CT_GA_GA_count}++;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3117 $alignment_read_1 = '-';
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3118 $alignment_read_2 = '+';
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3119 $read_conversion_info_1 = 'CT';
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3120 $read_conversion_info_2 = 'GA';
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3121 $genome_conversion = 'GA';
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3122 ### Read 1 (the reverse strand) genomic sequence needs to be reverse complemented
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3123 $non_bisulfite_sequence_1 = reverse_complement($non_bisulfite_sequence_1);
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3124 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3125 else{
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3126 die "Too many bowtie result filehandles\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3127 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3128 ### the alignment_strand information is needed to determine which strand of the genomic sequence we are comparing the read against,
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3129 ### the read_conversion information is needed to know whether we are looking for C->T or G->A substitutions
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3130
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3131 $methylation_call_params->{$sequence_identifier}->{alignment_read_1} = $alignment_read_1;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3132 $methylation_call_params->{$sequence_identifier}->{alignment_read_2} = $alignment_read_2;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3133 $methylation_call_params->{$sequence_identifier}->{genome_conversion} = $genome_conversion;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3134 $methylation_call_params->{$sequence_identifier}->{read_conversion_1} = $read_conversion_info_1;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3135 $methylation_call_params->{$sequence_identifier}->{read_conversion_2} = $read_conversion_info_2;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3136 $methylation_call_params->{$sequence_identifier}->{unmodified_genomic_sequence_1} = $non_bisulfite_sequence_1;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3137 $methylation_call_params->{$sequence_identifier}->{unmodified_genomic_sequence_2} = $non_bisulfite_sequence_2;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3138 ## the end position of a read is stored in $pos
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3139 $methylation_call_params->{$sequence_identifier}->{end_position_1} = $pos_1;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3140 $methylation_call_params->{$sequence_identifier}->{end_position_2} = $pos_2;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3141 $methylation_call_params->{$sequence_identifier}->{indels_1} = $indels_1;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3142 $methylation_call_params->{$sequence_identifier}->{indels_2} = $indels_2;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3143 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3144
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3145 ##########################################
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3146 ### PRINT SINGLE END RESULTS: Bowtie 1 ###
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3147 ##########################################
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3148
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
### Writes one uniquely mapped single-end read (Bowtie 1 mode) together with its methylation call.
###   $identifier              - read ID, also the key into $methylation_call_params
###   $sequence                - the read sequence
###   $methylation_call_params - hash reference holding the alignment details of the read (its start position is modified in place)
###   $quality_value           - the quality string of the read
### Depending on $vanilla the read is either printed as a tab-delimited line to OUT, or handed over to single_end_SAM_output().
sub print_bisulfite_mapping_result_single_end{
  my ($identifier,$sequence,$methylation_call_params,$quality_value) = @_;

  ### qualities are reported in Sanger encoding (Phred 33 scale); $phred64 is tested before $solexa
  $quality_value = $phred64 ? convert_phred64_quals_to_phred33($quality_value)
                 : $solexa  ? convert_solexa_quals_to_phred33($quality_value)
                 :            $quality_value;

  ### Bowtie 1 reports the index and not the bp position, so the start position of the read is shifted by +1 bp.
  ### This changes the entry stored in the hash that was passed in.
  $methylation_call_params->{$identifier}->{position} += 1;

  ### SAM output, default since Bismark v1.0.0 (the subroutine lives at the end of the script)
  unless ($vanilla){
    return single_end_SAM_output($identifier,$sequence,$methylation_call_params,$quality_value);
  }

  ### vanilla output: a single tab-delimited line per read
  my $read_info = $methylation_call_params->{$identifier};
  my @columns = (
    $identifier,
    $read_info->{alignment_strand},
    $read_info->{chromosome},
    $read_info->{position},
    $read_info->{end_position},
    $sequence,
    $read_info->{unmodified_genomic_sequence},
    $read_info->{methylation_call},
    $read_info->{read_conversion},
    $read_info->{genome_conversion},
    $quality_value,
  );
  print OUT join("\t",@columns)."\n";
}
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3172
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3173 ##########################################
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3174 ### PRINT SINGLE END RESULTS: Bowtie 2 ###
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3175 ##########################################
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3176
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
### Writes one mapped single-end read (Bowtie 2 mode) and its methylation call to the SAM output.
###   $identifier              - read ID
###   $sequence                - the read sequence
###   $methylation_call_params - hash reference with the alignment details, passed on unchanged
###   $quality_value           - the quality string of the read
### Unmapped and ambiguous reads were already printed before this subroutine is called.
sub print_bisulfite_mapping_result_single_end_bowtie2{
  my ($identifier,$sequence,$methylation_call_params,$quality_value) = @_;

  ### qualities are reported in Sanger encoding (Phred 33 scale); $phred64 is tested before $solexa
  $quality_value = $phred64 ? convert_phred64_quals_to_phred33($quality_value)
                 : $solexa  ? convert_solexa_quals_to_phred33($quality_value)
                 :            $quality_value;

  ### the actual SAM line is assembled and printed by single_end_SAM_output() at the end of the script
  return single_end_SAM_output($identifier,$sequence,$methylation_call_params,$quality_value);
}
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3191
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3192 ##########################################
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3193 ### PRINT PAIRED END RESULTS: Bowtie 1 ###
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3194 ##########################################
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3195
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
### Writes one aligned Bowtie 1 read pair and its methylation calls to the output.
### Both quality strings are brought into Sanger encoding (Phred 33 scale) first; $phred64 is checked
### before $solexa. Note that this increments start_seq_1 in $methylation_call_params in place.
### With $vanilla set, a tab-separated line is printed to OUT, otherwise the pair is handed to paired_end_SAM_output()
sub print_bisulfite_mapping_results_paired_ends{
  my ($identifier,$sequence_1,$sequence_2,$methylation_call_params,$quality_value_1,$quality_value_2)= @_;

  ### picking the quality conversion routine (if any) once, then applying it to both reads
  my $to_sanger =
      $phred64 ? \&convert_phred64_quals_to_phred33
    : $solexa  ? \&convert_solexa_quals_to_phred33
    :            undef;

  if (defined $to_sanger){
    $quality_value_1 = $to_sanger->($quality_value_1);
    $quality_value_2 = $to_sanger->($quality_value_2);
  }

  ### Bowtie 1 reports the index and not the bp position, so the start position gets +1 bp. (End position is already 1-based)
  $methylation_call_params->{$identifier}->{start_seq_1} += 1;

  ### SAM output, default since Bismark v1.0.0
  unless ($vanilla){
    return paired_end_SAM_output($identifier,$sequence_1,$sequence_2,$methylation_call_params,$quality_value_1,$quality_value_2); # at the end of the script
  }

  ### old (vanilla) tab-delimited Bismark output
  my $pair = $methylation_call_params->{$identifier};
  my $bowtie1_output_paired_end = join("\t",
    $identifier,
    @{$pair}{qw(alignment_read_1 chromosome start_seq_1 alignment_end)},
    $sequence_1,
    @{$pair}{qw(unmodified_genomic_sequence_1 methylation_call_1)},
    $sequence_2,
    @{$pair}{qw(unmodified_genomic_sequence_2 methylation_call_2)},
    @{$pair}{qw(read_conversion_1 genome_conversion)},
    $quality_value_1,
    $quality_value_2,
  );
  print OUT "$bowtie1_output_paired_end\n";
}
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3222
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3223 ##########################################
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3224 ### PRINT PAIRED END RESULTS: Bowtie 2 ###
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3225 ##########################################
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3226
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
### Writes one aligned Bowtie 2 read pair and its methylation calls to the SAM output.
### Both quality strings are brought into Sanger encoding (Phred 33 scale) first; $phred64 is checked
### before $solexa. (Unmapped and ambiguous reads were already printed elsewhere.)
sub print_bisulfite_mapping_results_paired_ends_bowtie2{
  my ($identifier,$sequence_1,$sequence_2,$methylation_call_params,$quality_value_1,$quality_value_2)= @_;

  ### picking the quality conversion routine (if any) once, then applying it to both reads
  my $to_sanger =
      $phred64 ? \&convert_phred64_quals_to_phred33
    : $solexa  ? \&convert_solexa_quals_to_phred33
    :            undef;

  if (defined $to_sanger){
    $quality_value_1 = $to_sanger->($quality_value_1);
    $quality_value_2 = $to_sanger->($quality_value_2);
  }

  paired_end_SAM_output($identifier,$sequence_1,$sequence_2,$methylation_call_params,$quality_value_1,$quality_value_2); # at the end of the script
}
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3244
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3245
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
### Takes a quality string in Phred64 encoding and returns the same string re-encoded on the Phred 33 scale.
### Each character is translated individually: character -> Phred score -> Phred33 character
sub convert_phred64_quals_to_phred33{
  my $qual = shift;

  my @phred33_chars = map {
    my $phred_score = convert_phred64_quality_string_into_phred_score($_);
    my $phred33_char = convert_phred_score_into_phred33_quality_string($phred_score);
    $phred33_char;
  } split(//,$qual);

  return join("",@phred33_chars);
}
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3261
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
### Takes a quality string in Solexa (pre 1.3) encoding and returns the same string re-encoded on the Phred 33 scale.
### Each character is translated individually: character -> Phred score -> Phred33 character
sub convert_solexa_quals_to_phred33{
  my $qual = shift;

  my @phred33_chars = map {
    my $phred_score = convert_solexa_pre1_3_quality_string_into_phred_score($_);
    my $phred33_char = convert_phred_score_into_phred33_quality_string($phred_score);
    $phred33_char;
  } split(//,$qual);

  return join("",@phred33_chars);
}
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3277
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
### Turns a numeric Phred score into its Phred33 quality character, i.e. the character with code (score + 33)
sub convert_phred_score_into_phred33_quality_string{
  my ($phred_score) = @_;
  return chr($phred_score + 33);
}
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3283
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
### Turns a Phred64 quality character into its numeric Phred score, i.e. (character code - 64)
sub convert_phred64_quality_string_into_phred_score{
  my ($quality_char) = @_;
  return ord($quality_char) - 64;
}
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3289
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
### Turns a Solexa (pre 1.3) quality character into a numeric score, i.e. (character code - 59).
### A fixed offset of 59 is used on purpose instead of an exact Solexa -> Phred conversion: all Phred Scores
### between 10 and 40 look exactly the same, there is only a minute difference for values between 0 and 10.
### NOTE(review): Solexa quality strings are usually described as using an offset of 64 with a minimum
### score of -5 - please confirm that subtracting 59 across the whole range is the intended behaviour
sub convert_solexa_pre1_3_quality_string_into_phred_score{
  my ($quality_char) = @_;
  return ord($quality_char) - 59;
}
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3296
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3297
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
### Works out, for a single-end alignment, which of the 4 possible bisulfite strands the read came from and
### extracts the matching unconverted genomic sequence from %chromosomes.
###
### Reads index, chromosome, position and bowtie_sequence from $methylation_call_params->{$sequence_identifier}
### and stores alignment_strand, read_conversion, genome_conversion, unmodified_genomic_sequence and
### end_position (position + read length) back into the same entry. The matching counter in %counting is
### incremented as well. Dies if the index is not one of 0, 1, 2 or 3.
###
### The genomic sequence is extracted with 2 extra bases so that a CpG/CHG/CHH context call is also possible
### for a C at the very end of the read. If these 2 bases would fall off the edge of the chromosome the
### genomic sequence is left empty ('').
sub extract_corresponding_genomic_sequence_single_end {
  my ($sequence_identifier,$methylation_call_params) = @_;

  ### the index is read through the full path first so that the entry for this read exists before we keep a reference to it
  my $index     = $methylation_call_params->{$sequence_identifier}->{index};
  my $alignment = $methylation_call_params->{$sequence_identifier};

  my $chromosome  = $alignment->{chromosome};
  my $position    = $alignment->{position};
  my $read_length = length($alignment->{bowtie_sequence});

  ### the alignment_strand information is needed to determine which strand of the genomic sequence we are comparing the read against,
  ### the read_conversion information is needed to know whether we are looking for C->T or G->A substitutions.
  ### $extra_bases_upstream: the 2 extra bases are taken in front of the alignment start (otherwise behind the alignment end)
  ### $needs_reverse_complement: the extracted forward strand sequence has to be reverse complemented
  my ($counter,$alignment_strand,$read_conversion_info,$genome_conversion,$extra_bases_upstream,$needs_reverse_complement);

  if ($index == 0){
    ### CT converted read vs. CT converted genome [Index 0, sequence originated from (converted) forward strand]
    ($counter,$alignment_strand,$read_conversion_info,$genome_conversion,$extra_bases_upstream,$needs_reverse_complement)
      = ('CT_CT_count','+','CT','CT',0,0);
  }
  elsif ($index == 1){
    ### CT converted read vs. GA converted genome [Index 1, sequence originated from (converted) reverse strand]
    ### the 2 extra 5' bases on the forward strand become 2 extra 3' bases after reverse complementation
    ($counter,$alignment_strand,$read_conversion_info,$genome_conversion,$extra_bases_upstream,$needs_reverse_complement)
      = ('CT_GA_count','-','CT','GA',1,1);
  }
  elsif ($index == 2){
    ### GA converted read vs. CT converted genome [Index 2, sequence originated from complementary to (converted) forward strand]
    ### the 2 extra 3' bases on the forward strand become 2 extra 5' bases after reverse complementation
    ($counter,$alignment_strand,$read_conversion_info,$genome_conversion,$extra_bases_upstream,$needs_reverse_complement)
      = ('GA_CT_count','-','GA','CT',0,1);
  }
  elsif ($index == 3){
    ### GA converted read vs. GA converted genome [Index 3, sequence originated from complementary to (converted) reverse strand]
    ($counter,$alignment_strand,$read_conversion_info,$genome_conversion,$extra_bases_upstream,$needs_reverse_complement)
      = ('GA_GA_count','+','GA','GA',1,0);
  }
  else{
    die "Too many bowtie result filehandles\n";
  }

  $counting{$counter}++;

  ### checking if the substring will be valid or if we can't extract the sequence because we are right at the edge of a chromosome
  my $non_bisulfite_sequence = '';
  my $sequence_was_extracted = 0;

  if ($extra_bases_upstream){
    if ($position-2 >= 0){
      $non_bisulfite_sequence = substr($chromosomes{$chromosome},$position-2,$read_length+2);
      $sequence_was_extracted = 1;
    }
  }
  elsif (length($chromosomes{$chromosome}) > $position+$read_length+1){
    $non_bisulfite_sequence = substr($chromosomes{$chromosome},$position,$read_length+2);
    $sequence_was_extracted = 1;
  }

  ### only a sequence that was actually extracted gets reverse complemented
  if ($sequence_was_extracted and $needs_reverse_complement){
    $non_bisulfite_sequence = reverse_complement($non_bisulfite_sequence);
  }

  $alignment->{alignment_strand}            = $alignment_strand;
  $alignment->{read_conversion}             = $read_conversion_info;
  $alignment->{genome_conversion}           = $genome_conversion;
  $alignment->{unmodified_genomic_sequence} = $non_bisulfite_sequence;

  ### at this point we can also determine the end position of a read
  $alignment->{end_position} = $position+$read_length;
}
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3401
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3402
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3403 sub extract_corresponding_genomic_sequence_single_end_bowtie2{
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3404 my ($sequence_identifier,$methylation_call_params) = @_;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3405
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3406 my $MD_tag = $methylation_call_params->{$sequence_identifier}->{mismatch_info};
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3407 my $cigar = $methylation_call_params->{$sequence_identifier}->{CIGAR};
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3408
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3409 ### A bisulfite sequence for 1 location in the genome can theoretically be any of the 4 possible converted strands. We are also giving the
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3410 ### sequence a 'memory' of the conversion we are expecting which we will need later for the methylation call
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3411
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3412 ### the alignment_strand information is needed to determine which strand of the genomic sequence we are comparing the read against,
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3413 ### the read_conversion information is needed to know whether we are looking for C->T or G->A substitutions
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3414 my $alignment_strand;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3415 my $read_conversion_info;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3416 my $genome_conversion;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3417 ### We are now extracting the corresponding genomic sequence, +2 extra bases at the end (or start) so that we can also make a CpG methylation call and
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3418 ### in addition make differential calls for Cs in CHG or CHH context if the C happens to be at the last (or first) position of the actually observed sequence
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3419 my $non_bisulfite_sequence = '';
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3420
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3421 ### Positions in SAM format are 1 based, so we need to subract 1 when getting substrings
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3422 my $pos = $methylation_call_params->{$sequence_identifier}->{position}-1;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3423
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3424 # parsing CIGAR string
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3425 my @len = split (/\D+/,$cigar); # storing the length per operation
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3426 my @ops = split (/\d+/,$cigar); # storing the operation
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3427 shift @ops; # remove the empty first element
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3428 die "CIGAR string contained a non-matching number of lengths and operations\n" unless (scalar @len == scalar @ops);
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3429
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3430 ### If the sequence aligns best as CT converted reads vs. GA converted genome (OB, index 1) or GA converted reads vs. GA converted genome (CTOB, index 3)
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3431 if ( ($methylation_call_params->{$sequence_identifier}->{index} == 1) or ($methylation_call_params->{$sequence_identifier}->{index} == 3) ){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3432 ## checking if the substring will be valid or if we can't extract the sequence because we are right at the edge of a chromosome
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3433 unless ( ($pos-2) >= 0){ # exiting with en empty genomic sequence otherwise
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3434 $methylation_call_params->{$sequence_identifier}->{unmodified_genomic_sequence} = $non_bisulfite_sequence;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3435 return;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3436 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3437 $non_bisulfite_sequence .= substr ($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}},$pos-2,2);
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3438 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3439 my $indels = 0;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3440
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3441 foreach (0..$#len){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3442 if ($ops[$_] eq 'M'){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3443 #extracting genomic sequence
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3444 $non_bisulfite_sequence .= substr ($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}},$pos,$len[$_]);
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3445 # adjusting position
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3446 $pos += $len[$_];
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3447 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3448 elsif ($ops[$_] eq 'I'){ # insertion in the read sequence
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3449 # we simply add padding Ns instead of finding genomic sequence. This will not be used to infer methylation calls
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3450 $non_bisulfite_sequence .= 'N' x $len[$_];
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3451 # warn "$non_bisulfite_sequence\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3452 # position doesn't need to be adjusted
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3453 $indels += $len[$_]; # adding this to $indels so we can determine the hemming distance for the SAM output (= single-base substitutions (mismatches, insertions, deletions)
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3454 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3455 elsif ($ops[$_] eq 'D'){ # deletion in the read sequence
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3456 # we do not add any genomic sequence but only adjust the position
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3457 $pos += $len[$_];
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3458 $indels += $len[$_]; # adding this to $indels so we can determine the hemming distance for the SAM output (= single-base substitutions (mismatches, insertions, deletions)
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3459 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3460 elsif($cigar =~ tr/[NSHPX=]//){ # if these (for standard mapping) illegal characters exist we die
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3461 die "The CIGAR string contained illegal CIGAR operations in addition to 'M', 'I' and 'D': $cigar\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3462 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3463 else{
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3464 die "The CIGAR string contained undefined CIGAR operations in addition to 'M', 'I' and 'D': $cigar\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3465 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3466 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3467
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3468 ### If the sequence aligns best as CT converted reads vs. CT converted genome (OT, index 0) or GA converted reads vs. CT converted genome (CTOT, index 2)
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3469 if ( ($methylation_call_params->{$sequence_identifier}->{index} == 0) or ($methylation_call_params->{$sequence_identifier}->{index} == 2) ){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3470 ## checking if the substring will be valid or if we can't extract the sequence because we are right at the edge of a chromosome
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3471 unless (length($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}}) >= $pos+2){ # exiting with en empty genomic sequence otherwise
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3472 $methylation_call_params->{$sequence_identifier}->{unmodified_genomic_sequence} = $non_bisulfite_sequence;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3473 return;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3474 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3475 $non_bisulfite_sequence .= substr ($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}},$pos,2);
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3476 # print "$methylation_call_params->{$sequence_identifier}->{bowtie_sequence}\n$non_bisulfite_sequence\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3477 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3478
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3479
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3480
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3481 ### results from CT converted read vs. CT converted genome (+ orientation alignments are reported only)
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3482 if ($methylation_call_params->{$sequence_identifier}->{index} == 0){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3483 ### [Index 0, sequence originated from (converted) forward strand]
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3484 $counting{CT_CT_count}++;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3485 $alignment_strand = '+';
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3486 $read_conversion_info = 'CT';
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3487 $genome_conversion = 'CT';
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3488 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3489
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3490 ### results from CT converted reads vs. GA converted genome (- orientation alignments are reported only)
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3491 elsif ($methylation_call_params->{$sequence_identifier}->{index} == 1){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3492 ### [Index 1, sequence originated from (converted) reverse strand]
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3493 $counting{CT_GA_count}++;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3494 $alignment_strand = '-';
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3495 $read_conversion_info = 'CT';
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3496 $genome_conversion = 'GA';
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3497
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3498 ### reverse complement!
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3499 $non_bisulfite_sequence = reverse_complement($non_bisulfite_sequence);
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3500 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3501
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3502 ### results from GA converted reads vs. CT converted genome (- orientation alignments are reported only)
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3503 elsif ($methylation_call_params->{$sequence_identifier}->{index} == 2){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3504 ### [Index 2, sequence originated from complementary to (converted) forward strand]
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3505 $counting{GA_CT_count}++;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3506 $alignment_strand = '-';
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3507 $read_conversion_info = 'GA';
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3508 $genome_conversion = 'CT';
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3509
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3510 ### reverse complement!
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3511 $non_bisulfite_sequence = reverse_complement($non_bisulfite_sequence);
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3512 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3513
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3514 ### results from GA converted reads vs. GA converted genome (+ orientation alignments are reported only)
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3515 elsif ($methylation_call_params->{$sequence_identifier}->{index} == 3){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3516 ### [Index 3, sequence originated from complementary to (converted) reverse strand]
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3517 $counting{GA_GA_count}++;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3518 $alignment_strand = '+';
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3519 $read_conversion_info = 'GA';
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3520 $genome_conversion = 'GA';
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3521
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3522 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3523 else{
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3524 die "Too many Bowtie 2 result filehandles\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3525 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3526
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3527 $methylation_call_params->{$sequence_identifier}->{alignment_strand} = $alignment_strand;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3528 $methylation_call_params->{$sequence_identifier}->{read_conversion} = $read_conversion_info;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3529 $methylation_call_params->{$sequence_identifier}->{genome_conversion} = $genome_conversion;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3530 $methylation_call_params->{$sequence_identifier}->{unmodified_genomic_sequence} = $non_bisulfite_sequence;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3531
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3532 ### the end position of a read is stored in $pos
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3533 $methylation_call_params->{$sequence_identifier}->{end_position} = $pos;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3534 $methylation_call_params->{$sequence_identifier}->{indels} = $indels;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3535 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3536
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3537 ### METHYLATION CALL
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3538
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3539 sub methylation_call{
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3540 my ($identifier,$sequence_actually_observed,$genomic_sequence,$read_conversion) = @_;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3541 ### splitting both the actually observed sequence and the genomic sequence up into single bases so we can compare them one by one
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3542 my @seq = split(//,$sequence_actually_observed);
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3543 my @genomic = split(//,$genomic_sequence);
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3544 # print join ("\n",$identifier,$sequence_actually_observed,$genomic_sequence,$read_conversion),"\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3545 ### Creating a match-string with different characters for non-cytosine bases (disregarding mismatches here), methyl-Cs or non-methyl Cs in either
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3546 ### CpG, CHH or CHG context
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3547
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3548 #################################################################
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3549 ### . for bases not involving cytosines ###
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3550 ### X for methylated C in CHG context (was protected) ###
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3551 ### x for not methylated C in CHG context (was converted) ###
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3552 ### H for methylated C in CHH context (was protected) ###
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3553 ### h for not methylated C in CHH context (was converted) ###
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3554 ### Z for methylated C in CpG context (was protected) ###
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3555 ### z for not methylated C in CpG context (was converted) ###
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3556 #################################################################
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3557
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3558 my @match =();
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3559 warn "length of \@seq: ",scalar @seq,"\tlength of \@genomic: ",scalar @genomic,"\n" unless (scalar @seq eq (scalar@genomic-2)); ## CHH changed to -2
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3560 my $methyl_CHH_count = 0;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3561 my $methyl_CHG_count = 0;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3562 my $methyl_CpG_count = 0;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3563 my $unmethylated_CHH_count = 0;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3564 my $unmethylated_CHG_count = 0;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3565 my $unmethylated_CpG_count = 0;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3566
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3567 if ($read_conversion eq 'CT'){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3568 for my $index (0..$#seq) {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3569 if ($seq[$index] eq $genomic[$index]) {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3570 ### The residue can only be a C if it was not converted to T, i.e. protected by methylation
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3571 if ($genomic[$index] eq 'C') {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3572 ### If the residue is a C we want to know if it was in CpG context or in any other context
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3573 my $downstream_base = $genomic[$index+1];
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3574
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3575 if ($downstream_base eq 'G'){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3576 ++$methyl_CpG_count;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3577 push @match,'Z'; # protected C, methylated, in CpG context
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3578 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3579
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3580 else {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3581 ### C is not in CpG-context, determining the second downstream base context
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3582 my $second_downstream_base = $genomic[$index+2];
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3583
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3584 if ($second_downstream_base eq 'G'){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3585 ++$methyl_CHG_count;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3586 push @match,'X'; # protected C, methylated, in CHG context
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3587 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3588 else{
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3589 ++$methyl_CHH_count;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3590 push @match,'H'; # protected C, methylated, in CHH context
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3591 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3592 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3593 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3594 else {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3595 push @match, '.';
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3596 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3597 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3598 elsif ($seq[$index] ne $genomic[$index]) {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3599 ### for the methylation call we are only interested in mismatches involving cytosines (in the genomic sequence) which were converted into Ts
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3600 ### in the actually observed sequence
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3601 if ($genomic[$index] eq 'C' and $seq[$index] eq 'T') {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3602 ### If the residue was converted to T we want to know if it was in CpG, CHG or CHH context
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3603 my $downstream_base = $genomic[$index+1];
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3604
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3605 if ($downstream_base eq 'G'){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3606 ++$unmethylated_CpG_count;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3607 push @match,'z'; # converted C, not methylated, in CpG context
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3608 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3609
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3610 else{
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3611 ### C is not in CpG-context, determining the second downstream base context
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3612 my $second_downstream_base = $genomic[$index+2];
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3613
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3614 if ($second_downstream_base eq 'G'){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3615 ++$unmethylated_CHG_count;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3616 push @match,'x'; # converted C, not methylated, in CHG context
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3617 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3618 else{
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3619 ++$unmethylated_CHH_count;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3620 push @match,'h'; # converted C, not methylated, in CHH context
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3621 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3622 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3623 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3624 ### all other mismatches are not of interest for a methylation call
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3625 else {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3626 push @match,'.';
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3627 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3628 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3629 else{
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3630 die "There can be only 2 possibilities\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3631 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3632 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3633 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3634 elsif ($read_conversion eq 'GA'){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3635 # print join ("\n",'***',$identifier,$sequence_actually_observed,$genomic_sequence,$read_conversion,'***'),"\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3636
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3637 for my $index (0..$#seq) {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3638 if ($seq[$index] eq $genomic[$index+2]) {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3639 ### The residue can only be a G if the C on the other strand was not converted to T, i.e. protected by methylation
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3640 if ($genomic[$index+2] eq 'G') {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3641 ### If the residue is a G we want to know if the C on the other strand was in CpG, CHG or CHH context, therefore we need
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3642 ### to look if the base upstream is a C
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3643
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3644 my $upstream_base = $genomic[$index+1];
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3645
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3646 if ($upstream_base eq 'C'){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3647 ++$methyl_CpG_count;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3648 push @match,'Z'; # protected C on opposing strand, methylated, in CpG context
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3649 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3650
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3651 else{
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3652 ### C is not in CpG-context, determining the second upstream base context
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3653 my $second_upstream_base = $genomic[$index];
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3654
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3655 if ($second_upstream_base eq 'C'){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3656 ++$methyl_CHG_count;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3657 push @match,'X'; # protected C on opposing strand, methylated, in CHG context
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3658 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3659 else{
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3660 ++$methyl_CHH_count;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3661 push @match,'H'; # protected C on opposing strand, methylated, in CHH context
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3662 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3663 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3664 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3665 else{
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3666 push @match, '.';
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3667 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3668 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3669 elsif ($seq[$index] ne $genomic[$index+2]) {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3670 ### for the methylation call we are only interested in mismatches involving cytosines (in the genomic sequence) which were converted to Ts
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3671 ### on the opposing strand, so G to A conversions in the actually observed sequence
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3672 if ($genomic[$index+2] eq 'G' and $seq[$index] eq 'A') {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3673 ### If the C residue on the opposing strand was converted to T then we will see an A in the currently observed sequence. We want to know if
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3674 ### the C on the opposing strand was in CpG, CHG or CHH context, therefore we need to look one (or two) bases upstream!
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3675
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3676 my $upstream_base = $genomic[$index+1];
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3677
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3678 if ($upstream_base eq 'C'){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3679 ++$unmethylated_CpG_count;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3680 push @match,'z'; # converted C on opposing strand, not methylated, in CpG context
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3681 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3682
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3683 else{
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3684 ### C is not in CpG-context, determining the second upstream base context
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3685 my $second_upstream_base = $genomic[$index];
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3686
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3687 if ($second_upstream_base eq 'C'){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3688 ++$unmethylated_CHG_count;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3689 push @match,'x'; # converted C on opposing strand, not methylated, in CHG context
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3690 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3691 else{
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3692 ++$unmethylated_CHH_count;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3693 push @match,'h'; # converted C on opposing strand, not methylated, in CHH context
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3694 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3695 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3696 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3697 ### all other mismatches are not of interest for a methylation call
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3698 else {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3699 push @match,'.';
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3700 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3701 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3702 else{
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3703 die "There can be only 2 possibilities\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3704 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3705 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3706 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3707 else{
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3708 die "Strand conversion info is required to perform a methylation call\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3709 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3710
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3711 my $methylation_call = join ("",@match);
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3712
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3713 $counting{total_meCHH_count} += $methyl_CHH_count;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3714 $counting{total_meCHG_count} += $methyl_CHG_count;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3715 $counting{total_meCpG_count} += $methyl_CpG_count;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3716 $counting{total_unmethylated_CHH_count} += $unmethylated_CHH_count;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3717 $counting{total_unmethylated_CHG_count} += $unmethylated_CHG_count;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3718 $counting{total_unmethylated_CpG_count} += $unmethylated_CpG_count;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3719
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3720 # print "\n$sequence_actually_observed\n$genomic_sequence\n",@match,"\n$read_conversion\n\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3721 return $methylation_call;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3722 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3723
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
### Reads every FastA file found in $genome_folder (files ending in .fa, or - only if there are none -
### files ending in .fasta) and stores the upper-cased sequence of each entry in the %chromosomes hash,
### using the chromosome name returned by extract_chromosome_name() as key. Multi-FastA files are
### supported, but every chromosome name has to be unique across all files.
###
### Argument: the directory to change back to once the genome has been read.
### Dies if the genome folder can't be entered, if it doesn't contain any FastA files, if a file can't
### be opened or is empty, if a header line is not in FastA format, or if a chromosome name is seen twice.
sub read_genome_into_memory{
  ## working directory we return to once the genome has been read
  my $cwd = shift;

  ## reading in and storing the specified genome in the %chromosomes hash
  chdir ($genome_folder) or die "Can't move to $genome_folder: $!";
  print "Now reading in and storing sequence information of the genome specified in: $genome_folder\n\n";

  my @chromosome_filenames = <*.fa>;

  ### if there aren't any genomic files with the extension .fa we will look for files with the extension .fasta
  unless (@chromosome_filenames){
    @chromosome_filenames = <*.fasta>;
  }

  unless (@chromosome_filenames){
    die "The specified genome folder $genome_folder does not contain any sequence files in FastA format (with .fa or .fasta file extensions)\n";
  }

  foreach my $chromosome_filename (@chromosome_filenames){

    ### 3-argument open on a lexical filehandle, so unusual file names can't be mistaken for an open mode
    open (my $chr_in,'<',$chromosome_filename) or die "Failed to read from sequence file $chromosome_filename $!\n";

    ### first line needs to be a fastA header; an empty file doesn't even have that
    my $first_line = <$chr_in>;
    unless (defined $first_line){
      die "The sequence file $chromosome_filename is empty, it doesn't seem to be in FASTA format as required!\n";
    }
    chomp $first_line;
    $first_line =~ s/\r//;

    ### Extracting chromosome name from the FastA header
    my $chromosome_name = extract_chromosome_name($first_line);

    ### starting out with an empty string (and not undef) so that entries without any sequence lines
    ### are reported as 0 bp instead of triggering 'uninitialized value' warnings
    my $sequence = '';

    while (my $line = <$chr_in>){
      chomp $line;
      $line =~ s/\r//;

      if ($line =~ /^>/){
        ### storing the previous chromosome in the %chromosomes hash, only relevant for Multi-Fasta-Files (MFA)
        if (exists $chromosomes{$chromosome_name}){
          print "chr $chromosome_name (",length($sequence)," bp)\n";
          die "Exiting because chromosome name already exists. Please make sure all chromosomes have a unique name!\n";
        }
        else {
          if (length($sequence) == 0){
            warn "Chromosome $chromosome_name in the multi-fasta file $chromosome_filename did not contain any sequence information!\n";
          }
          print "chr $chromosome_name (",length($sequence)," bp)\n";
          $chromosomes{$chromosome_name} = $sequence;
        }
        ### resetting the sequence variable
        $sequence = '';
        ### setting new chromosome name
        $chromosome_name = extract_chromosome_name($line);
      }
      else{
        $sequence .= uc($line);
      }
    }

    ### we are done with this file; a failure to close a read-only handle is reported but not fatal
    close ($chr_in) or warn "Failed to close sequence file $chromosome_filename: $!\n";

    ### storing the last (or only) chromosome of the file
    if (exists $chromosomes{$chromosome_name}){
      print "chr $chromosome_name (",length($sequence)," bp)\t";
      die "Exiting because chromosome name already exists. Please make sure all chromosomes have a unique name.\n";
    }
    else{
      if (length($sequence) == 0){
        warn "Chromosome $chromosome_name in the file $chromosome_filename did not contain any sequence information!\n";
      }
      print "chr $chromosome_name (",length($sequence)," bp)\n";
      $chromosomes{$chromosome_name} = $sequence;
    }
  }
  print "\n";
  chdir $cwd or die "Failed to move to directory $cwd\n";
}
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3795
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
### Returns the chromosome name of a FastA header line, i.e. everything between the initial '>' and the
### first whitespace. Bowtie seems to use the first string after the initial '>' as well, so the names
### stay in sync. Dies if the line doesn't start with '>'.
sub extract_chromosome_name {
  my ($header_line) = @_;

  ## anything that doesn't start with a '>' is not a FastA header
  unless ($header_line =~ s/^>//){
    die "The specified chromosome ($header_line) file doesn't seem to be in FASTA format as required!\n";
  }

  ## only the first whitespace-delimited field is of interest
  my ($first_field) = split (/\s+/,$header_line);
  return $first_field;
}
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3807
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
### Returns the reverse complement of a DNA sequence. Only the upper-case bases A, C, G and T are
### complemented, any other character (e.g. N) just ends up at its mirrored position.
sub reverse_complement{
  my ($forward) = @_;

  ## reversing first and complementing afterwards gives the same result as the other way round
  my $revcomp = reverse ($forward);
  $revcomp =~ tr/ACGT/TGCA/;

  return $revcomp;
}
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3814
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
### Writes bisulfite converted versions of a single-end FastA read file (plain or gzipped) to $temp_dir:
### always a C -> T converted version, and unless $directional is set a G -> A converted version as well.
### Reads are expected as one header line followed by one sequence line. Sequences are upper-cased,
### read IDs are passed through fix_IDs(), and $skip / $upto restrict which reads are written out.
###
### Argument: the input file (may include a path, may end in .gz).
### Returns:  the names (without $temp_dir) of the C -> T and the G -> A file. The G -> A name is
###           returned even if the file wasn't written because of $directional.
### Dies if a file can't be opened, if an output file can't be written completely, or if a header
### line doesn't start with '>'.
sub biTransformFastAFiles {
  my $file = shift;

  ### the converted files are named after the input file without its directory part
  my $filename;
  if ($file =~ /\//){
    ($filename) = $file =~ m/.*\/(.*)$/;
  }
  else{
    $filename = $file;
  }

  my $in;
  ### gzipped version of the infile
  if ($file =~ /\.gz$/){
    ### list form of the pipe open: the file name is handed to zcat as it is and is never seen by a
    ### shell, so blanks or shell metacharacters in the name can't do any harm
    open ($in,'-|','zcat',$file) or die "Couldn't read from file $file: $!\n";
  }
  else{
    open ($in,'<',$file) or die "Couldn't read from file $file: $!\n";
  }

  if ($skip){
    warn "Skipping the first $skip reads from $file\n";
    sleep (1);
  }
  if ($upto){
    warn "Processing reads up to sequence no. $upto from $file\n";
    sleep (1);
  }

  my $C_to_T_infile = my $G_to_A_infile = $filename;
  $C_to_T_infile =~ s/$/_C_to_T.fa/;
  $G_to_A_infile =~ s/$/_G_to_A.fa/;
  print "Writing a C -> T converted version of the input file $filename to $temp_dir$C_to_T_infile\n";
  open (my $ctot,'>',"$temp_dir$C_to_T_infile") or die "Couldn't write to file $!\n";

  my $gtoa;
  unless ($directional){
    print "Writing a G -> A converted version of the input file $filename to $temp_dir$G_to_A_infile\n";
    open ($gtoa,'>',"$temp_dir$G_to_A_infile") or die "Couldn't write to file $!\n";
  }

  my $count = 0;
  while (1){
    my $header = <$in>;
    my $sequence= <$in>;
    last unless ($header and $sequence);

    $header = fix_IDs($header); # this is to avoid problems with truncated read ID when they contain white spaces

    ++$count;

    if ($skip){
      next unless ($count > $skip);
    }
    if ($upto){
      last if ($count > $upto);
    }

    $sequence = uc$sequence; # make input file case insensitive

    # detecting if the input file contains tab stops, as this is likely to result in no alignments
    if (index($header,"\t") != -1){
      $seqID_contains_tabs++;
    }

    ### small check if the sequence seems to be in FastA format
    die "Input file doesn't seem to be in FastA format at sequence $count: $!\n" unless ($header =~ /^>.*/);

    ### header and sequence still carry their line endings, so they are printed as they are
    my $sequence_C_to_T = $sequence;
    $sequence_C_to_T =~ tr/C/T/;
    print {$ctot} "$header$sequence_C_to_T";

    unless ($directional){
      my $sequence_G_to_A = $sequence;
      $sequence_G_to_A =~ tr/G/A/;
      print {$gtoa} "$header$sequence_G_to_A";
    }
  }

  ### The return value of this close is ignored on purpose: if we stopped reading early because of
  ### $upto, zcat gets terminated by a SIGPIPE and the close of the pipe reports a failure
  close ($in);

  ### The converted files are used by the caller right after we return, so they have to be flushed
  ### and closed here. Errors of buffered writes (e.g. disk full) only show up at close time
  close ($ctot) or die "Failed to write to file $temp_dir$C_to_T_infile: $!\n";
  unless ($directional){
    close ($gtoa) or die "Failed to write to file $temp_dir$G_to_A_infile: $!\n";
  }

  if ($directional){
    print "\nCreated C -> T converted versions of the FastA file $filename ($count sequences in total)\n\n";
  }
  else{
    print "\nCreated C -> T as well as G -> A converted versions of the FastA file $filename ($count sequences in total)\n\n";
  }
  return ($C_to_T_infile,$G_to_A_infile);
}
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3898
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3899 sub biTransformFastAFiles_paired_end {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3900 my ($file,$read_number) = @_;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3901
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3902 my ($dir,$filename);
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3903 if ($file =~ /\//){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3904 ($dir,$filename) = $file =~ m/(.*\/)(.*)$/;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3905 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3906 else{
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3907 $filename = $file;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3908 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3909
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3910 ### gzipped version of the infile
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3911 if ($file =~ /\.gz$/){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3912 open (IN,"zcat $file |") or die "Couldn't read from file $file: $!\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3913 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3914 else{
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3915 open (IN,$file) or die "Couldn't read from file $file: $!\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3916 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3917
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3918 if ($skip){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3919 warn "Skipping the first $skip reads from $file\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3920 sleep (1);
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3921 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3922 if ($upto){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3923 warn "Processing reads up to sequence no. $upto from $file\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3924 sleep (1);
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3925 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3926
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3927 my $C_to_T_infile = my $G_to_A_infile = $filename;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3928 $C_to_T_infile =~ s/$/_C_to_T.fa/;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3929 $G_to_A_infile =~ s/$/_G_to_A.fa/;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3930
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3931 if ($directional){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3932 if ($read_number == 1){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3933 print "Writing a C -> T converted version of the input file $filename to $temp_dir$C_to_T_infile\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3934 open (CTOT,'>',"$temp_dir$C_to_T_infile") or die "Couldn't write to file $!\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3935 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3936 elsif ($read_number == 2){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3937 print "Writing a G -> A converted version of the input file $filename to $temp_dir$G_to_A_infile\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3938 open (GTOA,'>',"$temp_dir$G_to_A_infile") or die "Couldn't write to file $!\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3939 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3940 else{
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3941 die "Read number needs to be 1 or 2, but was: $read_number\n\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3942 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3943 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3944 else{ # all four strand output
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3945 print "Writing a C -> T converted version of the input file $filename to $temp_dir$C_to_T_infile\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3946 print "Writing a G -> A converted version of the input file $filename to $temp_dir$G_to_A_infile\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3947 open (CTOT,'>',"$temp_dir$C_to_T_infile") or die "Couldn't write to file $!\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3948 open (GTOA,'>',"$temp_dir$G_to_A_infile") or die "Couldn't write to file $!\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3949 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3950
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3951 my $count = 0;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3952
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3953 while (1){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3954 my $header = <IN>;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3955 my $sequence= <IN>;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3956 last unless ($header and $sequence);
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3957
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3958 $header = fix_IDs($header); # this is to avoid problems with truncated read ID when they contain white spaces
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3959
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3960 ++$count;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3961
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3962 if ($skip){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3963 next unless ($count > $skip);
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3964 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3965 if ($upto){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3966 last if ($count > $upto);
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3967 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3968
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3969 $sequence = uc$sequence; # make input file case insensitive
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3970
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3971 # detecting if the input file contains tab stops, as this is likely to result in no alignments
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3972 if (index($header,"\t") != -1){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3973 $seqID_contains_tabs++;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3974 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3975
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3976 ## small check if the sequence seems to be in FastA format
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3977 die "Input file doesn't seem to be in FastA format at sequence $count: $!\n" unless ($header =~ /^>.*/);
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3978
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3979 if ($read_number == 1){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3980 if ($bowtie2){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3981 $header =~ s/$/\/1\/1/;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3982 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3983 else{
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3984 $header =~ s/$/\/1/;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3985 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3986 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3987 elsif ($read_number == 2){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3988 if ($bowtie2){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3989 $header =~ s/$/\/2\/2/;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3990 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3991 else{
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3992 $header =~ s/$/\/2/;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3993 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3994 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3995 else{
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3996 die "Read number needs to be 1 or 2, but was: $read_number\n\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3997 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3998 my $sequence_C_to_T = my $sequence_G_to_A = $sequence;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3999
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
4000 $sequence_C_to_T =~ tr/C/T/;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
4001 $sequence_G_to_A =~ tr/G/A/;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
4002
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
4003 if ($directional){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
4004
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
4005 if ($read_number == 1){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
4006 print CTOT "$header$sequence_C_to_T";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
4007 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
4008 elsif ($read_number == 2){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
4009 print GTOA "$header$sequence_G_to_A";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
4010 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
4011 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
4012 else{
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
4013 print CTOT "$header$sequence_C_to_T";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
4014 print GTOA "$header$sequence_G_to_A";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
4015 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
4016 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
4017
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
4018 if ($directional){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
4019 if ($read_number == 1){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
4020 print "\nCreated C -> T converted version of the FastA file $filename ($count sequences in total)\n\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
4021 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
4022 else{
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
4023 print "\nCreated G -> A converted version of the FastA file $filename ($count sequences in total)\n\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
4024 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
4025 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
4026 else{
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
4027 print "\nCreated C -> T as well as G -> A converted versions of the FastA file $filename ($count sequences in total)\n\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
4028 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
4029
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
4030 if ($directional){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
4031 if ($read_number == 1){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
4032 return ($C_to_T_infile);
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
4033 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
4034 else{
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
4035 return ($G_to_A_infile);
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
4036 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
4037 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
4038 else{
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
4039 return ($C_to_T_infile,$G_to_A_infile);
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
4040 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
4041 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
4042
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
4043
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
### Writes bisulfite converted copies of a single-end FastQ file into $temp_dir.
###
### Argument:  path of the FastQ file (a name ending in .gz is read through zcat)
### Globals:   reads $skip, $upto, $directional and $temp_dir; may increment $seqID_contains_tabs
### Output:    <filename>_C_to_T.fastq is always written, <filename>_G_to_A.fastq only
###            when $directional is not set
### Returns:   ($C_to_T_infile,$G_to_A_infile) as file names without the $temp_dir prefix.
###            In directional mode the second element is just the unmodified input file
###            name, as no G -> A file is written in that case
### Dies if a file cannot be opened, if a record does not look like FastQ, or if the
### converted output cannot be written out completely
sub biTransformFastQFiles {
  my $file = shift;

  ### the converted files are named after the file name without any leading path
  my $filename;
  if ($file =~ /\//){
    (undef,$filename) = $file =~ m/(.*\/)(.*)$/;
  }
  else{
    $filename = $file;
  }

  ### gzipped version of the infile
  if ($file =~ /\.gz$/){
    # list form of a piped open: the file name is handed to zcat as it is and never parsed by a shell
    open (IN,'-|','zcat',$file) or die "Couldn't read from file $file: $!\n";
  }
  else{
    # explicit read mode, so that characters such as '>' or '|' in the file name are not taken as an open mode
    open (IN,'<',$file) or die "Couldn't read from file $file: $!\n";
  }

  if ($skip){
    warn "Skipping the first $skip reads from $file\n";
    sleep (1);
  }
  if ($upto){
    warn "Processing reads up to sequence no. $upto from $file\n";
    sleep (1);
  }

  my $C_to_T_infile = my $G_to_A_infile = $filename;

  $C_to_T_infile =~ s/$/_C_to_T.fastq/;
  print "Writing a C -> T converted version of the input file $filename to $temp_dir$C_to_T_infile\n";
  open (CTOT,'>',"$temp_dir$C_to_T_infile") or die "Couldn't write to file $!\n";

  unless ($directional){
    $G_to_A_infile =~ s/$/_G_to_A.fastq/;
    print "Writing a G -> A converted version of the input file $filename to $temp_dir$G_to_A_infile\n";
    open (GTOA,'>',"$temp_dir$G_to_A_infile") or die "Couldn't write to file $!\n";
  }

  my $count = 0;
  while (1){
    ### a FastQ record consists of 4 lines; an incomplete record at the end of the file ends the loop
    my $identifier = <IN>;
    my $sequence = <IN>;
    my $identifier2 = <IN>;
    my $quality_score = <IN>;
    last unless ($identifier and $sequence and $identifier2 and $quality_score);

    $identifier = fix_IDs($identifier); # this is to avoid problems with truncated read ID when they contain white spaces

    ++$count;

    if ($skip){
      next unless ($count > $skip);
    }
    if ($upto){
      last if ($count > $upto);
    }

    $sequence = uc$sequence; # make input file case insensitive

    # detecting if the input file contains tab stops, as this is likely to result in no alignments
    # NOTE(review): fix_IDs() above has already replaced tabs with underscores, so this test
    # looks like it can no longer be true - confirm whether it was meant to run before fix_IDs()
    if (index($identifier,"\t") != -1){
      $seqID_contains_tabs++;
    }

    ## small check if the sequence file appears to be a FastQ file
    if ($identifier !~ /^\@/ or $identifier2 !~ /^\+/){
      die "Input file doesn't seem to be in FastQ format at sequence $count: $!\n";
    }

    my $sequence_C_to_T = $sequence;
    $sequence_C_to_T =~ tr/C/T/;
    print CTOT join ('',$identifier,$sequence_C_to_T,$identifier2,$quality_score);

    unless ($directional){
      my $sequence_G_to_A = $sequence;
      $sequence_G_to_A =~ tr/G/A/;
      print GTOA join ('',$identifier,$sequence_G_to_A,$identifier2,$quality_score);
    }
  }

  ### The status of the input handle is deliberately not checked: leaving the loop early
  ### because of $upto makes a zcat child terminate abnormally, which is expected
  close IN;

  ### Buffered write errors (e.g. a full disk) only show up when the handle is closed, so
  ### the converted files are closed and checked here before anything else gets to use them
  close CTOT or die "Failed to finish writing to $temp_dir$C_to_T_infile: $!\n";
  unless ($directional){
    close GTOA or die "Failed to finish writing to $temp_dir$G_to_A_infile: $!\n";
  }

  if ($directional){
    print "\nCreated C -> T converted versions of the FastQ file $filename ($count sequences in total)\n\n";
  }
  else{
    print "\nCreated C -> T as well as G -> A converted versions of the FastQ file $filename ($count sequences in total)\n\n";
  }

  return ($C_to_T_infile,$G_to_A_infile);
}
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
4134
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
### Writes bisulfite converted copies of one file of a paired-end FastQ data set into $temp_dir.
###
### Arguments: path of the FastQ file (a name ending in .gz is read through zcat) and the
###            read number (1 or 2) this file represents
### Globals:   reads $skip, $upto, $directional, $bowtie2 and $temp_dir
### Output:    directional mode:     read 1 -> <filename>_C_to_T.fastq only
###                                  read 2 -> <filename>_G_to_A.fastq only
###            non-directional mode: both converted files are written
###            The read ID gets /1 or /2 appended (/1/1 or /2/2 if $bowtie2 is set)
### Returns:   the name(s) of the file(s) written, without the $temp_dir prefix: a single
###            name in directional mode, ($C_to_T_infile,$G_to_A_infile) otherwise
### Dies if a file cannot be opened, if the read number is not 1 or 2, if a record does not
### look like FastQ, or if the converted output cannot be written out completely
sub biTransformFastQFiles_paired_end {
  my ($file,$read_number) = @_;

  ### the converted files are named after the file name without any leading path
  my $filename;
  if ($file =~ /\//){
    (undef,$filename) = $file =~ m/(.*\/)(.*)$/;
  }
  else{
    $filename = $file;
  }

  ### gzipped version of the infile
  if ($file =~ /\.gz$/){
    # list form of a piped open: the file name is handed to zcat as it is and never parsed by a shell
    open (IN,'-|','zcat',$file) or die "Couldn't read from file $file: $!\n";
  }
  else{
    # explicit read mode, so that characters such as '>' or '|' in the file name are not taken as an open mode
    open (IN,'<',$file) or die "Couldn't read from file $file: $!\n";
  }

  if ($skip){
    warn "Skipping the first $skip reads from $file\n";
    sleep (1);
  }
  if ($upto){
    warn "Processing reads up to sequence no. $upto from $file\n";
    sleep (1);
  }

  my $C_to_T_infile = my $G_to_A_infile = $filename;
  $C_to_T_infile =~ s/$/_C_to_T.fastq/;
  $G_to_A_infile =~ s/$/_G_to_A.fastq/;

  if ($directional){
    if ($read_number == 1){
      print "Writing a C -> T converted version of the input file $filename to $temp_dir$C_to_T_infile\n";
      open (CTOT,'>',"$temp_dir$C_to_T_infile") or die "Couldn't write to file $!\n";
    }
    elsif ($read_number == 2){
      print "Writing a G -> A converted version of the input file $filename to $temp_dir$G_to_A_infile\n";
      open (GTOA,'>',"$temp_dir$G_to_A_infile") or die "Couldn't write to file $!\n";
    }
    else{
      die "Read number needs to be 1 or 2, but was $read_number!\n\n";
    }
  }
  else{
    print "Writing a C -> T converted version of the input file $filename to $temp_dir$C_to_T_infile\n";
    print "Writing a G -> A converted version of the input file $filename to $temp_dir$G_to_A_infile\n";
    open (CTOT,'>',"$temp_dir$C_to_T_infile") or die "Couldn't write to file $!\n";
    open (GTOA,'>',"$temp_dir$G_to_A_infile") or die "Couldn't write to file $!\n";
  }

  my $count = 0;

  while (1){
    ### a FastQ record consists of 4 lines; an incomplete record at the end of the file ends the loop
    my $identifier = <IN>;
    my $sequence = <IN>;
    my $identifier2 = <IN>;
    my $quality_score = <IN>;
    last unless ($identifier and $sequence and $identifier2 and $quality_score);
    ++$count;

    $identifier = fix_IDs($identifier); # this is to avoid problems with truncated read ID when they contain white spaces

    if ($skip){
      next unless ($count > $skip);
    }
    if ($upto){
      last if ($count > $upto);
    }

    $sequence= uc$sequence; # make input file case insensitive

    ## small check if the sequence file appears to be a FastQ file
    if ($identifier !~ /^\@/ or $identifier2 !~ /^\+/){
      die "Input file doesn't seem to be in FastQ format at sequence $count: $!\n";
    }
    my $sequence_C_to_T = my $sequence_G_to_A = $sequence;

    ### tagging the read ID with the read number; s/$/.../ inserts the tag in front of the
    ### line break that is still attached to the ID line
    if ($read_number == 1){
      if ($bowtie2){
        $identifier =~ s/$/\/1\/1/;
      }
      else{
        $identifier =~ s/$/\/1/;
      }
    }
    elsif ($read_number == 2){
      if ($bowtie2){
        $identifier =~ s/$/\/2\/2/;
      }
      else{
        $identifier =~ s/$/\/2/;
      }
    }
    else{
      die "Read number needs to be 1 or 2\n";
    }

    $sequence_C_to_T =~ tr/C/T/;
    $sequence_G_to_A =~ tr/G/A/;

    if ($directional){
      if ($read_number == 1){
        print CTOT join ('',$identifier,$sequence_C_to_T,$identifier2,$quality_score);
      }
      else{
        print GTOA join ('',$identifier,$sequence_G_to_A,$identifier2,$quality_score);
      }
    }
    else{
      print CTOT join ('',$identifier,$sequence_C_to_T,$identifier2,$quality_score);
      print GTOA join ('',$identifier,$sequence_G_to_A,$identifier2,$quality_score);
    }
  }

  ### The status of the input handle is deliberately not checked: leaving the loop early
  ### because of $upto makes a zcat child terminate abnormally, which is expected
  close IN;

  ### Buffered write errors (e.g. a full disk) only show up when the handle is closed, so
  ### every output handle opened above is closed and checked here
  if ($directional){
    if ($read_number == 1){
      close CTOT or die "Failed to finish writing to $temp_dir$C_to_T_infile: $!\n";
    }
    else{
      close GTOA or die "Failed to finish writing to $temp_dir$G_to_A_infile: $!\n";
    }
  }
  else{
    close CTOT or die "Failed to finish writing to $temp_dir$C_to_T_infile: $!\n";
    close GTOA or die "Failed to finish writing to $temp_dir$G_to_A_infile: $!\n";
  }

  if ($directional){
    if ($read_number == 1){
      print "\nCreated C -> T converted version of the FastQ file $filename ($count sequences in total)\n\n";
    }
    else{
      print "\nCreated G -> A converted version of the FastQ file $filename ($count sequences in total)\n\n";
    }
  }
  else{
    print "\nCreated C -> T as well as G -> A converted versions of the FastQ file $filename ($count sequences in total)\n\n";
  }
  if ($directional){
    if ($read_number == 1){
      return ($C_to_T_infile);
    }
    else{
      return ($G_to_A_infile);
    }
  }
  else{
    return ($C_to_T_infile,$G_to_A_infile);
  }
}
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
4274
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
### Takes a read ID line and returns a copy in which every run of spaces and/or tabs
### has been collapsed into a single underscore. All other characters, including a
### trailing line break, are passed through unchanged.
sub fix_IDs{
  my ($read_id) = @_;

  # one underscore per run of blanks, no matter how long the run is
  $read_id =~ s/[ \t]+/_/g;

  return $read_id;
}
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
4280
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
### Checks whether a single-end alignment produced by the alignment instance $fhs[$index]
### lies on the strand that is sensible for that read/genome conversion combination, and
### updates the counters of that instance accordingly.
###
### Arguments: $index  - index number (into @fhs) of the instance that produced the alignment
###            $strand - strand of the alignment, '+' or '-'
###
### Returns:   1 if the orientation is sensible ({seen} of the instance is incremented)
###            0 if the orientation is nonsensical ({wrong_strand} of the instance is incremented),
###              or if $strand is neither '+' nor '-' (no counter is touched in that case)
###
### Dies if the name of the instance is not one of the four known read/genome combinations.
sub ensure_sensical_alignment_orientation_single_end{
  my ($index,$strand) = @_;
  my $name = $fhs[$index]->{name};

  ### working out which strand is the only sensible one for this instance
  my $sensible_strand;

  ## FORWARD converted read against FORWARD converted genome (read: C->T.....C->T..  genome: C->T.......C->T)
  ## Reverse converted read against reverse converted genome (read: G->A.....G->A..  genome: G->A.......G->A)
  ## here we only want reads in the forward (+) orientation
  if ($name eq 'CTreadCTgenome' or $name eq 'GAreadGAgenome') {
    $sensible_strand = '+';
  }
  ## FORWARD converted read against reverse converted genome (read: C->T.....C->T..  genome: G->A.......G->A)
  ## Reverse converted read against FORWARD converted genome (read: G->A.....G->A..  genome: C->T.......C->T)
  ## here we only want reads in the reverse (-) orientation
  elsif ($name eq 'CTreadGAgenome' or $name eq 'GAreadCTgenome') {
    $sensible_strand = '-';
  }
  else{
    die "One of the above conditions must be true\n";
  }

  ### if the alignment is on the sensible strand we count it, and return 1 for a correct orientation
  if ($strand eq $sensible_strand) {
    $fhs[$index]->{seen}++;
    return 1;
  }

  ### if the alignment is on the opposite strand it is nonsensical
  if ($strand eq '+' or $strand eq '-') {
    $fhs[$index]->{wrong_strand}++;
    return 0;
  }

  ### anything other than '+' or '-' is not a sensible orientation either. Returning an explicit 0 here
  ### instead of falling off the end of the sub (which used to hand back an empty string)
  return 0;
}
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
4353
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
### Checks whether a paired-end alignment produced by the alignment instance $fhs[$index] has
### its two reads in the arrangement that is sensible for that instance, and updates the
### counters of that instance accordingly.
###
### Arguments: $index              - index number (into @fhs) of the instance that produced the alignment
###            $id_1, $strand_1    - sequence ID and strand of the first alignment line of the pair
###            $id_2, $strand_2    - sequence ID and strand of the second alignment line of the pair
###
### Only two arrangements are recognised, in both of them the first line is (+) and the second line is (-):
###   - first ID ends in 1 and second ID ends in 2 (read 1 (+), read 2 (-))
###   - first ID ends in 2 and second ID ends in 1 (read 2 (+), read 1 (-))
###
### Returns:   1 if the arrangement is the sensible one for this instance ({seen} is incremented)
###            0 if it is the other, nonsensical one ({wrong_strand} is incremented)
###
### Dies if the pair matches neither arrangement, or if the name of the instance is unknown.
sub ensure_sensical_alignment_orientation_paired_ends{
  my ($index,$id_1,$strand_1,$id_2,$strand_2) = @_;
  my $name = $fhs[$index]->{name};

  ### working out which of the two arrangements is the sensible one for this instance
  my $want_read_1_forward;

  ## [Index 0] CT converted read 1, GA converted read 2, CT converted genome
  ## [Index 1] GA converted read 1, CT converted read 2, GA converted genome
  ## here we only want read 1 in (+) orientation and read 2 in (-) orientation
  if ($name eq 'CTread1GAread2CTgenome' or $name eq 'GAread1CTread2GAgenome') {
    $want_read_1_forward = 1;
  }
  ## [Index 2] GA converted read 1, CT converted read 2, CT converted genome
  ## [Index 3] CT converted read 1, GA converted read 2, GA converted genome
  ## here we only want read 2 in (+) orientation and read 1 in (-) orientation
  elsif ($name eq 'GAread1CTread2CTgenome' or $name eq 'CTread1GAread2GAgenome') {
    $want_read_1_forward = 0;
  }
  else{
    die "One of the above conditions must be true\n";
  }

  ### the two arrangements are mutually exclusive, at most one of these can be true
  my $read_1_forward = ($id_1 =~ /1$/ and $strand_1 eq '+' and $id_2 =~ /2$/ and $strand_2 eq '-');
  my $read_2_forward = ($id_1 =~ /2$/ and $strand_1 eq '+' and $id_2 =~ /1$/ and $strand_2 eq '-');

  my ($sensible,$nonsensical) = $want_read_1_forward
    ? ($read_1_forward,$read_2_forward)
    : ($read_2_forward,$read_1_forward);

  ### the sensible arrangement is counted, and we return 1 for a correct orientation
  if ($sensible) {
    $fhs[$index]->{seen}++;
    return 1;
  }

  ### the opposite arrangement is nonsensical for this instance
  if ($nonsensical) {
    $fhs[$index]->{wrong_strand}++;
    return 0;
  }

  die "id1: $id_1\tid2: $id_2\tThis should be impossible\n";
}
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
4450
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
4451 #####################################################################################################################################################
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
4452
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
4453 ### Bowtie 1 (default) | PAIRED-END | FASTA
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
4454
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
### Starts the Bowtie paired-end alignments (FastA input) for the instances stored in @fhs and
### reads in the first pair of output lines of each of them.
###
### Arguments: the names of the C->T and G->A converted files of read 1 followed by those of
###            read 2. They are only used for the status message, the files that are actually
###            aligned are taken from {inputfile_1} and {inputfile_2} of each instance.
###
### For every instance that is started the following entries are set:
###   {fh}          - read handle of the pipe from the Bowtie process (it is not closed here)
###   {last_seq_id} - ID of read 1 of the first alignment, with the trailing /1 removed
###   {last_line_1} - first output line  (this can be either read 1 or read 2)
###   {last_line_2} - second output line (this can be either read 1 or read 2)
### The last three are set to undef if Bowtie did not return a pair of lines, and also for
### instances without {inputfile_1} in directional mode (these instances are not started at all).
###
### Dies if the pipe to Bowtie can't be opened, or if neither of the first two IDs ends in /1.
sub paired_end_align_fragments_to_bisulfite_genome_fastA {

  my ($C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2) = @_;

  ## Now starting up to 4 instances of Bowtie feeding in the converted sequence files and reading in the first line of the bowtie output, and storing it in
  ## the data structure @fhs
  if ($directional){
    print "Input files are $C_to_T_infile_1 and $G_to_A_infile_2 (FastA)\n";
    warn "Now running 2 instances of Bowtie against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n";
  }
  else{
    print "Input files are $C_to_T_infile_1 and $G_to_A_infile_1 and $C_to_T_infile_2 and $G_to_A_infile_2 (FastA)\n";
    warn "Now running 4 individual instances of Bowtie against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n";
  }

  foreach my $fh (@fhs) {

    ### in directional mode instances without input files are skipped
    if ($directional and not $fh->{inputfile_1}){
      @{$fh}{qw(last_seq_id last_line_1 last_line_2)} = (undef) x 3;
      next;
    }

    my $bt_options = $bowtie_options;
    if ($fh->{name} eq 'CTread1GAread2CTgenome' or $fh->{name} eq 'GAread1CTread2GAgenome'){
      $bt_options .= ' --norc'; ### ensuring the alignments are only reported in a sensible manner
    }
    else {
      $bt_options .= ' --nofw';
    }

    warn "Now starting a Bowtie paired-end alignment for $fh->{name} (reading in sequences from $temp_dir$fh->{inputfile_1} and $temp_dir$fh->{inputfile_2}, with the options: $bt_options)\n";

    ### 3-argument open with an explicit read-from-pipe mode. The command is still a single string (the
    ### Bowtie options are one pre-assembled string), so it is run in exactly the same way as before
    my $bowtie_command = "$path_to_bowtie $bt_options $fh->{bisulfiteIndex} -1 $temp_dir$fh->{inputfile_1} -2 $temp_dir$fh->{inputfile_2}";
    open ($fh->{fh},'-|',$bowtie_command) or die "Can't open pipe to bowtie: $!";

    my $line_1 = $fh->{fh}->getline();
    my $line_2 = $fh->{fh}->getline();

    # if Bowtie did not produce an alignment we just initialise last_seq_id and last_lines as undefined
    unless ($line_1 and $line_2) {
      print "Found no alignment, assigning undef to last_seq_id and last_lines\n";
      @{$fh}{qw(last_seq_id last_line_1 last_line_2)} = (undef) x 3;
      next;
    }

    # otherwise we store the first pair of lines of the output
    chomp $line_1;
    chomp $line_2;
    my $id_1 = (split(/\t/,$line_1))[0]; # this is the first element of the first bowtie output line (= the sequence identifier)
    my $id_2 = (split(/\t/,$line_2))[0]; # this is the first element of the second bowtie output line

    ### Bowtie always reports the alignment with the smaller chromosomal position first. This can be either sequence 1 or sequence 2.
    ### We will thus identify which sequence was read 1 and store this ID as last_seq_id

    if ($id_1 =~ s/\/1$//){ # removing the read 1 tag if present
      $fh->{last_seq_id} = $id_1;
    }
    elsif ($id_2 =~ s/\/1$//){ # removing the read 1 tag if present
      $fh->{last_seq_id} = $id_2;
    }
    else{
      die "Either the first or the second id need to be read 1! ID1 was: $id_1; ID2 was: $id_2\n";
    }

    $fh->{last_line_1} = $line_1; # this contains either read 1 or read 2
    $fh->{last_line_2} = $line_2; # this contains either read 1 or read 2
    warn "Found first alignment:\n$fh->{last_line_1}\n$fh->{last_line_2}\n";
  }
}
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
4533
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
4534 ### Bowtie 2 | PAIRED-END | FASTA
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
4535
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
4536 sub paired_end_align_fragments_to_bisulfite_genome_fastA_bowtie2 {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
4537 my ($C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2) = @_;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
4538 if ($directional){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
4539 print "Input files are $C_to_T_infile_1 and $G_to_A_infile_2 (FastA)\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
4540 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
4541 else{
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
4542 print "Input files are $C_to_T_infile_1 and $G_to_A_infile_1 and $C_to_T_infile_2 and $G_to_A_infile_2 (FastA)\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
4543 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
4544
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
4545 ## Now starting up to 4 instances of Bowtie feeding in the converted sequence files and reading in the first line of the bowtie output, and storing it in the
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
4546 ## data structure above
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
4547 if ($directional){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
4548 warn "Now running 2 instances of Bowtie 2 against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
4549 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
4550 else{
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
4551 warn "Now running 4 individual instances of Bowtie 2 against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
4552 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
4553
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
4554 foreach my $fh (@fhs) {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
4555
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
4556 if ($directional){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
4557 unless ($fh->{inputfile_1}){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
4558 $fh->{last_seq_id} = undef;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
4559 $fh->{last_line_1} = undef;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
4560 $fh->{last_line_2} = undef;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
4561 next;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
4562 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
4563 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
4564
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
4565 my $bt2_options = $bowtie_options;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
4566 if ($fh->{name} eq 'CTread1GAread2CTgenome' or $fh->{name} eq 'GAread1CTread2GAgenome'){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
4567 $bt2_options .= ' --norc'; ### ensuring the alignments are only reported in a sensible manner
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
4568 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
4569 else {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
4570 $bt2_options .= ' --nofw';
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
4571 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
4572
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
4573 warn "Now starting a Bowtie 2 paired-end alignment for $fh->{name} (reading in sequences from $temp_dir$fh->{inputfile_1} and $temp_dir$fh->{inputfile_2}, with the options: $bt2_options))\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
4574 open ($fh->{fh},"$path_to_bowtie $bt2_options $fh->{bisulfiteIndex} -1 $temp_dir$fh->{inputfile_1} -2 $temp_dir$fh->{inputfile_2} |") or die "Can't open pipe to bowtie: $!";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
4575
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
4576 ### Bowtie 2 writes its output in SAM format, so we need to skip all header lines until we reach the first alignment record
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
4577 while (1){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
4578 $_ = $fh->{fh}->getline();
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
4579 if ($_) {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
4580 last unless ($_ =~ /^\@/); # SAM headers start with @
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
4581 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
4582 else{
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
4583 last; # no alignment output
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
4584 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
4585 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
4586
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
4587 my $line_1 = $_;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
4588 my $line_2 = $fh->{fh}->getline();
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
4589
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
4590 # if Bowtie produces an alignment we store the first line of the output
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
4591 if ($line_1 and $line_2) {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
4592 chomp $line_1;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
4593 chomp $line_2;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
4594 my $id_1 = (split(/\t/,$line_1))[0]; # this is the first element of the first bowtie output line (= the sequence identifier)
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
4595 my $id_2 = (split(/\t/,$line_2))[0]; # this is the first element of the second bowtie output line
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
4596
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
4597 ### Bowtie always reports the alignment with the smaller chromosomal position first. This can be either sequence 1 or sequence 2.
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
4598 ### We will thus identify which sequence was read 1 and store this ID as last_seq_id
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
4599
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
4600 if ($id_1 =~ s/\/1$//){ # removing the read 1 /1 tag if present (remember that Bowtie2 clips off /1 or /2 line endings itself, so we added /1/1 or /2/2 to start with
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
4601 $fh->{last_seq_id} = $id_1;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
4602 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
4603 elsif ($id_2 =~ s/\/1$//){ # removing the read 1 /2 tag if present
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
4604 $fh->{last_seq_id} = $id_2;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
4605 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
4606 else{
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
4607 warn "Either the first or the second id need to be read 1! ID1 was: $id_1; ID2 was: $id_2\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
4608 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
4609
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
4610 $fh->{last_line_1} = $line_1; # this contains either read 1 or read 2
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
4611 $fh->{last_line_2} = $line_2; # this contains either read 1 or read 2
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
4612 warn "Found first alignment:\n$fh->{last_line_1}\n$fh->{last_line_2}\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
4613 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
4614 # otherwise we just initialise last_seq_id and last_lines as undefined
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
4615 else {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
4616 print "Found no alignment, assigning undef to last_seq_id and last_lines\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
4617 $fh->{last_seq_id} = undef;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
4618 $fh->{last_line_1} = undef;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
4619 $fh->{last_line_2} = undef;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
4620 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
4621 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
4622 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
4623
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
4624 ### Bowtie 1 (default) | PAIRED-END | FASTQ
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
4625
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
### Starts the Bowtie (version 1) paired-end alignments for FastQ input.
###
### Arguments: the names of the four converted read files (C->T read 1, G->A read 1,
### C->T read 2, G->A read 2). They are only used for the progress message; the files
### that are actually aligned are taken from each entry of the file-level @fhs list.
###
### For every entry of @fhs a read pipe from Bowtie is opened and stored in $fh->{fh}.
### The first two output lines are read and kept in $fh->{last_line_1} and
### $fh->{last_line_2}, and the identifier of read 1 (with its trailing /1 removed) is
### kept in $fh->{last_seq_id}. If there is no output all three fields are set to undef.
### Dies if the pipe cannot be opened or if neither of the two lines belongs to read 1.
sub paired_end_align_fragments_to_bisulfite_genome_fastQ {
    my ($C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2) = @_;

    ## report the input files and how many Bowtie instances are about to be started
    if ($directional){
        print "Input files are $C_to_T_infile_1 $G_to_A_infile_2 (FastQ)\n";
        warn "Now running 2 instances of Bowtie against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n";
    }
    else{
        print "Input files are $C_to_T_infile_1 and $G_to_A_infile_1 and $C_to_T_infile_2 and $G_to_A_infile_2 (FastQ)\n";
        warn "Now running 4 individual instances of Bowtie against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n";
    }

  ALIGNER:
    foreach my $fh (@fhs) {

        ## in directional mode the entries without an input file are not aligned at all
        if ($directional and not $fh->{inputfile_1}){
            @{$fh}{qw(last_seq_id last_line_1 last_line_2)} = (undef, undef, undef);
            next ALIGNER;
        }

        ## two of the four alignment types only use --norc, the other two only --nofw,
        ## ensuring the alignments are only reported in a sensible manner
        my $norc_only = ($fh->{name} eq 'CTread1GAread2CTgenome' or $fh->{name} eq 'GAread1CTread2GAgenome');
        my $aligner_options = $bowtie_options;
        $aligner_options .= $norc_only ? ' --norc' : ' --nofw';

        warn "Now starting a Bowtie paired-end alignment for $fh->{name} (reading in sequences from $temp_dir$fh->{inputfile_1} and $temp_dir$fh->{inputfile_2}, with the options: $aligner_options))\n";
        open ($fh->{fh},"$path_to_bowtie $aligner_options $fh->{bisulfiteIndex} -1 $temp_dir$fh->{inputfile_1} -2 $temp_dir$fh->{inputfile_2} |") or die "Can't open pipe to bowtie: $!";

        ## the two mates of the first alignment are reported on two consecutive lines
        my $first_line  = $fh->{fh}->getline();
        my $second_line = $fh->{fh}->getline();

        ## no (complete) alignment: initialise everything as undefined and move on
        unless ($first_line and $second_line){
            print "Found no alignment, assigning undef to last_seq_id and last_lines\n";
            @{$fh}{qw(last_seq_id last_line_1 last_line_2)} = (undef, undef, undef);
            next ALIGNER;
        }

        chomp $first_line;
        chomp $second_line;

        ## the sequence identifier is the first tab-separated field of an output line
        my $first_id  = (split(/\t/,$first_line))[0];
        my $second_id = (split(/\t/,$second_line))[0];

        ## Bowtie reports the mate with the smaller chromosomal position first, so read 1
        ## can be on either line; whichever identifier ends in /1 becomes last_seq_id
        if ($first_id =~ s/\/1$//){
            $fh->{last_seq_id} = $first_id;
        }
        elsif ($second_id =~ s/\/1$//){
            $fh->{last_seq_id} = $second_id;
        }
        else{
            die "Either the first or the second id need to be read 1! ID1 was: $first_id; ID2 was: $second_id\n";
        }

        $fh->{last_line_1} = $first_line;  # this contains read 1 or read 2
        $fh->{last_line_2} = $second_line; # this contains read 1 or read 2
        warn "Found first alignment:\n$fh->{last_line_1}\n$fh->{last_line_2}\n";
    }
}
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
4703
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
4704 ### Bowtie 2 | PAIRED-END | FASTQ
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
4705
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
### Starts the Bowtie 2 paired-end alignments for FastQ input.
###
### Arguments: the names of the four converted read files (C->T read 1, G->A read 1,
### C->T read 2, G->A read 2). They are only used for the progress message; the files
### that are actually aligned are taken from each entry of the file-level @fhs list.
###
### For every entry of @fhs a read pipe from Bowtie 2 is opened and stored in $fh->{fh}.
### The SAM header lines are skipped, the first two alignment lines are kept in
### $fh->{last_line_1} and $fh->{last_line_2}, and the identifier of read 1 (with its
### trailing /1 removed) is kept in $fh->{last_seq_id}. If there is no alignment output
### all three fields are set to undef.
### Dies if the pipe cannot be opened or if neither of the two lines belongs to read 1.
sub paired_end_align_fragments_to_bisulfite_genome_fastQ_bowtie2 {
    my ($C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2) = @_;
    if ($directional){
        print "Input files are $C_to_T_infile_1 and $G_to_A_infile_2 (FastQ)\n";
    }
    else{
        print "Input files are $C_to_T_infile_1 and $G_to_A_infile_1 and $C_to_T_infile_2 and $G_to_A_infile_2 (FastQ)\n";
    }

    ## Now starting up to 4 instances of Bowtie 2, feeding in the converted sequence files,
    ## reading in the first alignment of each and storing it in the entries of @fhs
    if ($directional){
        warn "Now running 2 instances of Bowtie 2 against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n";
    }
    else{
        warn "Now running 4 individual instances of Bowtie 2 against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n";
    }

    foreach my $fh (@fhs) {

        ## in directional mode the entries without an input file are not aligned at all
        if ($directional){
            unless ($fh->{inputfile_1}){
                $fh->{last_seq_id} = undef;
                $fh->{last_line_1} = undef;
                $fh->{last_line_2} = undef;
                next;
            }
        }

        my $bt2_options = $bowtie_options;
        if ($fh->{name} eq 'CTread1GAread2CTgenome' or $fh->{name} eq 'GAread1CTread2GAgenome'){
            $bt2_options .= ' --norc'; ### ensuring the alignments are only reported in a sensible manner
        }
        else {
            $bt2_options .= ' --nofw';
        }

        warn "Now starting a Bowtie 2 paired-end alignment for $fh->{name} (reading in sequences from $temp_dir$fh->{inputfile_1} and $temp_dir$fh->{inputfile_2}, with the options: $bt2_options))\n";
        open ($fh->{fh},"$path_to_bowtie $bt2_options $fh->{bisulfiteIndex} -1 $temp_dir$fh->{inputfile_1} -2 $temp_dir$fh->{inputfile_2} |") or die "Can't open pipe to bowtie: $!";

        ### Bowtie 2 writes SAM format, so we need to skip the header lines (they start with @)
        ### until the first alignment line. The line is held in a lexical variable instead of
        ### the global $_ so that a caller's $_ is never overwritten by this subroutine.
        my $line_1;
        while (1){
            $line_1 = $fh->{fh}->getline();
            last unless ($line_1 and $line_1 =~ /^\@/); # stop at end of output or at the first non-header line
        }

        ## the second mate is reported on the line following the first one
        my $line_2 = $fh->{fh}->getline();

        # if Bowtie 2 produces an alignment we store the first two lines of the output
        if ($line_1 and $line_2) {
            chomp $line_1;
            chomp $line_2;
            ### Bowtie always reports the alignment with the smaller chromosomal position first. This can be either sequence 1 or sequence 2.
            ### We will thus identify which sequence was read 1 and store this ID as last_seq_id

            my $id_1 = (split(/\t/,$line_1))[0]; # this is the first element of the first bowtie output line (= the sequence identifier)
            my $id_2 = (split(/\t/,$line_2))[0]; # this is the first element of the second bowtie output line

            if ($id_1 =~ s/\/1$//){ # removing the read 1 tag if present (remember that Bowtie2 clips off /1 or /2 line endings itself, so we added /1/1 or /2/2 to start with)
                $fh->{last_seq_id} = $id_1;
            }
            elsif ($id_2 =~ s/\/1$//){ # removing the read 1 tag if present
                $fh->{last_seq_id} = $id_2;
            }
            else{
                die "Either the first or the second id need to be read 1! ID1 was: $id_1; ID2 was: $id_2\n";
            }

            $fh->{last_line_1} = $line_1; # this contains read 1 or read 2
            $fh->{last_line_2} = $line_2; # this contains read 1 or read 2
            warn "Found first alignment:\n$fh->{last_line_1}\n$fh->{last_line_2}\n";
        }

        # otherwise we just initialise last_seq_id and last_lines as undefined
        else {
            print "Found no alignment, assigning undef to last_seq_id and last_lines\n";
            $fh->{last_seq_id} = undef;
            $fh->{last_line_1} = undef;
            $fh->{last_line_2} = undef;
        }
    }
}
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
4794
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
4795 #####################################################################################################################################################
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
4796
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
4797 ### Bowtie 1 (default) | SINGLE-END | FASTA
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
### Starts the Bowtie (version 1) single-end alignments for FastA input.
###
### Arguments: the names of the C->T and G->A converted read files. They are only used
### for the progress message; the file that is actually aligned is taken from
### $fh->{inputfile} of each entry of the file-level @fhs list.
###
### For every entry of @fhs a read pipe from Bowtie is opened and stored in $fh->{fh}.
### The first output line is kept in $fh->{last_line} and its first tab-separated field
### (the sequence identifier) in $fh->{last_seq_id}. If there is no output both fields
### are set to undef. Dies if the pipe cannot be opened.
sub single_end_align_fragments_to_bisulfite_genome_fastA {
    my ($C_to_T_infile,$G_to_A_infile) = @_;
    if ($directional){
        print "Input file is $C_to_T_infile (FastA)\n";
    }
    else{
        print "Input files are $C_to_T_infile and $G_to_A_infile (FastA)\n";
    }

    ## Now starting up to 4 instances of Bowtie, feeding in the converted sequence files,
    ## reading in the first line of each Bowtie output and storing it in the entries of @fhs
    if ($directional){
        warn "Now running 2 instances of Bowtie against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n";
    }
    else{
        warn "Now running 4 individual instances of Bowtie against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n";
    }

    foreach my $fh (@fhs) {

        my $bt_options = $bowtie_options;
        if ($fh->{name} eq 'CTreadCTgenome' or $fh->{name} eq 'GAreadGAgenome'){
            $bt_options .= ' --norc'; ### ensuring the alignments are only reported in a sensible manner
        }
        else {
            $bt_options .= ' --nofw';
        }

        warn "Now starting the Bowtie aligner for $fh->{name} (reading in sequences from $temp_dir$fh->{inputfile} with options: $bt_options)\n";
        open ($fh->{fh},"$path_to_bowtie $bt_options $fh->{bisulfiteIndex} $temp_dir$fh->{inputfile} |") or die "Can't open pipe to bowtie: $!";

        # The first output line is held in a lexical variable instead of the global $_
        # so that a caller's $_ is never overwritten by this subroutine.
        my $first_line = $fh->{fh}->getline();

        # if Bowtie produces an alignment we store the first line of the output
        if ($first_line) {
            chomp $first_line;
            my $id = (split(/\t/,$first_line))[0]; # this is the first element of the bowtie output (= the sequence identifier)
            $fh->{last_seq_id} = $id;
            $fh->{last_line} = $first_line;
            warn "Found first alignment:\t$fh->{last_line}\n";
        }
        # otherwise we just initialise last_seq_id and last_line as undefined
        else {
            print "Found no alignment, assigning undef to last_seq_id and last_line\n";
            $fh->{last_seq_id} = undef;
            $fh->{last_line} = undef;
        }
    }
}
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
4846
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
4847 ### Bowtie 2 | SINGLE-END | FASTA
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
## Starts one Bowtie 2 process per entry of @fhs for converted single-end FastA reads and
## primes every entry with the first non-header line of that process' SAM output.
##
## Arguments: $C_to_T_infile, $G_to_A_infile - only used for the progress message here
##            (the second one is only printed when $directional is not set).
## Effects:   for every entry of @fhs, {fh} becomes the read end of the pipe to Bowtie 2;
##            {last_line} holds the first SAM record (chomped) and {last_seq_id} its first
##            tab-separated field, or both are undef if no record could be read.
## Dies if the pipe to Bowtie 2 cannot be opened.
sub single_end_align_fragments_to_bisulfite_genome_fastA_bowtie2 {
  my ($C_to_T_infile,$G_to_A_infile) = @_;

  ## Report the input file(s) on STDOUT and the number of aligner instances on STDERR
  if ($directional){
    print "Input file is $C_to_T_infile (FastA)\n";
    warn "Now running 2 instances of Bowtie 2 against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n";
  }
  else{
    print "Input files are $C_to_T_infile and $G_to_A_infile (FastA)\n";
    warn "Now running 4 individual instances of Bowtie 2 against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n";
  }

  foreach my $fh (@fhs) {

    ## read and genome converted the same way: forward alignments only (--norc);
    ## all other combinations: reverse-complement alignments only (--nofw)
    my $same_conversion = ($fh->{name} eq 'CTreadCTgenome' || $fh->{name} eq 'GAreadGAgenome');
    my $bt2_options = $bowtie_options . ($same_conversion ? ' --norc' : ' --nofw');

    warn "Now starting the Bowtie 2 aligner for $fh->{name} (reading in sequences from $temp_dir$fh->{inputfile} with options: $bt2_options)\n";
    my $command = "$path_to_bowtie $bt2_options $fh->{bisulfiteIndex} -U $temp_dir$fh->{inputfile} |";
    open ($fh->{fh},$command) or die "Can't open pipe to bowtie: $!";

    ## Bowtie 2 writes SAM, so keep reading while we are still looking at header lines (they start with @)
    do {
      $_ = $fh->{fh}->getline();
    } while ($_ and $_ =~ /^\@/);

    ## Nothing left to read: this should only happen at the end of a file for Bowtie 2 output
    unless ($_) {
      print "Found no alignment, assigning undef to last_seq_id and last_line\n";
      $fh->{last_seq_id} = undef;
      $fh->{last_line} = undef;
      next;
    }

    ## Bowtie 2 outputs a result line even for sequences without any alignments, so the
    ## first record is stored in either case
    chomp;
    $fh->{last_line} = $_;
    $fh->{last_seq_id} = (split(/\t/))[0]; # first element of the Bowtie 2 output (= the sequence identifier)
    warn "Found first alignment:\t$fh->{last_line}\n";
  }
}
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
4906
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
4907
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
4908 ### Bowtie 1 (default) | SINGLE-END | FASTQ
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
## Starts one Bowtie (1) process per entry of @fhs for converted single-end FastQ reads and
## primes every entry with the first line of that process' output.
##
## Arguments: $C_to_T_infile, $G_to_A_infile - only used for the progress message here
##            (the second one is only printed when $directional is not set).
## Effects:   for every entry of @fhs, {fh} becomes the read end of the pipe to Bowtie;
##            {last_line} holds the first output line (chomped) and {last_seq_id} its first
##            tab-separated field, or both are undef if Bowtie produced no output.
## Dies if the pipe to Bowtie cannot be opened.
sub single_end_align_fragments_to_bisulfite_genome_fastQ {
  my ($C_to_T_infile,$G_to_A_infile) = @_;
  if ($directional){
    print "Input file is $C_to_T_infile (FastQ)\n";
  }
  else{
    print "Input files are $C_to_T_infile and $G_to_A_infile (FastQ)\n";
  }

  ## Now starting up to 4 instances of Bowtie feeding in the converted sequence files and reading in the
  ## first line of the Bowtie output, and storing it in the @fhs data structure
  if ($directional){
    warn "Now running 2 instances of Bowtie against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n";
  }
  else{
    warn "Now running 4 individual instances of Bowtie against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n";
  }

  foreach my $fh (@fhs) {
    ## per-instance copy of the user options with the strand restriction appended
    my $bt_options = $bowtie_options;
    if ($fh->{name} eq 'CTreadCTgenome' or $fh->{name} eq 'GAreadGAgenome'){
      $bt_options .= ' --norc'; ### ensuring the alignments are only reported in a sensible manner
    }
    else {
      $bt_options .= ' --nofw';
    }

    warn "Now starting the Bowtie aligner for $fh->{name} (reading in sequences from $temp_dir$fh->{inputfile} with options: $bt_options)\n";

    ## The command has to use $bt_options, not the plain $bowtie_options: otherwise the
    ## --norc/--nofw restriction announced in the message above never reaches Bowtie
    open ($fh->{fh},"$path_to_bowtie $bt_options $fh->{bisulfiteIndex} $temp_dir$fh->{inputfile} |") or die "Can't open pipe to bowtie: $!";

    # if Bowtie produces an alignment we store the first line of the output
    $_ = $fh->{fh}->getline();
    if ($_) {
      chomp;
      my $id = (split(/\t/))[0]; # this is the first element of the Bowtie output (= the sequence identifier)
      $fh->{last_seq_id} = $id;
      $fh->{last_line} = $_;
      warn "Found first alignment:\t$fh->{last_line}\n";
    }
    # otherwise we just initialise last_seq_id and last_line as undefined
    else {
      print "Found no alignment, assigning undef to last_seq_id and last_line\n";
      $fh->{last_seq_id} = undef;
      $fh->{last_line} = undef;
    }
  }
}
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
4956
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
4957 ### Bowtie 2 | SINGLE-END | FASTQ
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
## Starts one Bowtie 2 process per entry of @fhs for converted single-end FastQ reads and
## primes every entry with the first non-header line of that process' SAM output.
##
## Arguments: $C_to_T_infile, $G_to_A_infile - only used for the progress message here
##            (the second one is only printed when $directional is not set).
## Effects:   for every entry of @fhs, {fh} becomes the read end of the pipe to Bowtie 2;
##            {last_line} holds the first SAM record (chomped) and {last_seq_id} its first
##            tab-separated field, or both are undef if no record could be read.
## Dies if the pipe to Bowtie 2 cannot be opened.
sub single_end_align_fragments_to_bisulfite_genome_fastQ_bowtie2 {
  my ($C_to_T_infile,$G_to_A_infile) = @_;

  ## Report the input file(s) on STDOUT and the number of aligner instances on STDERR
  if ($directional){
    print "Input file is $C_to_T_infile (FastQ)\n\n";
    warn "Now running 2 instances of Bowtie 2 against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n";
  }
  else{
    print "Input files are $C_to_T_infile and $G_to_A_infile (FastQ)\n\n";
    warn "Now running 4 individual instances of Bowtie 2 against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n";
  }

  foreach my $fh (@fhs) {

    ## read and genome converted the same way: forward alignments only (--norc);
    ## all other combinations: reverse-complement alignments only (--nofw)
    my $same_conversion = ($fh->{name} eq 'CTreadCTgenome' || $fh->{name} eq 'GAreadGAgenome');
    my $bt2_options = $bowtie_options . ($same_conversion ? ' --norc' : ' --nofw');

    warn "Now starting the Bowtie 2 aligner for $fh->{name} (reading in sequences from $temp_dir$fh->{inputfile} with options $bt2_options)\n";
    warn "Using Bowtie 2 index: $fh->{bisulfiteIndex}\n\n";

    my $command = "$path_to_bowtie $bt2_options $fh->{bisulfiteIndex} -U $temp_dir$fh->{inputfile} |";
    open ($fh->{fh},$command) or die "Can't open pipe to bowtie: $!";

    ## Bowtie 2 writes SAM, so keep reading while we are still looking at header lines (they start with @)
    do {
      $_ = $fh->{fh}->getline();
    } while ($_ and $_ =~ /^\@/);

    ## Nothing left to read: this should only happen at the end of a file for Bowtie 2 output
    unless ($_) {
      print "Found no alignment, assigning undef to last_seq_id and last_line\n";
      $fh->{last_seq_id} = undef;
      $fh->{last_line} = undef;
      next;
    }

    ## Bowtie 2 outputs a result line even for sequences without any alignments, so the
    ## first record is stored in either case
    chomp;
    $fh->{last_line} = $_;
    $fh->{last_seq_id} = (split(/\t/))[0]; # first element of the Bowtie 2 output (= the sequence identifier)
    warn "Found first alignment:\t$fh->{last_line}\n";
  }
}
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5015
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5016 ###########################################################################################################################################
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5017
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
## Re-initialises the file-level %counting statistics (all counters back to 0) and rebuilds
## the file-level @fhs list that describes the alignment instances to run.
##
## Argument: $filename - the input file name(s); a comma in it is taken to mean paired-end files.
## @fhs receives only the two CT-read instances for directional single-end input and all
## four read/genome combinations otherwise.
sub reset_counters_and_fhs{
  my $filename = shift;

  %counting = map { ($_ => 0) } (
    'total_meCHH_count',
    'total_meCHG_count',
    'total_meCpG_count',
    'total_unmethylated_CHH_count',
    'total_unmethylated_CHG_count',
    'total_unmethylated_CpG_count',
    'sequences_count',
    'no_single_alignment_found',
    'unsuitable_sequence_count',
    'genomic_sequence_could_not_be_extracted_count',
    'unique_best_alignment_count',
    'low_complexity_alignments_overruled_count',
    'CT_CT_count',               #(CT read/CT genome, original top strand)
    'CT_GA_count',               #(CT read/GA genome, original bottom strand)
    'GA_CT_count',               #(GA read/CT genome, complementary to original top strand)
    'GA_GA_count',               #(GA read/GA genome, complementary to original bottom strand)
    'CT_GA_CT_count',            #(CT read1/GA read2/CT genome, original top strand)
    'GA_CT_GA_count',            #(GA read1/CT read2/GA genome, complementary to original bottom strand)
    'GA_CT_CT_count',            #(GA read1/CT read2/CT genome, complementary to original top strand)
    'CT_GA_GA_count',            #(CT read1/GA read2/GA genome, original bottom strand)
    'alignments_rejected_count', # only relevant if --directional was specified
  );

  ## builds a fresh description of one alignment instance with its counters at 0
  my $new_instance = sub {
    my ($name,$strand_identity,$index_basename) = @_;
    return {
      name            => $name,
      strand_identity => $strand_identity,
      bisulfiteIndex  => $index_basename,
      seen            => 0,
      wrong_strand    => 0,
    };
  };

  my @all_instances = (
    $new_instance->('CTreadCTgenome','con ori forward',      $CT_index_basename),
    $new_instance->('CTreadGAgenome','con ori reverse',      $GA_index_basename),
    $new_instance->('GAreadCTgenome','compl ori con forward',$CT_index_basename),
    $new_instance->('GAreadGAgenome','compl ori con reverse',$GA_index_basename),
  );

  ## directional single-end files (no comma in the file name) only use the first two instances;
  ## directional paired-end files and non-directional input use all four
  if ($directional and $filename !~ ','){
    @fhs = @all_instances[0,1];
  }
  else{
    @fhs = @all_instances;
  }
}
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5119
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5120
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5121 sub process_command_line{
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5122 my @bowtie_options;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5123 my $help;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5124 my $mates1;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5125 my $mates2;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5126 my $path_to_bowtie;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5127 my $fastq;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5128 my $fasta;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5129 my $skip;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5130 my $qupto;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5131 my $phred64;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5132 my $phred33;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5133 my $solexa;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5134 my $mismatches;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5135 my $seed_length;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5136 my $best;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5137 my $sequence_format;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5138 my $version;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5139 my $quiet;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5140 my $chunk;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5141 my $non_directional;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5142 my $ceiling;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5143 my $maxins;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5144 my $minins;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5145 my $unmapped;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5146 my $multi_map;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5147 my $output_dir;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5148 my $bowtie2;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5149 my $vanilla;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5150 my $sam_no_hd;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5151 my $seed_extension_fails;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5152 my $reseed_repetitive_seeds;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5153 my $most_valid_alignments;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5154 my $score_min;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5155 my $parallel;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5156 my $temp_dir;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5157 my $rdg;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5158 my $rfg;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5159
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5160 my $command_line = GetOptions ('help|man' => \$help,
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5161 '1=s' => \$mates1,
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5162 '2=s' => \$mates2,
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5163 'path_to_bowtie=s' => \$path_to_bowtie,
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5164 'f|fasta' => \$fasta,
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5165 'q|fastq' => \$fastq,
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5166 's|skip=i' => \$skip,
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5167 'u|upto=i' => \$qupto,
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5168 'phred33-quals' => \$phred33,
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5169 'phred64-quals|solexa1' => \$phred64,
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5170 'solexa-quals' => \$solexa,
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5171 'n|seedmms=i' => \$mismatches,
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5172 'l|seedlen=i' => \$seed_length,
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5173 'no_best' => \$best,
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5174 'version' => \$version,
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5175 'quiet' => \$quiet,
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5176 'chunkmbs=i' => \$chunk,
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5177 'non_directional' => \$non_directional,
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5178 'I|minins=i' => \$minins,
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5179 'X|maxins=i' => \$maxins,
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5180 'e|maqerr=i' => \$ceiling,
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5181 'un|unmapped' => \$unmapped,
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5182 'ambiguous' => \$multi_map,
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5183 'o|output_dir=s' => \$output_dir,
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5184 'bowtie2' => \$bowtie2,
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5185 'vanilla' => \$vanilla,
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5186 'sam-no-hd' => \$sam_no_hd,
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5187 'D=i' => \$seed_extension_fails,
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5188 'R=i' => \$reseed_repetitive_seeds,
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5189 'score_min=s' => \$score_min,
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5190 'most_valid_alignments=i' => \$most_valid_alignments,
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5191 'p=i' => \$parallel,
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5192 'temp_dir=s' => \$temp_dir,
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5193 'rdg=s' => \$rdg,
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5194 'rfg=s' => \$rfg,
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5195 );
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5196
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5197
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5198 ### EXIT ON ERROR if there were errors with any of the supplied options
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5199 unless ($command_line){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5200 die "Please respecify command line options\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5201 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5202 ### HELPFILE
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5203 if ($help){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5204 print_helpfile();
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5205 exit;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5206 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5207 if ($version){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5208 print << "VERSION";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5209
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5210
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5211 Bismark - Bisulfite Mapper and Methylation Caller.
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5212
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5213 Bismark Version: $bismark_version Copyright 2010-12 Felix Krueger, Babraham Bioinformatics
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5214 www.bioinformatics.babraham.ac.uk/projects/
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5215
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5216
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5217 VERSION
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5218 exit;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5219 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5220
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5221
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5222 ##########################
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5223 ### PROCESSING OPTIONS ###
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5224 ##########################
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5225
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5226 unless ($bowtie2){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5227 $bowtie2 = 0;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5228 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5229 unless ($sam_no_hd){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5230 $sam_no_hd =0;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5231 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5232
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5233 ### PATH TO BOWTIE
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5234 ### if a special path to Bowtie 1/2 was specified we will use that one, otherwise it is assumed that Bowtie 1/2 is in the PATH
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5235 if ($path_to_bowtie){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5236 unless ($path_to_bowtie =~ /\/$/){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5237 $path_to_bowtie =~ s/$/\//;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5238 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5239 if (-d $path_to_bowtie){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5240 if ($bowtie2){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5241 $path_to_bowtie = "${path_to_bowtie}bowtie2";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5242 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5243 else{
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5244 $path_to_bowtie = "${path_to_bowtie}bowtie";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5245 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5246 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5247 else{
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5248 die "The path to bowtie provided ($path_to_bowtie) is invalid (not a directory)!\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5249 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5250 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5251 else{
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5252 if ($bowtie2){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5253 $path_to_bowtie = 'bowtie2';
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5254 warn "Path to Bowtie 2 specified as: $path_to_bowtie\n"; }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5255 else{
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5256 $path_to_bowtie = 'bowtie';
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5257 warn "Path to Bowtie specified as: $path_to_bowtie\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5258 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5259 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5260
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5261 ####################################
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5262 ### PROCESSING ARGUMENTS
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5263
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5264 ### GENOME FOLDER
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5265 my $genome_folder = shift @ARGV; # mandatory
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5266 unless ($genome_folder){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5267 warn "Genome folder was not specified!\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5268 print_helpfile();
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5269 exit;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5270 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5271
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5272 ### checking that the genome folder, all subfolders and the required bowtie index files exist
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5273 unless ($genome_folder =~/\/$/){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5274 $genome_folder =~ s/$/\//;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5275 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5276
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5277 if (chdir $genome_folder){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5278 my $absolute_genome_folder = getcwd; ## making the genome folder path absolute
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5279 unless ($absolute_genome_folder =~/\/$/){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5280 $absolute_genome_folder =~ s/$/\//;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5281 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5282 warn "Reference genome folder provided is $genome_folder\t(absolute path is '$absolute_genome_folder)'\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5283 $genome_folder = $absolute_genome_folder;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5284 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5285 else{
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5286 die "Failed to move to $genome_folder: $!\nUSAGE: Bismark.pl [options] <genome_folder> {-1 <mates1> -2 <mates2> | <singles>} [<hits>] (--help for more details)\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5287 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5288
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5289 my $CT_dir = "${genome_folder}Bisulfite_Genome/CT_conversion/";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5290 my $GA_dir = "${genome_folder}Bisulfite_Genome/GA_conversion/";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5291
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5292 if ($bowtie2){ ### Bowtie 2 (new)
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5293 ### checking the integrity of $CT_dir
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5294 chdir $CT_dir or die "Failed to move to directory $CT_dir: $!\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5295 my @CT_bowtie_index = ('BS_CT.1.bt2','BS_CT.2.bt2','BS_CT.3.bt2','BS_CT.4.bt2','BS_CT.rev.1.bt2','BS_CT.rev.2.bt2');
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5296 foreach my $file(@CT_bowtie_index){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5297 unless (-f $file){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5298 die "The Bowtie 2 index of the C->T converted genome seems to be faulty ($file). Please run the bismark_genome_preparation before running Bismark.\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5299 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5300 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5301 ### checking the integrity of $GA_dir
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5302 chdir $GA_dir or die "Failed to move to directory $GA_dir: $!\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5303 my @GA_bowtie_index = ('BS_GA.1.bt2','BS_GA.2.bt2','BS_GA.3.bt2','BS_GA.4.bt2','BS_GA.rev.1.bt2','BS_GA.rev.2.bt2');
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5304 foreach my $file(@GA_bowtie_index){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5305 unless (-f $file){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5306 die "The Bowtie 2 index of the G->A converted genome seems to be faulty ($file). Please run bismark_genome_preparation before running Bismark.\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5307 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5308 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5309 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5310
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5311 else{ ### Bowtie 1 (default)
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5312 ### checking the integrity of $CT_dir
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5313 chdir $CT_dir or die "Failed to move to directory $CT_dir: $!\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5314 my @CT_bowtie_index = ('BS_CT.1.ebwt','BS_CT.2.ebwt','BS_CT.3.ebwt','BS_CT.4.ebwt','BS_CT.rev.1.ebwt','BS_CT.rev.2.ebwt');
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5315 foreach my $file(@CT_bowtie_index){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5316 unless (-f $file){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5317 die "The Bowtie index of the C->T converted genome seems to be faulty ($file). Please run bismark_genome_preparation before running Bismark.\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5318 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5319 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5320 ### checking the integrity of $GA_dir
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5321 chdir $GA_dir or die "Failed to move to directory $GA_dir: $!\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5322 my @GA_bowtie_index = ('BS_GA.1.ebwt','BS_GA.2.ebwt','BS_GA.3.ebwt','BS_GA.4.ebwt','BS_GA.rev.1.ebwt','BS_GA.rev.2.ebwt');
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5323 foreach my $file(@GA_bowtie_index){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5324 unless (-f $file){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5325 die "The Bowtie index of the G->A converted genome seems to be faulty ($file). Please run bismark_genome_preparation before running Bismark.\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5326 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5327 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5328 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5329
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5330 my $CT_index_basename = "${CT_dir}BS_CT";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5331 my $GA_index_basename = "${GA_dir}BS_GA";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5332
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5333 ### INPUT OPTIONS
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5334
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5335 ### SEQUENCE FILE FORMAT
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5336 ### exits if both fastA and FastQ were specified
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5337 if ($fasta and $fastq){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5338 die "Only one sequence filetype can be specified (fastA or fastQ)\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5339 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5340
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5341 ### unless fastA is specified explicitly, fastQ sequence format is expected by default
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5342 if ($fasta){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5343 print "FastA format specified\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5344 $sequence_format = 'FASTA';
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5345 push @bowtie_options, '-f';
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5346 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5347 elsif ($fastq){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5348 print "FastQ format specified\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5349 $sequence_format = 'FASTQ';
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5350 push @bowtie_options, '-q';
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5351 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5352 else{
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5353 $fastq = 1;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5354 print "FastQ format assumed (by default)\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5355 $sequence_format = 'FASTQ';
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5356 push @bowtie_options, '-q';
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5357 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5358
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5359 ### SKIP
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5360 if ($skip){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5361 warn "Skipping the first $skip reads from the input file\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5362 # push @bowtie_options,"-s $skip";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5363 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5364
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5365 ### UPTO
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5366 if ($qupto){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5367 warn "Processing sequences up to read no. $qupto from the input file\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5368 if ($bowtie2){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5369 # push @bowtie_options,"--upto $qupto"; ## slightly changed for Bowtie 2
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5370 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5371 else{
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5372 # push @bowtie_options,"--qupto $qupto";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5373 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5374 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5375
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5376 ### QUALITY VALUES
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5377 if (($phred33 and $phred64) or ($phred33 and $solexa) or ($phred64 and $solexa)){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5378 die "You can only specify one type of quality value at a time! (--phred33-quals or --phred64-quals or --solexa-quals)";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5379 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5380 if ($phred33){ ## if nothing else is specified $phred33 will be used as default by both Bowtie 1 and 2.
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5381 # Phred quality values work only when -q is specified
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5382 unless ($fastq){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5383 die "Phred quality values works only when -q (FASTQ) is specified\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5384 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5385 if ($bowtie2){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5386 push @bowtie_options,"--phred33";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5387 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5388 else{
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5389 push @bowtie_options,"--phred33-quals";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5390 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5391 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5392 if ($phred64){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5393 # Phred quality values work only when -q is specified
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5394 unless ($fastq){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5395 die "Phred quality values work only when -q (FASTQ) is specified\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5396 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5397 if ($bowtie2){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5398 push @bowtie_options,"--phred64";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5399 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5400 else{
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5401 push @bowtie_options,"--phred64-quals";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5402 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5403 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5404 else{
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5405 $phred64 = 0;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5406 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5407
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5408 if ($solexa){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5409 if ($bowtie2){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5410 die "The option '--solexa-quals' is not compatible with Bowtie 2. Please respecify!\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5411 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5412 # Solexa to Phred value conversion works only when -q is specified
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5413 unless ($fastq){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5414 die "Conversion from Solexa to Phred quality values works only when -q (FASTQ) is specified\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5415 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5416 push @bowtie_options,"--solexa-quals";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5417 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5418 else{
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5419 $solexa = 0;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5420 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5421
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5422 ### ALIGNMENT OPTIONS
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5423
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5424 ### MISMATCHES
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5425 if (defined $mismatches){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5426 if ($bowtie2){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5427 if ($mismatches == 0 or $mismatches == 1){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5428 push @bowtie_options,"-N $mismatches";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5429 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5430 else{
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5431 die "Please set the number of multiseed mismatches for Bowtie 2 with '-N <int>' (where <int> can be 0 or 1)\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5432 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5433 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5434 else{
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5435 if ($mismatches >= 0 and $mismatches <= 3){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5436 push @bowtie_options,"-n $mismatches";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5437 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5438 else{
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5439 die "Please set the number of seed mismatches for Bowtie 1 with '-n <int>' (where <int> can be 0,1,2 or 3)\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5440 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5441 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5442 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5443 else{
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5444 unless ($bowtie2){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5445 push @bowtie_options,"-n 1"; # setting -n to 1 by default (for use with Bowtie only) because it is much quicker than the default mode of -n 2
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5446 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5447 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5448
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5449 ### SEED LENGTH
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5450 if (defined $seed_length){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5451 if ($bowtie2){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5452 push @bowtie_options,"-L $seed_length";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5453 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5454 else{
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5455 push @bowtie_options,"-l $seed_length";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5456 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5457 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5458
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5459 ### MISMATCH CEILING
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5460 if (defined $ceiling){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5461 die "The option '-e' is not compatible with Bowtie 2. Please respecify options\n" if ($bowtie2);
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5462 push @bowtie_options,"-e $ceiling";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5463 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5464
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5465
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5466 ### BOWTIE 2 EFFORT OPTIONS
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5467
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5468 ### CONSECUTIVE SEED EXTENSION FAILS
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5469 if (defined $seed_extension_fails){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5470 die "The option '-D <int>' is only available when using Bowtie 2\n\n" unless ($bowtie2);
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5471 push @bowtie_options,"-D $seed_extension_fails";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5472 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5473
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5474 ### RE-SEEDING REPETITIVE SEEDS
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5475 if (defined $reseed_repetitive_seeds){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5476 die "The option '-R <int>' is only available when using Bowtie 2\n\n" unless ($bowtie2);
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5477 push @bowtie_options,"-R $reseed_repetitive_seeds";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5478 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5479
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5480
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5481 ### BOWTIE 2 SCORING OPTIONS
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5482 if ($score_min){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5483 die "The option '--score_min <func>' is only available when using Bowtie 2\n\n" unless ($bowtie2);
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5484 unless ($score_min =~ /^L,.+,.+$/){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5485 die "The option '--score_min <func>' needs to be in the format <L,value,value> . Please consult \"setting up functions\" in the Bowtie 2 manual for further information\n\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5486 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5487 push @bowtie_options,"--score-min $score_min";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5488 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5489 else{
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5490 if ($bowtie2){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5491 push @bowtie_options,"--score-min L,0,-0.2"; # default setting, more stringent than normal Bowtie2
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5492 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5493 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5494
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5495 ### BOWTIE 2 READ GAP OPTIONS
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5496 if ($rdg){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5497 die "The option '--rdg <int1>,<int2>' is only available when using Bowtie 2\n\n" unless ($bowtie2);
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5498 unless ($rdg =~ /^.+,.+$/){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5499 die "The option '--rdg <int1>,<int2>' needs to be in the format <integer,integer> . Please consult \"setting up functions\" in the Bowtie 2 manual for further information\n\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5500 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5501 push @bowtie_options,"--rdg $rdg";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5502 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5503
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5504 ### BOWTIE 2 REFERENCE GAP OPTIONS
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5505 if ($rfg){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5506 die "The option '--rfg <int1>,<int2>' is only available when using Bowtie 2\n\n" unless ($bowtie2);
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5507 unless ($rfg =~ /^.+,.+$/){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5508 die "The option '--rfg <int1>,<int2>' needs to be in the format <integer,integer> . Please consult \"setting up functions\" in the Bowtie 2 manual for further information\n\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5509 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5510 push @bowtie_options,"--rfg $rfg";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5511 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5512
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5513
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5514
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5515 ### BOWTIE 2 PARALLELIZATION OPTIONS
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5516 if (defined $parallel){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5517 die "The parallelization switch '-p' only works for Bowtie 2. Please respecify!" unless ($bowtie2);
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5518 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5519 if ($bowtie2){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5520 if ($parallel){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5521 die "Please select a value for -p of 2 or more!\n" unless ($parallel > 1);
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5522 push @bowtie_options,"-p $parallel";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5523 push @bowtie_options,'--reorder'; ## re-orders the bowtie 2 output so that it does match the input files. This is absolutely required for parallelization to work.
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5524 print "Each Bowtie 2 instance is going to be run with $parallel threads. Please monitor performance closely and tune down if needed!\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5525 sleep (2);
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5526 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5527 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5528
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5529 ### REPORTING OPTIONS
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5530
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5531 if ($bowtie2){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5532 push @bowtie_options,'--ignore-quals'; ## All mismatches will receive penalty for mismatches as if they were of high quality, which is 6 by default
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5533
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5534 ### Option -M is deprecated since Bowtie 2 version 2.0.0 beta7. I'll leave this option commented out for a while
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5535 if(defined $most_valid_alignments){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5536
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5537 warn "\nThe option -M is now deprecated (as of Bowtie 2 version 2.0.0 beta7). What used to be called -M mode is still the default mode. Use the -D and -R options to adjust the effort expended to find valid alignments.\n\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5538 # push @bowtie_options,"-M $most_valid_alignments";sleep (5);
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5539 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5540 # else{
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5541 # push @bowtie_options,'-M 10'; # the default behavior for Bowtie 2 is to report (and sort) up to 500 alignments for a given sequence
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5542 # }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5543 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5544 else{ # Because of the way Bismark works we will always use the reporting option -k 2 (report up to 2 valid alignments) for Bowtie 1
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5545 push @bowtie_options,'-k 2';
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5546 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5547
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5548 ### --BEST
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5549 if ($bowtie2){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5550 if ($best){ # Bowtie 2 does away with the concept of --best, so one can also not select --no-best when Bowtie 2 is to be used
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5551 die "The option '--no-best' is not compatible with Bowtie 2. Please respecify options\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5552 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5553 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5554 else{
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5555 # --best is the default option for Bowtie 1, specifying --no-best can turn it off (e.g. to speed up alignment process)
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5556 unless ($best){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5557 push @bowtie_options,'--best';
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5558 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5559 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5560
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5561 ### VANILLA BISMARK (BOWTIE 1) OUTPUT
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5562 if ($vanilla){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5563 if ($bowtie2){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5564 die "The options --bowtie2 and the --vanilla are not compatible. Please respecify!\n\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5565 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5566 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5567 else{
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5568 $vanilla = 0;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5569 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5570
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5571 ### PAIRED-END MAPPING
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5572 if ($mates1){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5573 my @mates1 = (split (/,/,$mates1));
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5574 die "Paired-end mapping requires the format: -1 <mates1> -2 <mates2>, please respecify!\n" unless ($mates2);
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5575 my @mates2 = (split(/,/,$mates2));
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5576 unless (scalar @mates1 == scalar @mates2){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5577 die "Paired-end mapping requires the same amounnt of mate1 and mate2 files, please respecify! (format: -1 <mates1> -2 <mates2>)\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5578 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5579 while (1){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5580 my $mate1 = shift @mates1;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5581 my $mate2 = shift @mates2;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5582 last unless ($mate1 and $mate2);
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5583 push @filenames,"$mate1,$mate2";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5584 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5585 if ($bowtie2){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5586 push @bowtie_options,'--no-mixed'; ## Bismark default: stops Bowtie 2 from looking for single-end alignments if it can't find concordant or discordant alignments
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5587 push @bowtie_options,'--no-discordant';## Bismark default: stops Bowtie 2 from looking for discordant alignments if it can't find concordant ones
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5588 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5589 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5590 elsif ($mates2){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5591 die "Paired-end mapping requires the format: -1 <mates1> -2 <mates2>, please respecify!\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5592 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5593
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5594 ### SINGLE-END MAPPING
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5595 # Single-end mapping will be performed if no mate pairs for paired-end mapping have been specified
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5596 my $singles;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5597 unless ($mates1 and $mates2){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5598 $singles = join (',',@ARGV);
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5599 unless ($singles){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5600 die "\nNo filename supplied! Please specify one or more files for single-end Bismark mapping!\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5601 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5602 $singles =~ s/\s/,/g;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5603 @filenames = (split(/,/,$singles));
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5604 warn "\nFiles to be analysed:\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5605 warn "@filenames\n\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5606 sleep (3);
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5607 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5608
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5609 ### MINIMUM INSERT SIZE (PAIRED-END ONLY)
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5610 if (defined $minins){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5611 die "-I/--minins can only be used for paired-end mapping!\n\n" if ($singles);
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5612 push @bowtie_options,"--minins $minins";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5613 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5614
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5615 ### MAXIMUM INSERT SIZE (PAIRED-END ONLY)
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5616 if (defined $maxins){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5617 die "-X/--maxins can only be used for paired-end mapping!\n\n" if ($singles);
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5618 push @bowtie_options,"--maxins $maxins";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5619 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5620 else{
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5621 unless ($singles){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5622 push @bowtie_options,'--maxins 500';
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5623 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5624 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5625
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5626 ### QUIET prints nothing besides alignments (suppresses warnings)
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5627 if ($quiet){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5628 push @bowtie_options,'--quiet';
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5629 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5630
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5631 ### CHUNKMBS needed to be increased to avoid memory exhaustion warnings for Bowtie 1, particularly for --best (and paired-end) alignments
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5632 unless ($bowtie2){ # Bowtie 2 does not have a chunkmbs option
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5633 if (defined $chunk){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5634 push @bowtie_options,"--chunkmbs $chunk";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5635 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5636 else{
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5637 push @bowtie_options,'--chunkmbs 512'; ## setting the default to 512MB (up from 64 default)
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5638 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5639 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5640
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5641
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5642 ### SUMMARY OF ALL BOWTIE OPTIONS
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5643 my $bowtie_options = join (' ',@bowtie_options);
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5644
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5645
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5646 ### STRAND-SPECIFIC LIBRARIES
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5647 my $directional;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5648 if ($non_directional){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5649 print "Library was specified to be not strand-specific (non-directional), therefore alignments to all four possible bisulfite strands (OT, CTOT, OB and CTOB) will be reported.\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5650 sleep (3);
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5651 $directional = 0;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5652 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5653 else{
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5654 print "Library is assumed to be strand-specific (directional), alignments to strands complementary to the original top or bottom strands will be ignored (i.e. not performed!).\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5655 sleep (3);
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5656 $directional = 1; # Changed this to being the default behaviour
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5657 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5658
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5659 ### UNMAPPED SEQUENCE OUTPUT
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5660 $unmapped = 0 unless ($unmapped);
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5661
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5662 ### AMBIGUOUS ALIGNMENT SEQUENCE OUTPUT
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5663 $multi_map = 0 unless ($multi_map);
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5664
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5665
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5666 ### OUTPUT DIRECTORY
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5667
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5668 chdir $parent_dir or die "Failed to move back to current working directory\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5669 if ($output_dir){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5670 unless ($output_dir =~ /\/$/){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5671 $output_dir =~ s/$/\//;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5672 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5673
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5674 if (chdir $output_dir){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5675 $output_dir = getcwd; # making the path absolute
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5676 unless ($output_dir =~ /\/$/){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5677 $output_dir =~ s/$/\//;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5678 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5679 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5680 else{
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5681 mkdir $output_dir or die "Unable to create directory $output_dir $!\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5682 warn "Created output directory $output_dir!\n\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5683 chdir $output_dir or die "Failed to move to $output_dir\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5684 $output_dir = getcwd; # making the path absolute
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5685 unless ($output_dir =~ /\/$/){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5686 $output_dir =~ s/$/\//;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5687 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5688 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5689 warn "Output will be written into the directory: $output_dir\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5690 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5691 else{
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5692 $output_dir = '';
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5693 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5694
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5695 ### TEMPORARY DIRECTORY for C->T and G->A transcribed files
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5696
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5697 chdir $parent_dir or die "Failed to move back to current working directory\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5698 if ($temp_dir){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5699 warn "\nUsing temp directory: $temp_dir\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5700 unless ($temp_dir =~ /\/$/){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5701 $temp_dir =~ s/$/\//;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5702 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5703
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5704 if (chdir $temp_dir){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5705 $temp_dir = getcwd; # making the path absolute
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5706 unless ($temp_dir =~ /\/$/){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5707 $temp_dir =~ s/$/\//;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5708 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5709 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5710 else{
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5711 mkdir $temp_dir or die "Unable to create directory $temp_dir $!\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5712 warn "Created temporary directory $temp_dir!\n\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5713 chdir $temp_dir or die "Failed to move to $temp_dir\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5714 $temp_dir = getcwd; # making the path absolute
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5715 unless ($temp_dir =~ /\/$/){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5716 $temp_dir =~ s/$/\//;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5717 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5718 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5719 warn "Temporary files will be written into the directory: $temp_dir\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5720 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5721 else{
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5722 $temp_dir = '';
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5723 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5724
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5725
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5726 return ($genome_folder,$CT_index_basename,$GA_index_basename,$path_to_bowtie,$sequence_format,$bowtie_options,$directional,$unmapped,$multi_map,$phred64,$solexa,$output_dir,$bowtie2,$vanilla,$sam_no_hd,$skip,$qupto,$temp_dir);
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5727 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5728
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5729
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5730
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
### Writes the SAM header section to the OUT filehandle:
###   - one @HD line (format version 1.0, sort order "unsorted"),
###   - one @SQ line per entry of the file-level %chromosomes hash, using the hash key as
###     the sequence name and the length of the stored value as the sequence length,
###   - one @PG line recording the Bismark version and the command line.
### Takes no arguments; reads the file-level %chromosomes, $bismark_version and $command_line.
sub generate_SAM_header{
  print OUT "\@HD\tVN:1.0\tSO:unsorted\n";          # @HD = header, VN = version, SO = sort order

  # The order returned by keys() is unspecified and may differ from run to run; sorting
  # the names makes the @SQ section reproducible for identical input.
  foreach my $chr (sort keys %chromosomes){
    my $length = length ($chromosomes{$chr});
    print OUT "\@SQ\tSN:$chr\tLN:$length\n";        # @SQ = sequence, SN = seq name, LN = length
  }

  # @PG = program, ID = unique identifier, VN = program version, CL = command line
  print OUT "\@PG\tID:Bismark\tVN:$bismark_version\tCL:\"bismark $command_line\"\n";
}
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5739
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5740 ### I would like to thank the following individuals for their valuable contributions to the Bismark SAM output format:
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5741 ### O. Tam (Sep 2010), C. Whelan (2011), E. Vidal (2011), T. McBryan (2011), P. Hickey (2011)
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5742
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
### Prints a single-end alignment as one tab-separated SAM record to the OUT filehandle.
###
### Arguments:
###   $id                       - read identifier; also the key into $methylation_call_params
###   $actual_seq               - the read sequence
###   $methylation_call_params  - hash reference holding, per read ID, the alignment details
###                               read below (alignment_strand, chromosome, position,
###                               unmodified_genomic_sequence, methylation_call,
###                               read_conversion, genome_conversion and, in Bowtie 2 mode,
###                               CIGAR and indels)
###   $qual                     - quality string of the read
###
### Dies if the strand is neither '+' nor '-', or if the strand does not fit the
### read/genome conversion combination.
### Relies on the file-level $bowtie2 flag and on the helpers revcomp(), hemming_dist() and
### make_mismatch_string().
sub single_end_SAM_output{
  my ($id,$actual_seq,$methylation_call_params,$qual) = @_;
  my $strand            = $methylation_call_params->{$id}->{alignment_strand};
  my $chr               = $methylation_call_params->{$id}->{chromosome};
  my $start             = $methylation_call_params->{$id}->{position};
  my $ref_seq           = $methylation_call_params->{$id}->{unmodified_genomic_sequence};
  my $methcall          = $methylation_call_params->{$id}->{methylation_call};
  my $read_conversion   = $methylation_call_params->{$id}->{read_conversion};
  my $genome_conversion = $methylation_call_params->{$id}->{genome_conversion};

  ### This is a description of the bitwise FLAG field which needs to be set for the SAM file taken from: "The SAM Format Specification (v1.4-r985), September 7, 2011"
  ## FLAG: bitwise FLAG. Each bit is explained in the following table:
  ## Bit    Description                                                Comment                                Value
  ## 0x1    template has multiple segments in sequencing               0: single-end 1: paired end            value: 2**0 (  1)
  ## 0x2    each segment properly aligned according to the aligner     true only for paired-end alignments    value: 2**1 (  2)
  ## 0x4    segment unmapped                                           ---                                    ---
  ## 0x8    next segment in the template unmapped                      ---                                    ---
  ## 0x10   SEQ being reverse complemented                                                                    value: 2**4 ( 16)
  ## 0x20   SEQ of the next segment in the template being reversed                                            value: 2**5 ( 32)
  ## 0x40   the first segment in the template                          read 1                                 value: 2**6 ( 64)
  ## 0x80   the last segment in the template                           read 2                                 value: 2**7 (128)
  ## 0x100  secondary alignment                                        ---                                    ---
  ## 0x200  not passing quality controls                               ---                                    ---
  ## 0x400  PCR or optical duplicate                                   ---                                    ---

  #####

  my $flag;                                                  # FLAG variable used for SAM format.
  if ($strand eq "+"){
    if ($read_conversion eq 'CT' and $genome_conversion eq 'CT'){
      $flag = 0;                                             # 0 for "+" strand (OT)
    }
    elsif ($read_conversion eq 'GA' and $genome_conversion eq 'GA'){
      $flag = 16;                                            # 16 for "-" strand (CTOB, yields information for the original bottom strand)
    }
    else{
      die "Unexpected strand and read/genome conversion: strand: $strand, read conversion: $read_conversion, genome_conversion: $genome_conversion\n\n";
    }
  }
  elsif ($strand eq "-"){
    if ($read_conversion eq 'CT' and $genome_conversion eq 'GA'){
      $flag = 16;                                            # 16 for "-" strand (OB)
    }
    elsif ($read_conversion eq 'GA' and $genome_conversion eq 'CT'){
      $flag = 0;                                             # 0 for "+" strand (CTOT, yields information for the original top strand)
    }
    else{
      die "Unexpected strand and read/genome conversion: strand: $strand, read conversion: $read_conversion, genome_conversion: $genome_conversion\n\n";
    }
  }
  else{
    die "Unexpected strand information: $strand\n\n";
  }

  #####

  my $mapq = 255;                                            # Assume mapping quality is unavailable

  #####

  my $cigar;
  if ($bowtie2){
    $cigar = $methylation_call_params->{$id}->{CIGAR};       # Actual CIGAR string reported by Bowtie 2
  }
  else{
    $cigar = length($actual_seq) . "M";                      # Bowtie 1 output does not contain indels (only matches and mismatches)
  }

  #####

  # The mate-related fields are fixed placeholders for single-end reads
  my $rnext = "*";                                           # Paired-end variable
  my $pnext = 0;                                             # Paired-end variable
  my $tlen  = 0;                                             # Paired-end variable

  #####

  # The stored genomic sequence is 2 bp longer than the read; trim it back to read length
  if ($read_conversion eq 'CT'){
    $ref_seq = substr($ref_seq, 0, length($ref_seq) - 2);    # Removes additional nucleotides from the 3' end. This only works for the original top or bottom strands
  }
  else{
    $ref_seq = substr($ref_seq, 2, length($ref_seq) - 2);    # Removes additional nucleotides from the 5' end. This works for the complementary strands in non-directional libraries
  }

  if ($strand eq '-'){
    $actual_seq = revcomp($actual_seq);                      # Sequence represented on the forward genomic strand
    $ref_seq    = revcomp($ref_seq);                         # Required for comparison with actual sequence
    $qual       = scalar reverse $qual;                      # if the sequence was reverse-complemented the quality string needs to be reversed as well
  }

  #####

  # Edit distance to the reference, i.e. minimal number of one-nucleotide edits needed to
  # transform the read string into the reference string.
  my $hemming_dist = hemming_dist($actual_seq,$ref_seq);
  if ($bowtie2){
    $hemming_dist += $methylation_call_params->{$id}->{indels}; # Adding the number of inserted/deleted bases which we parsed while getting the genomic sequence
  }

  my $NM_tag = "NM:i:$hemming_dist";                         # Optional tag NM: edit distance based on nucleotide differences

  #####

  my $XX_tag = make_mismatch_string($actual_seq, $ref_seq);  # Optional tag XX: string providing mismatched reference bases in the alignment (NO indel information!)

  #####

  my $XM_tag;                                                # Optional tag XM: Methylation Call String
  if ($strand eq '+'){
    $XM_tag = "XM:Z:$methcall";
  }
  elsif ($strand eq '-'){
    $XM_tag = 'XM:Z:' . scalar reverse $methcall;            # if the sequence was reverse-complemented the methylation call string needs to be reversed as well
  }

  #####

  my $XR_tag = "XR:Z:$read_conversion";                      # Optional tag XR: Read Conversion

  #####

  my $XG_tag = "XG:Z:$genome_conversion";                    # Optional tag XG: Genome Conversion

  #####

  # SAM format: QNAME, FLAG, RNAME, 1-based POS, MAPQ, CIGAR, RNEXT, PNEXT, TLEN, SEQ, QUAL, optional fields
  print OUT join("\t",($id,$flag,$chr,$start,$mapq,$cigar,$rnext,$pnext,$tlen,$actual_seq,$qual,$NM_tag,$XX_tag,$XM_tag,$XR_tag,$XG_tag)),"\n";
}
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5876
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5877
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5878 sub paired_end_SAM_output{
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5879 my ($id,$actual_seq_1,$actual_seq_2,$methylation_call_params,$qual_1,$qual_2) = @_;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5880 my $strand_1 = $methylation_call_params->{$id}->{alignment_read_1}; # Bowtie 1 only reports the read 1 alignment strand
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5881 my $strand_2 = $methylation_call_params->{$id}->{alignment_read_2};
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5882 my $chr = $methylation_call_params->{$id}->{chromosome};
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5883 my $ref_seq_1 = $methylation_call_params->{$id}->{unmodified_genomic_sequence_1};
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5884 my $ref_seq_2 = $methylation_call_params->{$id}->{unmodified_genomic_sequence_2};
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5885 my $methcall_1 = $methylation_call_params->{$id}->{methylation_call_1};
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5886 my $methcall_2 = $methylation_call_params->{$id}->{methylation_call_2};
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5887 my $read_conversion_1 = $methylation_call_params->{$id}->{read_conversion_1};
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5888 my $read_conversion_2 = $methylation_call_params->{$id}->{read_conversion_2};
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5889 my $genome_conversion = $methylation_call_params->{$id}->{genome_conversion};
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5890 my $number_of_mismatches_1 = $methylation_call_params->{$id}->{number_of_mismatches_1}; # only needed for custom allele-specific output, not the default!
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5891 my $number_of_mismatches_2 = $methylation_call_params->{$id}->{number_of_mismatches_2};
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5892
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5893 my $id_1 = $id.'/1';
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5894 my $id_2 = $id.'/2';
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5895
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5896 # Allows all degenerate nucleotide sequences in reference genome
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5897 die "Reference sequence ($ref_seq_1) contains invalid nucleotides!\n" if $ref_seq_1 =~ /[^ACTGNRYMKSWBDHV]/i;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5898 die "Reference sequence ($ref_seq_2) contains invalid nucleotides!\n" if $ref_seq_2 =~ /[^ACTGNRYMKSWBDHV]/i;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5899
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5900 my $index; # used to store the srand origin of the alignment in a less convoluted way
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5901
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5902 if ($read_conversion_1 eq 'CT' and $genome_conversion eq 'CT'){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5903 $index = 0; ## this is OT (original top strand)
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5904 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5905 elsif ($read_conversion_1 eq 'GA' and $genome_conversion eq 'GA'){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5906 $index = 1; ## this is CTOB (complementary to OB)
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5907 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5908 elsif ($read_conversion_1 eq 'GA' and $genome_conversion eq 'CT'){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5909 $index = 2; ## this is CTOT (complementary to OT)
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5910 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5911 elsif ($read_conversion_1 eq 'CT' and $genome_conversion eq 'GA'){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5912 $index = 3; ## this is OB (original bottom)
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5913 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5914 else {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5915 die "Unexpected combination of read 1 and genome conversion: $read_conversion_1 / $genome_conversion\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5916 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5917
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5918 ### we need to remove 2 bp of the genomic sequence as we were extracting read + 2bp long fragments to make a methylation call at the
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5919 ### first or last position.
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5920
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5921 if ($index == 0 or $index == 3){ # OT or OB
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5922 $ref_seq_1 = substr($ref_seq_1,0,length($ref_seq_1)-2);
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5923 $ref_seq_2 = substr($ref_seq_2,2,length($ref_seq_2)-2);
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5924 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5925 else{ # CTOT or CTOB
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5926 $ref_seq_1 = substr($ref_seq_1,2,length($ref_seq_1)-2);
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5927 $ref_seq_2 = substr($ref_seq_2,0,length($ref_seq_2)-2);
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5928 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5929
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5930 #####
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5931
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5932 my $start_read_1;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5933 my $start_read_2;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5934 # adjusting end positions
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5935
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5936 if ($bowtie2){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5937 $start_read_1 = $methylation_call_params->{$id}->{position_1};
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5938 $start_read_2 = $methylation_call_params->{$id}->{position_2};
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5939 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5940 else{ # Bowtie 1 output. $strand_1 stores the alignment of Read 1
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5941 if ($strand_1 eq '+'){ # Read 1 aligns to the + strand
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5942 $start_read_1 = $methylation_call_params->{$id}->{start_seq_1};
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5943 $start_read_2 = $methylation_call_params->{$id}->{alignment_end} - length ($actual_seq_2) + 1;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5944 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5945 else{ # read 1 is on the - strand
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5946 $start_read_1 = $methylation_call_params->{$id}->{alignment_end} - length ($actual_seq_1) + 1;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5947 $start_read_2 = $methylation_call_params->{$id}->{start_seq_1};
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5948 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5949 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5950
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5951 #####
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5952
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5953 my $end_read_1;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5954 my $end_read_2;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5955 # adjusting end positions
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5956
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5957 if ($bowtie2){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5958 $end_read_1 = $methylation_call_params->{$id}->{end_position_1};
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5959 $end_read_2 = $methylation_call_params->{$id}->{end_position_2};
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5960 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5961 else{ # Bowtie 1 output. $strand_1 stores the alignment of Read 1
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5962 if ($strand_1 eq '+'){ # Read 1 aligns to the + strand
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5963 $end_read_1 = $methylation_call_params->{$id}->{start_seq_1} + length ($actual_seq_1)-1;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5964 $end_read_2 = $methylation_call_params->{$id}->{alignment_end};
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5965 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5966 else{
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5967 $end_read_1 = $methylation_call_params->{$id}->{alignment_end};
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5968 $end_read_2 = $methylation_call_params->{$id}->{start_seq_1} + length ($actual_seq_2)-1;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5969 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5970 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5971
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5972 #####
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5973
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5974 ### This is a description of the bitwise FLAG field which needs to be set for the SAM file taken from: "The SAM Format Specification (v1.4-r985), September 7, 2011"
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5975 ## FLAG: bitwise FLAG. Each bit is explained in the following table:
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5976 ## Bit Description Comment Value
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5977 ## 0x1 template having multiple segments in sequencing 0: single-end 1: paired end value: 2^^0 ( 1)
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5978 ## 0x2 each segment properly aligned according to the aligner true only for paired-end alignments value: 2^^1 ( 2)
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5979 ## 0x4 segment unmapped --- ---
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5980 ## 0x8 next segment in the template unmapped --- ---
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5981 ## 0x10 SEQ being reverse complemented - strand alignment value: 2^^4 ( 16)
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5982 ## 0x20 SEQ of the next segment in the template being reversed + strand alignment value: 2^^5 ( 32)
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5983 ## 0x40 the first segment in the template read 1 value: 2^^6 ( 64)
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5984 ## 0x80 the last segment in the template read 2 value: 2^^7 (128)
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5985 ## 0x100 secondary alignment --- ---
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5986 ## 0x200 not passing quality controls --- ---
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5987 ## 0x400 PCR or optical duplicate --- ---
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5988
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5989 ### As the FLAG values do not consider that there might be 4 different bisulfite strands of DNA, we are trying to make FLAG tags which take the strand identity into account
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5990
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5991 # strands OT and CTOT will be treated as aligning to the top strand (both sequences are scored as aligning to the top strand)
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5992 # strands OB and CTOB will be treated as aligning to the bottom strand (both sequences are scored as reverse complemented sequences)
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5993
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5994 my $flag_1; # FLAG variable used for SAM format
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5995 my $flag_2;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5996
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5997 if ($index == 0){ # OT
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5998 $flag_1 = 67; # Read 1 is on the + strand (1+2+64) (Read 2 is technically reverse-complemented, but we do not score it)
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5999 $flag_2 = 131; # Read 2 is on - strand but informative for the OT (1+2+128)
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6000 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6001 elsif ($index == 1){ # CTOB
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6002 $flag_1 = 115; # Read 1 is on the + strand, we score for OB (1+2+16+32+64)
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6003 $flag_2 = 179; # Read 2 is on the - strand (1+2+16+32+128)
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6004 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6005 elsif ($index == 2){ # CTOT
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6006 $flag_1 = 67; # Read 1 is on the - strand (CTOT) strand, but we score it for OT (1+2+64)
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6007 $flag_2 = 131; # Read 2 is on the + strand, score it for OT (1+2+128)
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6008 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6009 elsif ($index == 3){ # OB
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6010 $flag_1 = 115; # Read 1 is on the - strand, we score for OB (1+2+16+32+64)
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6011 $flag_2 = 179; # Read 2 is on the + strand (1+2+16+32+128)
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6012 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6013
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6014 #####
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6015
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6016 my $mapq = 255; # Mapping quality is unavailable
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6017
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6018 #####
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6019
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6020 my $cigar_1;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6021 my $cigar_2;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6022
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6023 if ($bowtie2){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6024 $cigar_1 = $methylation_call_params->{$id}->{CIGAR_1}; # Actual CIGAR string reported by Bowtie 2
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6025 $cigar_2 = $methylation_call_params->{$id}->{CIGAR_2};
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6026 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6027 else{
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6028 $cigar_1 = length($actual_seq_1) . "M"; # Assume no indels for Bowtie 1 mapping (only matches and mismatches)
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6029 $cigar_2 = length($actual_seq_2) . "M";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6030 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6031
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6032 #####
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6033
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6034 my $rnext = '='; # Chromosome of mate; applies to both reads
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6035
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6036 #####
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6037
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6038 my $pnext_1 = $start_read_2; # Leftmost position of mate
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6039 my $pnext_2 = $start_read_1;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6040
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6041 #####
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6042
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6043 my $tlen_1; # signed observed Template LENgth (or inferred fragment size)
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6044 my $tlen_2;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6045
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6046 if ($bowtie2){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6047
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6048 if ($start_read_1 <= $start_read_2){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6049
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6050 # Read 1 alignment is leftmost
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6051
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6052 if ($end_read_2 >= $end_read_1){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6053
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6054 # -------------------------> read 1 reads overlapping
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6055 # <------------------------- read 2
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6056 #
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6057 # or
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6058 #
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6059 # -------------------------> read 1
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6060 # <----------------------- read 2 read 2 contained within read 1
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6061 #
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6062 # or
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6063 #
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6064 # -------------------------> read 1 reads 1 and 2 exactly overlapping
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6065 # <------------------------- read 2
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6066 #
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6067
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6068 # dovetailing of reads is not enabled for Bowtie 2 alignments
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6069
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6070 $tlen_1 = $end_read_2 - $start_read_1 + 1; # Leftmost read has a + sign,
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6071 $tlen_2 = $start_read_1 - $end_read_2 - 1; # Rightmost read has a - sign
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6072 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6073 elsif ($end_read_2 < $end_read_1){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6074
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6075 # -------------------------> read 1
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6076 # <----------- read 2 read 2 contained within read 1
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6077 #
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6078 # or
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6079 #
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6080 # -------------------------> read 1
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6081 # <----------- read 2 read 2 contained within read 1
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6082
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6083 # start and end of read 2 are fully contained within read 1
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6084 $tlen_1 = 0; # Set as 0 when the information is unavailable
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6085 $tlen_2 = 0; # Set as 0 when the information is unavailable
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6086 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6087
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6088 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6089
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6090 elsif ($start_read_2 < $start_read_1){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6091
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6092 if ($end_read_1 >= $end_read_2){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6093
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6094 # Read 2 alignment is leftmost
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6095
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6096 # -------------------------> read 2 reads overlapping
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6097 # <------------------------- read 1
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6098 #
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6099 # or
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6100 #
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6101 # -------------------------> read 2
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6102 # <----------------------- read 1 read 1 contained within read 2
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6103 #
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6104 #
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6105
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6106 $tlen_2 = $end_read_1 - $start_read_2 + 1; # Leftmost read has a + sign,
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6107 $tlen_1 = $start_read_2 - $end_read_1 - 1; # Rightmost read has a - sign
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6108 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6109 elsif ($end_read_1 < $end_read_2){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6110
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6111 # -------------------------> read 2
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6112 # <----------- read 1 read 1 contained within read 2
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6113 #
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6114 # or
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6115 #
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6116 # -------------------------> read 2
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6117 # <----------- read 1 read 1 contained within read 2
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6118
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6119 # start and end of read 1 are fully contained within read 2
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6120 $tlen_1 = 0; # Set as 0 when the information is unavailable
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6121 $tlen_2 = 0; # Set as 0 when the information is unavailable
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6122 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6123 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6124 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6125
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6126 else{ # Bowtie 1
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6127
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6128 if ($end_read_2 >= $end_read_1){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6129 # Read 1 alignment is leftmost
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6130 # -------------------------> read 1
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6131 # <------------------------- read 2
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6132 # this is the most extreme case for Bowtie 1 alignments, reads do not contain each other, also no dovetailing
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6133
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6134 $tlen_1 = $end_read_2 - $start_read_1 + 1; # Leftmost read has a + sign,
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6135 $tlen_2 = $start_read_1 - $end_read_2 - 1; # Rightmost read has a - sign
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6136 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6137 else{
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6138 # Read 2 alignment is leftmost
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6139 # -------------------------> read 2
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6140 # <------------------------- read 1
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6141 # this is the most extreme case for Bowtie 1 alignments, reads do not contain each other, also no dovetailing
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6142
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6143 $tlen_2 = $end_read_1 - $start_read_2 + 1; # Leftmost read has a + sign,
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6144 $tlen_1 = $start_read_2 - $end_read_1 - 1; # Rightmost read has a - sign
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6145 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6146 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6147
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6148 #####
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6149
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6150 # adjusting the strand of the sequence before we use them to generate mismatch strings
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6151 if ($strand_1 eq '-'){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6152 $actual_seq_1 = revcomp($actual_seq_1); # Sequence represented on the forward genomic strand
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6153 $ref_seq_1 = revcomp($ref_seq_1); # Required for comparison with actual sequence
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6154 $qual_1 = reverse $qual_1; # we need to reverse the quality string as well
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6155 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6156 if ($strand_2 eq '-'){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6157 $actual_seq_2 = revcomp($actual_seq_2); # Mate sequence represented on the forward genomic strand
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6158 $ref_seq_2 = revcomp($ref_seq_2); # Required for comparison with actual sequence
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6159 $qual_2 = reverse $qual_2; # If the sequence gets reverse complemented we reverse the quality string as well
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6160 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6161
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6162 # print "$actual_seq_1\n$ref_seq_1\n\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6163 # print "$actual_seq_2\n$ref_seq_2\n\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6164
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6165 #####
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6166
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6167 my $hemming_dist_1 = hemming_dist($actual_seq_1,$ref_seq_1); # Minimal number of one-nucleotide edits needed to transform the read string into the reference sequence
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6168 my $hemming_dist_2 = hemming_dist($actual_seq_2,$ref_seq_2);
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6169 if ($bowtie2){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6170 $hemming_dist_1 += $methylation_call_params->{$id}->{indels_1}; # Adding the number of inserted/deleted bases which we parsed while getting the genomic sequence
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6171 $hemming_dist_2 += $methylation_call_params->{$id}->{indels_2}; # Adding the number of inserted/deleted bases which we parsed while getting the genomic sequence
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6172 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6173 my $NM_tag_1 = "NM:i:$hemming_dist_1"; # Optional tag NM: edit distance based on nucleotide differences
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6174 my $NM_tag_2 = "NM:i:$hemming_dist_2"; # Optional tag NM: edit distance based on nucleotide differences
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6175
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6176 #####
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6177
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6178 my $XX_tag_1 = make_mismatch_string($actual_seq_1,$ref_seq_1); # Optional tag XX: String providing mismatched reference bases in the alignment (NO indel information!)
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6179 my $XX_tag_2 = make_mismatch_string($actual_seq_2,$ref_seq_2);
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6180
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6181 #####
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6182
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6183 my $XM_tag_1; # Optional tag XM: Methylation call string
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6184 my $XM_tag_2;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6185
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6186 if ($strand_1 eq '-'){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6187 $XM_tag_1 = 'XM:Z:'.reverse $methcall_1; # Needs to be reversed if the sequence was reverse complemented
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6188 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6189 else{
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6190 $XM_tag_1 = "XM:Z:$methcall_1";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6191 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6192
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6193 if ($strand_2 eq '-'){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6194 $XM_tag_2 = 'XM:Z:'.reverse $methcall_2; # Needs to be reversed if the sequence was reverse complemented
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6195 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6196 else{
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6197 $XM_tag_2 = "XM:Z:$methcall_2";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6198 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6199
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6200 #####
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6201
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6202 my $XR_tag_1 = "XR:Z:$read_conversion_1"; # Optional tag XR: Read 1 conversion state
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6203 my $XR_tag_2 = "XR:Z:$read_conversion_2"; # Optional tag XR: Read 2 conversion state
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6204
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6205 #####
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6206
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6207 my $XG_tag = "XG:Z:$genome_conversion"; # Optional tag XG: Genome Conversion state; valid for both reads
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6208
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6209 #####
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6210
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6211 # SAM format: QNAME, FLAG, RNAME, 1-based POS, MAPQ, CIGAR, RNEXT, PNEXT, TLEN, SEQ, QUAL, optional fields
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6212 print OUT join("\t", ($id_1, $flag_1, $chr, $start_read_1, $mapq, $cigar_1, $rnext, $pnext_1, $tlen_1, $actual_seq_1, $qual_1, $NM_tag_1, $XX_tag_1, $XM_tag_1,$XR_tag_1,$XG_tag)), "\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6213 print OUT join("\t", ($id_2, $flag_2, $chr, $start_read_2, $mapq, $cigar_2, $rnext, $pnext_2, $tlen_2, $actual_seq_2, $qual_2, $NM_tag_2, $XX_tag_2, $XM_tag_2,$XR_tag_2,$XG_tag)), "\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6214 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6215
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
# Returns the reverse complement of a nucleotide sequence.
#
# Argument: the sequence string. Dies if it is missing or evaluates to false
#           (undef, the empty string or "0").
# Returns:  the reversed sequence with A<->T and C<->G exchanged. Lower-case
#           a/c/t/g are complemented to UPPER-case bases; every other
#           character (e.g. N) is passed through unchanged.
sub revcomp{
  my ($sequence) = @_;
  die "Missing seq to reverse complement\n" unless $sequence;

  # 'scalar' forces string reversal (list-context reverse would be a no-op here)
  my $complemented = scalar reverse $sequence;
  $complemented =~ tr/ACTGactg/TGACTGAC/;

  return $complemented;
}
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6222
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
# Counts the positions at which a read differs from its reference sequence
# (Hamming distance; the historical spelling of the sub name is kept because
# callers use it).
#
# Arguments: 1) the read sequence, 2) the reference sequence.
# Returns:   the number of read positions that do not match the reference.
#            Read positions that lie beyond the end of a shorter reference
#            count as mismatches; surplus reference bases are ignored.
#
# Unlike a per-character comparison, this never touches an undefined element
# when the reference is shorter than the read (or an argument is undef), so
# no "uninitialized value" warnings are emitted for such input.
sub hemming_dist{
  my ($actual_seq, $ref_seq) = @_;
  $actual_seq = '' unless defined $actual_seq;
  $ref_seq    = '' unless defined $ref_seq;

  my $read_length = length $actual_seq;
  my $ref_length  = length $ref_seq;
  my $overlap     = $read_length < $ref_length ? $read_length : $ref_length;

  # String XOR yields "\0" at every position where both sequences agree.
  # Only the shared prefix is compared so that padding never fakes a match.
  my $diff    = substr($actual_seq, 0, $overlap) ^ substr($ref_seq, 0, $overlap);
  my $matches = ($diff =~ tr/\0//); # tr with an empty replacement list only counts

  return $read_length - $matches;
}
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6232
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
# Builds the optional SAM tag XX, which lists the reference bases at every
# mismatched position of an alignment, separated by the number of matching
# bases in between (NO indel information!).
#
# Arguments: 1) the read sequence, 2) the reference sequence. Dies if either
#            is missing or evaluates to false.
# Returns:   the tag string, e.g. "XX:Z:1G2" for read ACGT vs reference AGGT.
#            Counts of 0 are omitted, so adjacent mismatches are written back
#            to back and a trailing mismatch is not followed by a number.
sub make_mismatch_string{
  my $actual_seq = shift or die "Missing actual sequence";
  my $ref_seq = shift or die "Missing reference sequence";
  my $XX_tag = "XX:Z:";
  my $ref_length = length($ref_seq);
  my $tmp = ($actual_seq ^ $ref_seq);                # Bitwise comparison: "\0" wherever both sequences agree
  my $prev_mm_pos = 0;
  while($tmp =~ /[^\0]/g){                           # Where bitwise comparison showed a difference
    my $mm_pos = pos($tmp);                          # 1-based position of the current mismatch
    my $nuc_match = $mm_pos - $prev_mm_pos - 1;      # Number of nucleotides that matched since the last mismatch
    $XX_tag .= $nuc_match if $nuc_match > 0;         # Ignore if mismatches are adjacent to each other

    # Append the reference nucleotide that differed from the read. Positions
    # past the end of the reference (read longer than reference) have no
    # reference base and are skipped. The base is appended directly instead
    # of going through a 'my $var = ... if COND' declaration, whose behaviour
    # is undefined in Perl.
    $XX_tag .= substr($ref_seq, $mm_pos - 1, 1) if $mm_pos <= $ref_length;

    $prev_mm_pos = $mm_pos;                          # Position of last mismatch
  }
  my $end_matches = $ref_length - $prev_mm_pos;      # Number of matches from last mismatch till end of sequence
  $XX_tag .= $end_matches if $end_matches > 0;       # Ignore if mismatch is at the end of sequence
  return $XX_tag;
}
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6250
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6251
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6252
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6253 sub print_helpfile{
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6254 print << "HOW_TO";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6255
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6256
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6257 This program is free software: you can redistribute it and/or modify
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6258 it under the terms of the GNU General Public License as published by
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6259 the Free Software Foundation, either version 3 of the License, or
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6260 (at your option) any later version.
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6261
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6262 This program is distributed in the hope that it will be useful,
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6263 but WITHOUT ANY WARRANTY; without even the implied warranty of
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6264 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6265 GNU General Public License for more details.
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6266 You should have received a copy of the GNU General Public License
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6267 along with this program. If not, see <http://www.gnu.org/licenses/>.
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6268
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6269
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6270
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6271 DESCRIPTION
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6272
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6273
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6274 The following is a brief description of command line options and arguments to control the Bismark
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6275 bisulfite mapper and methylation caller. Bismark takes in FastA or FastQ files and aligns the
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6276 reads to a specified bisulfite genome. Sequence reads are transformed into a bisulfite converted forward strand
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6277 version (C->T conversion) or into a bisulfite treated reverse strand (G->A conversion of the forward strand).
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6278 Each of these reads are then aligned to bisulfite treated forward strand index of a reference genome
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6279 (C->T converted) and a bisulfite treated reverse strand index of the genome (G->A conversion of the
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6280 forward strand, by doing this alignments will produce the same positions). These 4 instances of Bowtie (1 or 2)
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6281 are run in parallel. The sequence file(s) are then read in again sequence by sequence to pull out the original
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6282 sequence from the genome and determine if there were any protected C's present or not.
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6283
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6284 As of version 0.7.0 Bismark will only run 2 alignment threads for OT and OB in parallel, the 4 strand mode can be
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6285 re-enabled by using --non_directional.
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6286
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6287 The final output of Bismark is in SAM format by default. For Bowtie 1 one can also choose to report the old
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6288 'vanilla' output format, which is a single tab delimited file with all sequences that have a unique best
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6289 alignment to any of the 4 possible strands of a bisulfite PCR product. Both formats are described in more detail below.
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6290
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6291
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6292 USAGE: bismark [options] <genome_folder> {-1 <mates1> -2 <mates2> | <singles>}
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6293
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6294
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6295 ARGUMENTS:
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6296
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6297 <genome_folder> The path to the folder containing the unmodified reference genome
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6298 as well as the subfolders created by the Bismark_Genome_Preparation
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6299 script (/Bisulfite_Genome/CT_conversion/ and /Bisulfite_Genome/GA_conversion/).
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6300 Bismark expects one or more fastA files in this folder (file extension: .fa
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6301 or .fasta). The path can be relative or absolute.
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6302
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6303 -1 <mates1> Comma-separated list of files containing the #1 mates (filename usually includes
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6304 "_1"), e.g. flyA_1.fq,flyB_1.fq). Sequences specified with this option must
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6305 correspond file-for-file and read-for-read with those specified in <mates2>.
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6306 Reads may be a mix of different lengths. Bismark will produce one mapping result
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6307 and one report file per paired-end input file pair.
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6308
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6309 -2 <mates2> Comma-separated list of files containing the #2 mates (filename usually includes
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6310 "_2"), e.g. flyA_2.fq,flyB_2.fq). Sequences specified with this option must
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6311 correspond file-for-file and read-for-read with those specified in <mates1>.
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6312 Reads may be a mix of different lengths.
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6313
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6314 <singles> A comma- or space-separated list of files containing the reads to be aligned (e.g.
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6315 lane1.fq,lane2.fq lane3.fq). Reads may be a mix of different lengths. Bismark will
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6316 produce one mapping result and one report file per input file.
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6317
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6318
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6319 OPTIONS:
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6320
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6321
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6322 Input:
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6323
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6324 -q/--fastq The query input files (specified as <mate1>,<mate2> or <singles>) are FASTQ
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6325 files (usually having extension .fq or .fastq). This is the default. See also
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6326 --solexa-quals.
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6327
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6328 -f/--fasta The query input files (specified as <mate1>,<mate2> or <singles>) are FASTA
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6329 files (usually having extension .fa, .mfa, .fna or similar). All quality values
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6330 are assumed to be 40 on the Phred scale.
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6331
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6332 -s/--skip <int> Skip (i.e. do not align) the first <int> reads or read pairs from the input.
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6333
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6334 -u/--upto <int> Only aligns the first <int> reads or read pairs from the input. Default: no limit.
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6335
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6336 --phred33-quals FASTQ qualities are ASCII chars equal to the Phred quality plus 33. Default: on.
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6337
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6338 --phred64-quals FASTQ qualities are ASCII chars equal to the Phred quality plus 64. Default: off.
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6339
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6340 --solexa-quals Convert FASTQ qualities from solexa-scaled (which can be negative) to phred-scaled
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6341 (which can't). The formula for conversion is:
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6342 phred-qual = 10 * log(1 + 10 ** (solexa-qual/10.0)) / log(10). Used with -q. This
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6343 is usually the right option for use with (unconverted) reads emitted by the GA
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6344 Pipeline versions prior to 1.3. Works only for Bowtie 1. Default: off.
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6345
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6346 --solexa1.3-quals Same as --phred64-quals. This is usually the right option for use with (unconverted)
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6347 reads emitted by GA Pipeline version 1.3 or later. Default: off.
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6348
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6349 --path_to_bowtie The full path </../../> to the Bowtie (1 or 2) installation on your system. If not
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6350 specified it is assumed that Bowtie (1 or 2) is in the PATH.
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6351
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6352
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6353 Alignment:
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6354
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6355 -n/--seedmms <int> The maximum number of mismatches permitted in the "seed", i.e. the first L base pairs
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6356 of the read (where L is set with -l/--seedlen). This may be 0, 1, 2 or 3 and the
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6357 default is 1. This option is only available for Bowtie 1 (for Bowtie 2 see -N).
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6358
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6359 -l/--seedlen The "seed length"; i.e., the number of bases of the high quality end of the read to
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6360 which the -n ceiling applies. The default is 28. Bowtie (and thus Bismark) is faster for
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6361 larger values of -l. This option is only available for Bowtie 1 (for Bowtie 2 see -L).
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6362
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6363 -e/--maqerr <int> Maximum permitted total of quality values at all mismatched read positions throughout
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6364 the entire alignment, not just in the "seed". The default is 70. Like Maq, bowtie rounds
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6365 quality values to the nearest 10 and saturates at 30. This value is not relevant for
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6366 Bowtie 2.
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6367
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6368 --chunkmbs <int> The number of megabytes of memory a given thread is given to store path descriptors in
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6369 --best mode. Best-first search must keep track of many paths at once to ensure it is
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6370 always extending the path with the lowest cumulative cost. Bowtie tries to minimize the
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6371 memory impact of the descriptors, but they can still grow very large in some cases. If
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6372 you receive an error message saying that chunk memory has been exhausted in --best mode,
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6373 try adjusting this parameter up to dedicate more memory to the descriptors. This value
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6374 is not relevant for Bowtie 2. Default: 512.
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6375
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6376 -I/--minins <int> The minimum insert size for valid paired-end alignments. E.g. if -I 60 is specified and
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6377 a paired-end alignment consists of two 20-bp alignments in the appropriate orientation
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6378 with a 20-bp gap between them, that alignment is considered valid (as long as -X is also
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6379 satisfied). A 19-bp gap would not be valid in that case. Default: 0.
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6380
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6381 -X/--maxins <int> The maximum insert size for valid paired-end alignments. E.g. if -X 100 is specified and
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6382 a paired-end alignment consists of two 20-bp alignments in the proper orientation with a
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6383 60-bp gap between them, that alignment is considered valid (as long as -I is also satisfied).
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6384 A 61-bp gap would not be valid in that case. Default: 500.
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6385
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6386
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6387 Bowtie 1 Reporting:
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6388
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6389 -k <2> Due to the way Bismark works Bowtie will report up to 2 valid alignments. This option
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6390 will be used by default.
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6391
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6392 --best Make Bowtie guarantee that reported singleton alignments are "best" in terms of stratum
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6393 (i.e. number of mismatches, or mismatches in the seed in the case of -n mode) and in
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6394 terms of the quality; e.g. a 1-mismatch alignment where the mismatch position has Phred
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6395 quality 40 is preferred over a 2-mismatch alignment where the mismatched positions both
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6396 have Phred quality 10. When --best is not specified, Bowtie may report alignments that
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6397 are sub-optimal in terms of stratum and/or quality (though an effort is made to report
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6398 the best alignment). --best mode also removes all strand bias. Note that --best does not
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6399 affect which alignments are considered "valid" by Bowtie, only which valid alignments
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6400 are reported by Bowtie. Bowtie is about 1-2.5 times slower when --best is specified.
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6401 Default: on.
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6402
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6403 --no_best Disables the --best option which is on by default. This can speed up the alignment process,
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6404 e.g. for testing purposes, but for credible results it is not recommended to disable --best.
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6405
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6406
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6407 Output:
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6408
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6409 --non_directional The sequencing library was constructed in a non strand-specific manner, alignments to all four
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6410 bisulfite strands will be reported. Default: OFF.
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6411
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6412 (The current Illumina protocol for BS-Seq is directional, in which case the strands complementary
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6413 to the original strands are merely theoretical and should not exist in reality. Specifying directional
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6414 alignments (which is the default) will only run 2 alignment threads to the original top (OT)
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6415 or bottom (OB) strands in parallel and report these alignments. This is the recommended option
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6416 for strand-specific libraries).
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6417
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6418 --sam-no-hd Suppress SAM header lines (starting with @). This might be useful when very large input files are
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6419 split up into several smaller files to run concurrently and the output files are to be merged.
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6420
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6421 --quiet Print nothing besides alignments.
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6422
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6423 --vanilla Performs bisulfite mapping with Bowtie 1 and prints the 'old' output (as in Bismark 0.5.X) instead
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6424 of SAM format output.
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6425
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6426 -un/--unmapped Write all reads that could not be aligned to a file in the output directory. Written reads will
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6427 appear as they did in the input, without any translation of quality values that may have
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6428 taken place within Bowtie or Bismark. Paired-end reads will be written to two parallel files with _1
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6429 and _2 inserted in their filenames, i.e. _unmapped_reads_1.txt and _unmapped_reads_2.txt. Reads
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6430 with more than one valid alignment with the same number of lowest mismatches (ambiguous mapping)
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6431 are also written to _unmapped_reads.txt unless the option --ambiguous is specified as well.
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6432
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6433 --ambiguous Write all reads which produce more than one valid alignment with the same number of lowest
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6434 mismatches or other reads that fail to align uniquely to a file in the output directory.
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6435 Written reads will appear as they did in the input, without any of the translation of quality
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6436 values that may have taken place within Bowtie or Bismark. Paired-end reads will be written to two
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6437 parallel files with _1 and _2 inserted in their filenames, i.e. _ambiguous_reads_1.txt and
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6438 _ambiguous_reads_2.txt. These reads are not written to the file specified with --un.
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6439
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6440 -o/--output_dir <dir> Write all output files into this directory. By default the output files will be written into
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6441 the same folder as the input file(s). If the specified folder does not exist, Bismark will attempt
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6442 to create it first. The path to the output folder can be either relative or absolute.
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6443
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6444 --temp_dir <dir> Write temporary files to this directory instead of into the same directory as the input files. If
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6445 the specified folder does not exist, Bismark will attempt to create it first. The path to the
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6446 temporary folder can be either relative or absolute.
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6447
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6448
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6449
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6450 Other:
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6451
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6452 -h/--help Displays this help file.
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6453
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6454 -v/--version Displays version information.
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6455
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6456
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6457 BOWTIE 2 SPECIFIC OPTIONS
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6458
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6459 --bowtie2 Uses Bowtie 2 instead of Bowtie 1. Bismark limits Bowtie 2 to only perform end-to-end
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6460 alignments, i.e. searches for alignments involving all read characters (also called
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6461 untrimmed or unclipped alignments). Bismark assumes that raw sequence data is adapter
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6462 and/or quality trimmed where appropriate. Default: off.
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6463
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6464 Bowtie 2 alignment options:
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6465
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6466 -N <int> Sets the number of mismatches allowed in a seed alignment during multiseed alignment.
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6467 Can be set to 0 or 1. Setting this higher makes alignment slower (often much slower)
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6468 but increases sensitivity. Default: 0. This option is only available for Bowtie 2 (for
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6469 Bowtie 1 see -n).
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6470
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6471 -L <int> Sets the length of the seed substrings to align during multiseed alignment. Smaller values
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6472 make alignment slower but more sensitive. Default: the --sensitive preset of Bowtie 2 is
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6473 used by default, which sets -L to 20. This option is only available for Bowtie 2 (for
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6474 Bowtie 1 see -l).
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6475
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6476 --ignore-quals When calculating a mismatch penalty, always consider the quality value at the mismatched
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6477 position to be the highest possible, regardless of the actual value. I.e. input is treated
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6478 as though all quality values are high. This is also the default behavior when the input
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6479 doesn't specify quality values (e.g. in -f mode). This option is invariable and on by default.
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6480
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6481
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6482 Bowtie 2 paired-end options:
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6483
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6484 --no-mixed This option disables Bowtie 2's behavior to try to find alignments for the individual mates if
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6485 it cannot find a concordant or discordant alignment for a pair. This option is invariable and
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6486 on by default.
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6487
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6488 --no-discordant Normally, Bowtie 2 looks for discordant alignments if it cannot find any concordant alignments.
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6489 A discordant alignment is an alignment where both mates align uniquely, but that does not
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6490 satisfy the paired-end constraints (--fr/--rf/--ff, -I, -X). This option disables that behavior
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6491 and it is on by default.
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6492
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6493
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6494 Bowtie 2 effort options:
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6495
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6496 -D <int> Up to <int> consecutive seed extension attempts can "fail" before Bowtie 2 moves on, using
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6497 the alignments found so far. A seed extension "fails" if it does not yield a new best or a
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6498 new second-best alignment. Default: 15.
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6499
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6500 -R <int> <int> is the maximum number of times Bowtie 2 will "re-seed" reads with repetitive seeds.
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6501 When "re-seeding," Bowtie 2 simply chooses a new set of seeds (same length, same number of
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6502 mismatches allowed) at different offsets and searches for more alignments. A read is considered
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6503 to have repetitive seeds if the total number of seed hits divided by the number of seeds
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6504 that aligned at least once is greater than 300. Default: 2.
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6505
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6506 Bowtie 2 parallelization options:
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6507
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6508
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6509 -p NTHREADS Launch NTHREADS parallel search threads (default: 1). Threads will run on separate processors/cores
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6510 and synchronize when parsing reads and outputting alignments. Searching for alignments is highly
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6511 parallel, and speedup is close to linear. Increasing -p increases Bowtie 2's memory footprint.
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6512 E.g. when aligning to a human genome index, increasing -p from 1 to 8 increases the memory footprint
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6513 by a few hundred megabytes. This option is only available if bowtie is linked with the pthreads
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6514 library (i.e. if BOWTIE_PTHREADS=0 is not specified at build time). In addition, this option will
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6515 automatically use the option '--reorder', which guarantees that output SAM records are printed in
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6516 an order corresponding to the order of the reads in the original input file, even when -p is set
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6517 greater than 1 (Bismark requires the Bowtie 2 output to be this way). Specifying --reorder and
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6518 setting -p greater than 1 causes Bowtie 2 to run somewhat slower and use somewhat more memory than
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6519 if --reorder were not specified. Has no effect if -p is set to 1, since output order will naturally
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6520 correspond to input order in that case.
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6521
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6522 Bowtie 2 Scoring options:
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6523
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6524 --score_min <func> Sets a function governing the minimum alignment score needed for an alignment to be considered
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6525 "valid" (i.e. good enough to report). This is a function of read length. For instance, specifying
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6526 L,0,-0.2 sets the minimum-score function f to f(x) = 0 + -0.2 * x, where x is the read length.
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6527 See also: setting function options at http://bowtie-bio.sourceforge.net/bowtie2. The default is
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6528 L,0,-0.2.
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6529
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6530 --rdg <int1>,<int2> Sets the read gap open (<int1>) and extend (<int2>) penalties. A read gap of length N gets a penalty
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6531 of <int1> + N * <int2>. Default: 5, 3.
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6532
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6533 --rfg <int1>,<int2> Sets the reference gap open (<int1>) and extend (<int2>) penalties. A reference gap of length N gets
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6534 a penalty of <int1> + N * <int2>. Default: 5, 3.
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6535
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6536
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6537 Bowtie 2 Reporting options:
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6538
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6539 -most_valid_alignments <int> This used to be the Bowtie 2 parameter -M. As of Bowtie 2 version 2.0.0 beta7 the option -M is
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6540 deprecated. It will be removed in subsequent versions. What used to be called -M mode is still the
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6541 default mode, but adjusting the -M setting is deprecated. Use the -D and -R options to adjust the
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6542 effort expended to find valid alignments.
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6543
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6544 For reference, this used to be the old (now deprecated) description of -M:
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6545 Bowtie 2 searches for at most <int>+1 distinct, valid alignments for each read. The search terminates when it
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6546 can't find more distinct valid alignments, or when it finds <int>+1 distinct alignments, whichever
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6547 happens first. Only the best alignment is reported. Information from the other alignments is used to
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6548 estimate mapping quality and to set SAM optional fields, such as AS:i and XS:i. Increasing -M makes
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6549 Bowtie 2 slower, but increases the likelihood that it will pick the correct alignment for a read that
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6550 aligns many places. For reads that have more than <int>+1 distinct, valid alignments, Bowtie 2 does not
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6551 guarantee that the alignment reported is the best possible in terms of alignment score. -M is
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6552 always used and its default value is set to 10.
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6553
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6554
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6555 'VANILLA' Bismark OUTPUT:
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6556
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6557 Single-end output format (tab-separated):
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6558
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6559 (1) <seq-ID>
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6560 (2) <read alignment strand>
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6561 (3) <chromosome>
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6562 (4) <start position>
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6563 (5) <end position>
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6564 (6) <observed bisulfite sequence>
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6565 (7) <equivalent genomic sequence>
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6566 (8) <methylation call>
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6567 (9) <read conversion>
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6568 (10) <genome conversion>
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6569 (11) <read quality score (Phred33)>
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6570
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6571
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6572 Paired-end output format (tab-separated):
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6573 (1) <seq-ID>
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6574 (2) <read 1 alignment strand>
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6575 (3) <chromosome>
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6576 (4) <start position>
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6577 (5) <end position>
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6578 (6) <observed bisulfite sequence 1>
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6579 (7) <equivalent genomic sequence 1>
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6580 (8) <methylation call 1>
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6581 (9) <observed bisulfite sequence 2>
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6582 (10) <equivalent genomic sequence 2>
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6583 (11) <methylation call 2>
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6584 (12) <read 1 conversion>
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6585 (13) <genome conversion>
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6586 (14) <read 1 quality score (Phred33)>
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6587 (15) <read 2 quality score (Phred33)>
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6588
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6589
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6590 Bismark SAM OUTPUT (default):
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6591
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6592 (1) QNAME (seq-ID)
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6593 (2) FLAG (this flag tries to take the strand a bisulfite read originated from into account (this is different from ordinary DNA alignment flags!))
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6594 (3) RNAME (chromosome)
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6595 (4) POS (start position)
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6596 (5) MAPQ (always 255)
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6597 (6) CIGAR
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6598 (7) RNEXT
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6599 (8) PNEXT
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6600 (9) TLEN
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6601 (10) SEQ
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6602 (11) QUAL (Phred33 scale)
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6603 (12) NM-tag (edit distance to the reference)
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6604 (13) XX-tag (base-by-base mismatches to the reference. This does not include indels)
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6605 (14) XM-tag (methylation call string)
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6606 (15) XR-tag (read conversion state for the alignment)
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6607 (16) XG-tag (genome conversion state for the alignment)
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6608
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6609 Each read of paired-end alignments is written out in a separate line in the above format.
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6610
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6611
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6612 This script was last edited on 21 Aug 2012.
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6613
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6614 HOW_TO
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6615 }