annotate bismark @ 3:91f07ff056ca draft

Uploaded
author bgruening
date Mon, 14 Apr 2014 16:43:14 -0400
parents 62c6da72dd4a
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1 #!/usr/bin/perl --
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2 use strict;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3 use warnings;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
4 use IO::Handle;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5 use Cwd;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6 $|++;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7 use Getopt::Long;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
8
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
9
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
10 ## This program is Copyright (C) 2010-13, Felix Krueger (felix.krueger@babraham.ac.uk)
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
11
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
12 ## This program is free software: you can redistribute it and/or modify
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
13 ## it under the terms of the GNU General Public License as published by
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
14 ## the Free Software Foundation, either version 3 of the License, or
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
15 ## (at your option) any later version.
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
16
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
17 ## This program is distributed in the hope that it will be useful,
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
18 ## but WITHOUT ANY WARRANTY; without even the implied warranty of
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
19 ## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
20 ## GNU General Public License for more details.
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
21
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
22 ## You should have received a copy of the GNU General Public License
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
23 ## along with this program. If not, see <http://www.gnu.org/licenses/>.
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
24
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
25
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
26 my $parent_dir = getcwd;
3
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
27 my $bismark_version = 'v0.10.0';
0
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
28 my $command_line = join (" ",@ARGV);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
29
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
30 ### before processing the command line we will replace --solexa1.3-quals with --phred64-quals as the '.' in the option name will cause Getopt::Long to fail
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
31 foreach my $arg (@ARGV){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
32 if ($arg eq '--solexa1.3-quals'){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
33 $arg = '--phred64-quals';
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
34 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
35 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
36 my @filenames; # will be populated by processing the command line
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
37
3
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
38 my ($genome_folder,$CT_index_basename,$GA_index_basename,$path_to_bowtie,$sequence_file_format,$bowtie_options,$directional,$unmapped,$ambiguous,$phred64,$solexa,$output_dir,$bowtie2,$vanilla,$sam_no_hd,$skip,$upto,$temp_dir,$non_bs_mm,$insertion_open,$insertion_extend,$deletion_open,$deletion_extend,$gzip,$bam,$samtools_path,$pbat,$prefix,$old_flag) = process_command_line();
0
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
39
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
40 my @fhs; # stores alignment process names, bisulfite index location, bowtie filehandles and the number of times sequences produced an alignment
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
41 my %chromosomes; # stores the chromosome sequences of the genome
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
42 my %counting; # counting various events
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
43
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
44 my $seqID_contains_tabs;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
45
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
46 foreach my $filename (@filenames){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
47
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
48 chdir $parent_dir or die "Unable to move to initial working directory $!\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
49 ### resetting the counting hash and fhs
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
50 reset_counters_and_fhs($filename);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
51 $seqID_contains_tabs = 0;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
52
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
53 ### PAIRED-END ALIGNMENTS
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
54 if ($filename =~ ','){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
55 my ($C_to_T_infile_1,$G_to_A_infile_1); # to be made from mate1 file
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
56
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
57 $fhs[0]->{name} = 'CTread1GAread2CTgenome';
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
58 $fhs[1]->{name} = 'GAread1CTread2GAgenome';
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
59 $fhs[2]->{name} = 'GAread1CTread2CTgenome';
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
60 $fhs[3]->{name} = 'CTread1GAread2GAgenome';
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
61
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
62 warn "\nPaired-end alignments will be performed\n",'='x39,"\n\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
63
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
64 my ($filename_1,$filename_2) = (split (/,/,$filename));
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
65 warn "The provided filenames for paired-end alignments are $filename_1 and $filename_2\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
66
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
67 ### additional variables only for paired-end alignments
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
68 my ($C_to_T_infile_2,$G_to_A_infile_2); # to be made from mate2 file
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
69
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
70 ### FastA format
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
71 if ($sequence_file_format eq 'FASTA'){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
72 warn "Input files are in FastA format\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
73
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
74 if ($directional){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
75 ($C_to_T_infile_1) = biTransformFastAFiles_paired_end ($filename_1,1); # also passing the read number
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
76 ($G_to_A_infile_2) = biTransformFastAFiles_paired_end ($filename_2,2);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
77
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
78 $fhs[0]->{inputfile_1} = $C_to_T_infile_1;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
79 $fhs[0]->{inputfile_2} = $G_to_A_infile_2;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
80 $fhs[1]->{inputfile_1} = undef;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
81 $fhs[1]->{inputfile_2} = undef;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
82 $fhs[2]->{inputfile_1} = undef;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
83 $fhs[2]->{inputfile_2} = undef;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
84 $fhs[3]->{inputfile_1} = $C_to_T_infile_1;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
85 $fhs[3]->{inputfile_2} = $G_to_A_infile_2;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
86 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
87 else{
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
88 ($C_to_T_infile_1,$G_to_A_infile_1) = biTransformFastAFiles_paired_end ($filename_1,1); # also passing the read number
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
89 ($C_to_T_infile_2,$G_to_A_infile_2) = biTransformFastAFiles_paired_end ($filename_2,2);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
90
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
91 $fhs[0]->{inputfile_1} = $C_to_T_infile_1;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
92 $fhs[0]->{inputfile_2} = $G_to_A_infile_2;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
93 $fhs[1]->{inputfile_1} = $G_to_A_infile_1;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
94 $fhs[1]->{inputfile_2} = $C_to_T_infile_2;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
95 $fhs[2]->{inputfile_1} = $G_to_A_infile_1;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
96 $fhs[2]->{inputfile_2} = $C_to_T_infile_2;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
97 $fhs[3]->{inputfile_1} = $C_to_T_infile_1;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
98 $fhs[3]->{inputfile_2} = $G_to_A_infile_2;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
99 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
100
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
101 if ($bowtie2){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
102 paired_end_align_fragments_to_bisulfite_genome_fastA_bowtie2 ($C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
103 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
104 else{
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
105 paired_end_align_fragments_to_bisulfite_genome_fastA ($C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
106 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
107 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
108
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
109 ### FastQ format
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
110 else{
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
111 warn "Input files are in FastQ format\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
112 if ($directional){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
113 if ($bowtie2){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
114 ($C_to_T_infile_1) = biTransformFastQFiles_paired_end ($filename_1,1); # also passing the read number
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
115 ($G_to_A_infile_2) = biTransformFastQFiles_paired_end ($filename_2,2);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
116
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
117 $fhs[0]->{inputfile_1} = $C_to_T_infile_1;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
118 $fhs[0]->{inputfile_2} = $G_to_A_infile_2;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
119 $fhs[1]->{inputfile_1} = undef;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
120 $fhs[1]->{inputfile_2} = undef;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
121 $fhs[2]->{inputfile_1} = undef;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
122 $fhs[2]->{inputfile_2} = undef;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
123 $fhs[3]->{inputfile_1} = $C_to_T_infile_1;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
124 $fhs[3]->{inputfile_2} = $G_to_A_infile_2;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
125 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
126 else{ # Bowtie 1 alignments
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
127 if ($gzip){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
128 ($C_to_T_infile_1) = biTransformFastQFiles_paired_end_bowtie1_gzip ($filename_1,$filename_2); # passing both reads at the same time
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
129
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
130 $fhs[0]->{inputfile_1} = $C_to_T_infile_1; # this file contains both read 1 and read 2 in tab delimited format
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
131 $fhs[0]->{inputfile_2} = undef; # no longer needed
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
132 $fhs[1]->{inputfile_1} = undef;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
133 $fhs[1]->{inputfile_2} = undef;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
134 $fhs[2]->{inputfile_1} = undef;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
135 $fhs[2]->{inputfile_2} = undef;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
136 $fhs[3]->{inputfile_1} = $C_to_T_infile_1; # this file contains both read 1 and read 2 in tab delimited format
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
137 $fhs[3]->{inputfile_2} = undef; # no longer needed
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
138 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
139 else{
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
140 ($C_to_T_infile_1) = biTransformFastQFiles_paired_end ($filename_1,1); # also passing the read number
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
141 ($G_to_A_infile_2) = biTransformFastQFiles_paired_end ($filename_2,2);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
142
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
143 $fhs[0]->{inputfile_1} = $C_to_T_infile_1;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
144 $fhs[0]->{inputfile_2} = $G_to_A_infile_2;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
145 $fhs[1]->{inputfile_1} = undef;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
146 $fhs[1]->{inputfile_2} = undef;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
147 $fhs[2]->{inputfile_1} = undef;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
148 $fhs[2]->{inputfile_2} = undef;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
149 $fhs[3]->{inputfile_1} = $C_to_T_infile_1;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
150 $fhs[3]->{inputfile_2} = $G_to_A_infile_2;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
151 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
152 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
153 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
154 elsif($pbat){ # PBAT-Seq
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
155 ### At the moment we are only performing uncompressed FastQ alignments with Bowtie1
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
156 ($C_to_T_infile_1,$G_to_A_infile_1) = biTransformFastQFiles_paired_end ($filename_1,1); # also passing the read number
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
157 ($C_to_T_infile_2,$G_to_A_infile_2) = biTransformFastQFiles_paired_end ($filename_2,2);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
158
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
159 $fhs[0]->{inputfile_1} = undef;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
160 $fhs[0]->{inputfile_2} = undef;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
161 $fhs[1]->{inputfile_1} = $G_to_A_infile_1;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
162 $fhs[1]->{inputfile_2} = $C_to_T_infile_2;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
163 $fhs[2]->{inputfile_1} = $G_to_A_infile_1;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
164 $fhs[2]->{inputfile_2} = $C_to_T_infile_2;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
165 $fhs[3]->{inputfile_1} = undef;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
166 $fhs[3]->{inputfile_2} = undef;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
167 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
168 else{
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
169 if ($bowtie2){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
170 ($C_to_T_infile_1,$G_to_A_infile_1) = biTransformFastQFiles_paired_end ($filename_1,1); # also passing the read number
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
171 ($C_to_T_infile_2,$G_to_A_infile_2) = biTransformFastQFiles_paired_end ($filename_2,2);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
172
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
173 $fhs[0]->{inputfile_1} = $C_to_T_infile_1;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
174 $fhs[0]->{inputfile_2} = $G_to_A_infile_2;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
175 $fhs[1]->{inputfile_1} = $G_to_A_infile_1;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
176 $fhs[1]->{inputfile_2} = $C_to_T_infile_2;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
177 $fhs[2]->{inputfile_1} = $G_to_A_infile_1;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
178 $fhs[2]->{inputfile_2} = $C_to_T_infile_2;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
179 $fhs[3]->{inputfile_1} = $C_to_T_infile_1;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
180 $fhs[3]->{inputfile_2} = $G_to_A_infile_2;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
181 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
182 else{ # Bowtie 1 alignments
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
183 if ($gzip){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
184 ($C_to_T_infile_1,$G_to_A_infile_1) = biTransformFastQFiles_paired_end_bowtie1_gzip ($filename_1,$filename_2); # passing both reads at the same time
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
185
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
186 $fhs[0]->{inputfile_1} = $C_to_T_infile_1;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
187 $fhs[0]->{inputfile_2} = undef; # not needed for compressed temp files
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
188 $fhs[1]->{inputfile_1} = $G_to_A_infile_1;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
189 $fhs[1]->{inputfile_2} = undef;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
190 $fhs[2]->{inputfile_1} = $G_to_A_infile_1;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
191 $fhs[2]->{inputfile_2} = undef;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
192 $fhs[3]->{inputfile_1} = $C_to_T_infile_1;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
193 $fhs[3]->{inputfile_2} = undef; # not needed for compressed temp files
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
194 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
195 else{ #uncompressed temp files
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
196 ($C_to_T_infile_1,$G_to_A_infile_1) = biTransformFastQFiles_paired_end ($filename_1,1); # also passing the read number
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
197 ($C_to_T_infile_2,$G_to_A_infile_2) = biTransformFastQFiles_paired_end ($filename_2,2);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
198
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
199 $fhs[0]->{inputfile_1} = $C_to_T_infile_1;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
200 $fhs[0]->{inputfile_2} = $G_to_A_infile_2;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
201 $fhs[1]->{inputfile_1} = $G_to_A_infile_1;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
202 $fhs[1]->{inputfile_2} = $C_to_T_infile_2;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
203 $fhs[2]->{inputfile_1} = $G_to_A_infile_1;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
204 $fhs[2]->{inputfile_2} = $C_to_T_infile_2;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
205 $fhs[3]->{inputfile_1} = $C_to_T_infile_1;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
206 $fhs[3]->{inputfile_2} = $G_to_A_infile_2;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
207 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
208 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
209 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
210 if ($bowtie2){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
211 paired_end_align_fragments_to_bisulfite_genome_fastQ_bowtie2 ($C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
212 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
213 else{
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
214 paired_end_align_fragments_to_bisulfite_genome_fastQ ($C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
215 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
216 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
217 start_methylation_call_procedure_paired_ends($filename_1,$filename_2,$C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
218 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
219
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
220 ### Else we are performing SINGLE-END ALIGNMENTS
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
221 else{
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
222 warn "\nSingle-end alignments will be performed\n",'='x39,"\n\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
223 ### Initialising bisulfite conversion filenames
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
224 my ($C_to_T_infile,$G_to_A_infile);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
225
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
226
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
227 ### FastA format
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
228 if ($sequence_file_format eq 'FASTA'){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
229 warn "Inut file is in FastA format\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
230 if ($directional){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
231 ($C_to_T_infile) = biTransformFastAFiles ($filename);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
232 $fhs[0]->{inputfile} = $fhs[1]->{inputfile} = $C_to_T_infile;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
233 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
234 else{
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
235 ($C_to_T_infile,$G_to_A_infile) = biTransformFastAFiles ($filename);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
236 $fhs[0]->{inputfile} = $fhs[1]->{inputfile} = $C_to_T_infile;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
237 $fhs[2]->{inputfile} = $fhs[3]->{inputfile} = $G_to_A_infile;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
238 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
239
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
240 ### Creating up to 4 different bowtie filehandles and storing the first entry
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
241 if ($bowtie2){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
242 single_end_align_fragments_to_bisulfite_genome_fastA_bowtie2 ($C_to_T_infile,$G_to_A_infile);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
243 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
244 else{
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
245 single_end_align_fragments_to_bisulfite_genome_fastA ($C_to_T_infile,$G_to_A_infile);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
246 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
247 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
248
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
249 ## FastQ format
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
250 else{
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
251 warn "Input file is in FastQ format\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
252 if ($directional){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
253 ($C_to_T_infile) = biTransformFastQFiles ($filename);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
254 $fhs[0]->{inputfile} = $fhs[1]->{inputfile} = $C_to_T_infile;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
255 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
256 elsif($pbat){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
257 ($G_to_A_infile) = biTransformFastQFiles ($filename);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
258 $fhs[0]->{inputfile} = $fhs[1]->{inputfile} = $G_to_A_infile; # PBAT-Seq only uses the G to A converted files
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
259 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
260 else{
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
261 ($C_to_T_infile,$G_to_A_infile) = biTransformFastQFiles ($filename);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
262 $fhs[0]->{inputfile} = $fhs[1]->{inputfile} = $C_to_T_infile;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
263 $fhs[2]->{inputfile} = $fhs[3]->{inputfile} = $G_to_A_infile;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
264 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
265
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
266 ### Creating up to 4 different bowtie filehandles and storing the first entry
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
267 if ($bowtie2){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
268 single_end_align_fragments_to_bisulfite_genome_fastQ_bowtie2 ($C_to_T_infile,$G_to_A_infile);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
269 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
270 elsif ($pbat){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
271 single_end_align_fragments_to_bisulfite_genome_fastQ (undef,$G_to_A_infile);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
272 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
273 else{
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
274 single_end_align_fragments_to_bisulfite_genome_fastQ ($C_to_T_infile,$G_to_A_infile);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
275 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
276 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
277
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
278 start_methylation_call_procedure_single_ends($filename,$C_to_T_infile,$G_to_A_infile);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
279
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
280 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
281 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
282
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
283 sub start_methylation_call_procedure_single_ends {
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
284 my ($sequence_file,$C_to_T_infile,$G_to_A_infile) = @_;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
285 my ($dir,$filename);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
286
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
287 if ($sequence_file =~ /\//){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
288 ($dir,$filename) = $sequence_file =~ m/(.*\/)(.*)$/;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
289 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
290 else{
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
291 $filename = $sequence_file;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
292 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
293
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
294 ### printing all alignments to a results file
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
295 my $outfile = $filename;
3
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
296 if ($prefix){
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
297 $outfile = "$prefix.$outfile";
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
298 }
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
299
0
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
300
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
301 if ($bowtie2){ # SAM format is the default for Bowtie 2
3
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
302 $outfile =~ s/$/_bismark_bt2.sam/;
0
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
303 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
304 elsif ($vanilla){ # vanilla custom Bismark single-end output (like Bismark versions 0.5.X)
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
305 $outfile =~ s/$/_bismark.txt/;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
306 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
307 else{ # SAM is the default output
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
308 $outfile =~ s/$/_bismark.sam/;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
309 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
310
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
311 $bam = 0 unless (defined $bam);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
312
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
313 if ($bam == 1){ ### Samtools is installed, writing out BAM directly
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
314 $outfile =~ s/sam/bam/;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
315 open (OUT,"| $samtools_path view -bSh 2>/dev/null - > $output_dir$outfile") or die "Failed to write to $outfile: $!\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
316 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
317 elsif($bam == 2){ ### no Samtools found on system. Using GZIP compression instead
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
318 $outfile .= '.gz';
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
319 open (OUT,"| gzip -c - > $output_dir$outfile") or die "Failed to write to $outfile: $!\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
320 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
321 else{ # uncompressed output, default
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
322 open (OUT,'>',"$output_dir$outfile") or die "Failed to write to $outfile: $!\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
323 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
324
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
325 warn "\n>>> Writing bisulfite mapping results to $output_dir$outfile <<<\n\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
326 sleep(1);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
327
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
328 if ($vanilla){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
329 print OUT "Bismark version: $bismark_version\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
330 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
331
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
332 ### printing alignment and methylation call summary to a report file
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
333 my $reportfile = $filename;
3
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
334 if ($prefix){
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
335 $reportfile = "$prefix.$reportfile";
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
336 }
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
337
0
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
338 if ($bowtie2){
3
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
339 $reportfile =~ s/$/_bismark_bt2_SE_report.txt/;
0
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
340 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
341 else{
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
342 $reportfile =~ s/$/_bismark_SE_report.txt/;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
343 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
344
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
345 open (REPORT,'>',"$output_dir$reportfile") or die "Failed to write to $reportfile: $!\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
346 print REPORT "Bismark report for: $sequence_file (version: $bismark_version)\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
347
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
348 if ($unmapped){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
349 my $unmapped_file = $filename;
3
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
350 if ($prefix){
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
351 $unmapped_file = "$prefix.$unmapped_file";
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
352 }
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
353
0
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
354 $unmapped_file =~ s/$/_unmapped_reads.txt/;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
355 open (UNMAPPED,'>',"$output_dir$unmapped_file") or die "Failed to write to $unmapped_file: $!\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
356 print "Unmapped sequences will be written to $output_dir$unmapped_file\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
357 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
358 if ($ambiguous){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
359 my $ambiguous_file = $filename;
3
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
360 if ($prefix){
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
361 $ambiguous_file = "$prefix.$ambiguous_file";
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
362 }
0
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
363 $ambiguous_file =~ s/$/_ambiguous_reads.txt/;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
364 open (AMBIG,'>',"$output_dir$ambiguous_file") or die "Failed to write to $ambiguous_file: $!\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
365 print "Ambiguously mapping sequences will be written to $output_dir$ambiguous_file\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
366 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
367
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
368 if ($directional){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
369 print REPORT "Option '--directional' specified: alignments to complementary strands will be ignored (i.e. not performed!)\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
370 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
371 print REPORT "Bowtie was run against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
372
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
373
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
374 ### if 2 or more files are provided we can hold the genome in memory and don't need to read it in a second time
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
375 unless (%chromosomes){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
376 my $cwd = getcwd; # storing the path of the current working directory
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
377 print "Current working directory is: $cwd\n\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
378 read_genome_into_memory($cwd);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
379 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
380
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
381 unless ($vanilla or $sam_no_hd){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
382 generate_SAM_header();
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
383 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
384
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
385 ### Input file is in FastA format
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
386 if ($sequence_file_format eq 'FASTA'){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
387 process_single_end_fastA_file_for_methylation_call($sequence_file,$C_to_T_infile,$G_to_A_infile);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
388 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
389 ### Input file is in FastQ format
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
390 else{
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
391 process_single_end_fastQ_file_for_methylation_call($sequence_file,$C_to_T_infile,$G_to_A_infile);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
392 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
393 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
394
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
### Prepares all output files for a paired-end run and then hands over to the
### FastA- or FastQ-specific paired-end methylation call routine.
###
### Arguments (all passed through unchanged to the processing routine):
###   $sequence_file_1, $sequence_file_2   - the two read files (may contain a directory part)
###   $C_to_T_infile_1, $G_to_A_infile_1   - converted input files belonging to read 1
###   $C_to_T_infile_2, $G_to_A_infile_2   - converted input files belonging to read 2
###
### Side effects:
###   - opens the global filehandles OUT and REPORT, and (if requested via $unmapped / $ambiguous)
###     UNMAPPED_1, UNMAPPED_2, AMBIG_1 and AMBIG_2; they are left open on purpose
###     (presumably for the downstream routines to write to -- confirm against the callees)
###   - sets $bam to 0 if it was undefined
###   - reads the genome into memory if %chromosomes is still empty
###   - dies if any of the output files cannot be opened
### All output file names are derived from the file name of read 1 (read 2 for the *_2 files),
### optionally prepended with "$prefix.", and are created inside $output_dir.
sub start_methylation_call_procedure_paired_ends {
  my ($sequence_file_1,$sequence_file_2,$C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2) = @_;

  ### only the file name itself (everything after the last '/') is used to build output file names
  my $filename_1 = $sequence_file_1;
  if ($sequence_file_1 =~ m/.*\/(.*)$/){
    $filename_1 = $1;
  }

  my $filename_2 = $sequence_file_2;
  if ($sequence_file_2 =~ m/.*\/(.*)$/){
    $filename_2 = $1;
  }

  ### printing all alignments to a results file
  my $outfile = $filename_1;

  if ($prefix){
    $outfile = "$prefix.$outfile";
  }

  if ($bowtie2){ # SAM format is the default Bowtie 2 output
    $outfile .= '_bismark_bt2_pe.sam';
  }
  elsif ($vanilla){ # vanilla custom Bismark paired-end output (like Bismark versions 0.5.X)
    $outfile .= '_bismark_pe.txt';
  }
  else{ # SAM format is the default Bowtie 1 output
    $outfile .= '_bismark_pe.sam';
  }

  $bam = 0 unless (defined $bam);

  if ($bam == 1){ ### Samtools is installed, writing out BAM directly
    ### Only the trailing extension must change. An unanchored s/sam/bam/ would instead hit the
    ### first 'sam' anywhere in the name (e.g. 'sample_1.fq...' -> 'bample_1.fq...') and leave '.sam' in place
    $outfile =~ s/sam$/bam/;
    open (OUT,"| $samtools_path view -bSh 2>/dev/null - > $output_dir$outfile") or die "Failed to write to $outfile: $!\n";
  }
  elsif($bam == 2){ ### no Samtools found on system. Using GZIP compression instead
    $outfile .= '.gz';
    open (OUT,"| gzip -c - > $output_dir$outfile") or die "Failed to write to $outfile: $!\n";
  }
  else{ # uncompressed output, default
    open (OUT,'>',"$output_dir$outfile") or die "Failed to write to $outfile: $!\n";
  }

  ### reporting the full output location, just like the single-end procedure does
  warn "\n>>> Writing bisulfite mapping results to $output_dir$outfile <<<\n\n";
  sleep(1);

  if ($vanilla){
    print OUT "Bismark version: $bismark_version\n";
  }

  ### printing alignment and methylation call summary to a report file
  my $reportfile = $filename_1;
  if ($prefix){
    $reportfile = "$prefix.$reportfile";
  }

  if ($bowtie2){
    $reportfile .= '_bismark_bt2_PE_report.txt';
  }
  else{
    $reportfile .= '_bismark_PE_report.txt';
  }

  open (REPORT,'>',"$output_dir$reportfile") or die "Failed to write to $reportfile: $!\n";
  print REPORT "Bismark report for: $sequence_file_1 and $sequence_file_2 (version: $bismark_version)\n";
  print REPORT "Bowtie was run against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n";

  ### Unmapped read output
  if ($unmapped){
    my $unmapped_1 = $filename_1;
    my $unmapped_2 = $filename_2;
    if ($prefix){
      $unmapped_1 = "$prefix.$unmapped_1";
      $unmapped_2 = "$prefix.$unmapped_2";
    }
    $unmapped_1 .= '_unmapped_reads_1.txt';
    $unmapped_2 .= '_unmapped_reads_2.txt';
    open (UNMAPPED_1,'>',"$output_dir$unmapped_1") or die "Failed to write to $unmapped_1: $!\n";
    open (UNMAPPED_2,'>',"$output_dir$unmapped_2") or die "Failed to write to $unmapped_2: $!\n";
    print "Unmapped sequences will be written to $output_dir$unmapped_1 and $output_dir$unmapped_2\n";
  }

  ### Ambiguously mapping read output
  if ($ambiguous){
    my $amb_1 = $filename_1;
    my $amb_2 = $filename_2;
    if ($prefix){
      $amb_1 = "$prefix.$amb_1";
      $amb_2 = "$prefix.$amb_2";
    }
    $amb_1 .= '_ambiguous_reads_1.txt';
    $amb_2 .= '_ambiguous_reads_2.txt';
    open (AMBIG_1,'>',"$output_dir$amb_1") or die "Failed to write to $amb_1: $!\n";
    open (AMBIG_2,'>',"$output_dir$amb_2") or die "Failed to write to $amb_2: $!\n";
    print "Ambiguously mapping sequences will be written to $output_dir$amb_1 and $output_dir$amb_2\n";
  }

  if ($directional){
    print REPORT "Option '--directional' specified: alignments to complementary strands will be ignored (i.e. not performed)\n";
  }

  ### if 2 or more files are provided we might still hold the genome in memory and don't need to read it in a second time
  unless (%chromosomes){
    my $cwd = getcwd(); # storing the path of the current working directory
    print "Current working directory is: $cwd\n\n";
    read_genome_into_memory($cwd);
  }

  ### the SAM header is skipped for the vanilla output format or when explicitly disabled
  unless ($vanilla or $sam_no_hd){
    generate_SAM_header();
  }

  ### Input files are in FastA format
  if ($sequence_file_format eq 'FASTA'){
    process_fastA_files_for_paired_end_methylation_calls($sequence_file_1,$sequence_file_2,$C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2);
  }
  ### Input files are in FastQ format
  else{
    process_fastQ_files_for_paired_end_methylation_calls($sequence_file_1,$sequence_file_2,$C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2);
  }
}
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
526
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
527 sub print_final_analysis_report_single_end{
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
528 my ($C_to_T_infile,$G_to_A_infile) = @_;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
529 ### All sequences from the original sequence file have been analysed now
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
530 ### deleting temporary C->T or G->A infiles
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
531
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
532 if ($directional){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
533 my $deletion_successful = unlink "$temp_dir$C_to_T_infile";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
534 if ($deletion_successful == 1){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
535 warn "\nSuccessfully deleted the temporary file $temp_dir$C_to_T_infile\n\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
536 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
537 else{
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
538 warn "Could not delete temporary file $C_to_T_infile properly $!\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
539 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
540 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
541 elsif ($pbat){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
542 my $deletion_successful = unlink "$temp_dir$G_to_A_infile";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
543 if ($deletion_successful == 1){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
544 warn "\nSuccessfully deleted the temporary file $temp_dir$G_to_A_infile\n\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
545 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
546 else{
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
547 warn "Could not delete temporary file $G_to_A_infile properly $!\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
548 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
549 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
550 else{
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
551 my $deletion_successful = unlink "$temp_dir$C_to_T_infile","$temp_dir$G_to_A_infile";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
552 if ($deletion_successful == 2){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
553 warn "\nSuccessfully deleted the temporary files $temp_dir$C_to_T_infile and $temp_dir$G_to_A_infile\n\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
554 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
555 else{
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
556 warn "Could not delete temporary files properly $!\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
557 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
558 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
559
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
560 ### printing a final report for the alignment procedure
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
561 print REPORT "Final Alignment report\n",'='x22,"\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
562 warn "Final Alignment report\n",'='x22,"\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
563 # foreach my $index (0..$#fhs){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
564 # print "$fhs[$index]->{name}\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
565 # print "$fhs[$index]->{seen}\talignments on the correct strand in total\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
566 # print "$fhs[$index]->{wrong_strand}\talignments were discarded (nonsensical alignments)\n\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
567 # }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
568
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
569 ### printing a final report for the methylation call procedure
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
570 warn "Sequences analysed in total:\t$counting{sequences_count}\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
571 print REPORT "Sequences analysed in total:\t$counting{sequences_count}\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
572 my $percent_alignable_sequences;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
573
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
574 if ($counting{sequences_count} == 0){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
575 $percent_alignable_sequences = 0;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
576 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
577 else{
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
578 $percent_alignable_sequences = sprintf ("%.1f",$counting{unique_best_alignment_count}*100/$counting{sequences_count});
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
579 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
580
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
581 warn "Number of alignments with a unique best hit from the different alignments:\t$counting{unique_best_alignment_count}\nMapping efficiency:\t${percent_alignable_sequences}%\n\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
582 print REPORT "Number of alignments with a unique best hit from the different alignments:\t$counting{unique_best_alignment_count}\nMapping efficiency:\t${percent_alignable_sequences}%\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
583
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
584 ### percentage of low complexity reads overruled because of low complexity (thereby creating a bias for highly methylated reads),
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
585 ### only calculating the percentage if there were any overruled alignments
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
586 if ($counting{low_complexity_alignments_overruled_count}){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
587 my $percent_overruled_low_complexity_alignments = sprintf ("%.1f",$counting{low_complexity_alignments_overruled_count}*100/$counting{sequences_count});
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
588 # print REPORT "Number of low complexity alignments which were overruled to have a unique best hit rather than discarding them:\t$counting{low_complexity_alignments_overruled_count}\t(${percent_overruled_low_complexity_alignments}%)\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
589 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
590
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
591 print "Sequences with no alignments under any condition:\t$counting{no_single_alignment_found}\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
592 print "Sequences did not map uniquely:\t$counting{unsuitable_sequence_count}\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
593 print "Sequences which were discarded because genomic sequence could not be extracted:\t$counting{genomic_sequence_could_not_be_extracted_count}\n\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
594 print "Number of sequences with unique best (first) alignment came from the bowtie output:\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
595 print join ("\n","CT/CT:\t$counting{CT_CT_count}\t((converted) top strand)","CT/GA:\t$counting{CT_GA_count}\t((converted) bottom strand)","GA/CT:\t$counting{GA_CT_count}\t(complementary to (converted) top strand)","GA/GA:\t$counting{GA_GA_count}\t(complementary to (converted) bottom strand)"),"\n\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
596
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
597 print REPORT "Sequences with no alignments under any condition:\t$counting{no_single_alignment_found}\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
598 print REPORT "Sequences did not map uniquely:\t$counting{unsuitable_sequence_count}\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
599 print REPORT "Sequences which were discarded because genomic sequence could not be extracted:\t$counting{genomic_sequence_could_not_be_extracted_count}\n\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
600 print REPORT "Number of sequences with unique best (first) alignment came from the bowtie output:\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
601 print REPORT join ("\n","CT/CT:\t$counting{CT_CT_count}\t((converted) top strand)","CT/GA:\t$counting{CT_GA_count}\t((converted) bottom strand)","GA/CT:\t$counting{GA_CT_count}\t(complementary to (converted) top strand)","GA/GA:\t$counting{GA_GA_count}\t(complementary to (converted) bottom strand)"),"\n\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
602
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
603 if ($directional){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
604 print "Number of alignments to (merely theoretical) complementary strands being rejected in total:\t$counting{alignments_rejected_count}\n\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
605 print REPORT "Number of alignments to (merely theoretical) complementary strands being rejected in total:\t$counting{alignments_rejected_count}\n\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
606 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
607
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
608 ### detailed information about Cs analysed
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
609 warn "Final Cytosine Methylation Report\n",'='x33,"\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
610 my $total_number_of_C = $counting{total_meCHH_count}+$counting{total_meCHG_count}+$counting{total_meCpG_count}+$counting{total_unmethylated_CHH_count}+$counting{total_unmethylated_CHG_count}+$counting{total_unmethylated_CpG_count};
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
611 warn "Total number of C's analysed:\t$total_number_of_C\n\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
612 warn "Total methylated C's in CpG context:\t$counting{total_meCpG_count}\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
613 warn "Total methylated C's in CHG context:\t$counting{total_meCHG_count}\n";
3
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
614 warn "Total methylated C's in CHH context:\t$counting{total_meCHH_count}\n";
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
615 if ($bowtie2){
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
616 warn "Total methylated C's in Unknown context:\t$counting{total_meC_unknown_count}\n";
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
617 }
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
618 warn "\n";
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
619
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
620 warn "Total unmethylated C's in CpG context:\t$counting{total_unmethylated_CpG_count}\n";
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
621 warn "Total unmethylated C's in CHG context:\t$counting{total_unmethylated_CHG_count}\n";
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
622 warn "Total unmethylated C's in CHH context:\t$counting{total_unmethylated_CHH_count}\n";
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
623 if ($bowtie2){
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
624 warn "Total unmethylated C's in Unknown context:\t$counting{total_unmethylated_C_unknown_count}\n";
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
625 }
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
626 warn "\n";
0
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
627
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
628 print REPORT "Final Cytosine Methylation Report\n",'='x33,"\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
629 print REPORT "Total number of C's analysed:\t$total_number_of_C\n\n";
3
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
630
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
631 print REPORT "Total methylated C's in CpG context:\t$counting{total_meCpG_count}\n";
0
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
632 print REPORT "Total methylated C's in CHG context:\t$counting{total_meCHG_count}\n";
3
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
633 print REPORT "Total methylated C's in CHH context:\t$counting{total_meCHH_count}\n";
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
634 if ($bowtie2){
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
635 print REPORT "Total methylated C's in Unknown context:\t$counting{total_meC_unknown_count}\n";
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
636 }
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
637 print REPORT "\n";
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
638
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
639 print REPORT "Total unmethylated C's in CpG context:\t$counting{total_unmethylated_CpG_count}\n";
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
640 print REPORT "Total unmethylated C's in CHG context:\t$counting{total_unmethylated_CHG_count}\n";
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
641 print REPORT "Total unmethylated C's in CHH context:\t$counting{total_unmethylated_CHH_count}\n";
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
642 if ($bowtie2){
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
643 print REPORT "Total unmethylated C's in Unknown context:\t$counting{total_unmethylated_C_unknown_count}\n";
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
644 }
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
645 print REPORT "\n";
0
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
646
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
647 my $percent_meCHG;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
648 if (($counting{total_meCHG_count}+$counting{total_unmethylated_CHG_count}) > 0){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
649 $percent_meCHG = sprintf("%.1f",100*$counting{total_meCHG_count}/($counting{total_meCHG_count}+$counting{total_unmethylated_CHG_count}));
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
650 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
651
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
652 my $percent_meCHH;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
653 if (($counting{total_meCHH_count}+$counting{total_unmethylated_CHH_count}) > 0){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
654 $percent_meCHH = sprintf("%.1f",100*$counting{total_meCHH_count}/($counting{total_meCHH_count}+$counting{total_unmethylated_CHH_count}));
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
655 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
656
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
657 my $percent_meCpG;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
658 if (($counting{total_meCpG_count}+$counting{total_unmethylated_CpG_count}) > 0){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
659 $percent_meCpG = sprintf("%.1f",100*$counting{total_meCpG_count}/($counting{total_meCpG_count}+$counting{total_unmethylated_CpG_count}));
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
660 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
661
3
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
662 my $percent_meC_unknown;
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
663 if (($counting{total_meC_unknown_count}+$counting{total_unmethylated_C_unknown_count}) > 0){
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
664 $percent_meC_unknown = sprintf("%.1f",100*$counting{total_meC_unknown_count}/($counting{total_meC_unknown_count}+$counting{total_unmethylated_C_unknown_count}));
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
665 }
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
666
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
667
0
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
668 ### printing methylated CpG percentage if applicable
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
669 if ($percent_meCpG){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
670 warn "C methylated in CpG context:\t${percent_meCpG}%\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
671 print REPORT "C methylated in CpG context:\t${percent_meCpG}%\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
672 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
673 else{
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
674 warn "Can't determine percentage of methylated Cs in CpG context if value was 0\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
675 print REPORT "Can't determine percentage of methylated Cs in CpG context if value was 0\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
676 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
677
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
678 ### printing methylated C percentage (CHG context) if applicable
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
679 if ($percent_meCHG){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
680 warn "C methylated in CHG context:\t${percent_meCHG}%\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
681 print REPORT "C methylated in CHG context:\t${percent_meCHG}%\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
682 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
683 else{
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
684 warn "Can't determine percentage of methylated Cs in CHG context if value was 0\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
685 print REPORT "Can't determine percentage of methylated Cs in CHG context if value was 0\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
686 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
687
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
688 ### printing methylated C percentage (CHH context) if applicable
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
689 if ($percent_meCHH){
3
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
690 warn "C methylated in CHH context:\t${percent_meCHH}%\n";
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
691 print REPORT "C methylated in CHH context:\t${percent_meCHH}%\n";
0
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
692 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
693 else{
3
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
694 warn "Can't determine percentage of methylated Cs in CHH context if value was 0\n";
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
695 print REPORT "Can't determine percentage of methylated Cs in CHH context if value was 0\n";
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
696 }
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
697
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
698 ### printing methylated C percentage (Unknown C context) if applicable
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
699 if ($bowtie2){
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
700 if ($percent_meC_unknown){
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
701 warn "C methylated in Unknown context (CN or CHN):\t${percent_meC_unknown}%\n";
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
702 print REPORT "C methylated in Unknown context (CN or CHN):\t${percent_meC_unknown}%\n";
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
703 }
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
704 else{
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
705 warn "Can't determine percentage of methylated Cs in Unknown context (CN or CHN) if value was 0\n";
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
706 print REPORT "Can't determine percentage of methylated Cs in Unknown context (CN or CHN) if value was 0\n";
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
707 }
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
708 }
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
709 print REPORT "\n\n";
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
710 warn "\n\n";
0
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
711
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
712 if ($seqID_contains_tabs){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
713 warn "The sequence IDs in the provided file contain tab-stops which might prevent sequence alignments. If this happened, please replace all tab characters within the seqID field with spaces before running Bismark.\n\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
714 print REPORT "The sequence IDs in the provided file contain tab-stops which might prevent sequence alignments. If this happened, please replace all tab characters within the seqID field with spaces before running Bismark.\n\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
715 }
3
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
716
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
717
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
718 ###########################################################################################################################################
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
719 ### create pie-chart with mapping stats
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
720 ###########################################################################################################################################
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
721
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
722
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
723 my $filename;
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
724 if ($pbat){
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
725 $filename = $G_to_A_infile;
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
726 }
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
727 else{
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
728 $filename = $C_to_T_infile;
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
729 }
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
730
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
731 my $pie_chart = (split (/\//,$filename))[-1]; # extracting the filename if a full path was specified
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
732 $pie_chart =~ s/gz$//;
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
733 $pie_chart =~ s/_C_to_T\.fastq$//;
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
734 $pie_chart =~ s/_G_to_A\.fastq$//;
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
735
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
736 # if ($prefix){
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
737 # $pie_chart = "$prefix.$pie_chart"; # this is now being taken care of in file transformation
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
738 # }
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
739 $pie_chart = "${output_dir}${pie_chart}_bismark_SE.alignment_overview.png";
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
740
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
741
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
742 #Check whether the module GD::Graph is installed
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
743 my $gd_graph_installed = 0;
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
744 eval{
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
745 require GD::Graph::pie;
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
746 GD::Graph::pie->import();
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
747 };
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
748
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
749 unless($@) {
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
750 $gd_graph_installed = 1;
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
751 }
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
752 else{
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
753 warn "Perl module GD::Graph::pie is not installed, skipping graphical alignment summary\n";
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
754 sleep(2);
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
755 }
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
756
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
757 if ($gd_graph_installed){
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
758 warn "Generating pie chart\n\n";
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
759 sleep(1);
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
760 my $graph = GD::Graph::pie->new(600,600);
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
761
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
762 my $percent_unaligned;
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
763 my $percent_multiple;
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
764 my $percent_unextractable;
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
765
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
766 if ($counting{sequences_count}){
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
767 $percent_unaligned = sprintf ("%.1f",$counting{no_single_alignment_found}*100/$counting{sequences_count});
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
768 $percent_multiple = sprintf ("%.1f",$counting{unsuitable_sequence_count}*100/$counting{sequences_count});
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
769 $percent_unextractable = sprintf ("%.1f",$counting{genomic_sequence_could_not_be_extracted_count}*100/$counting{sequences_count});
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
770 }
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
771 else{
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
772 $percent_unaligned = $percent_multiple = $percent_unextractable = 'N/A';
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
773 }
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
774
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
775 my @aln_stats = (
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
776 ["Uniquely aligned $percent_alignable_sequences%","Unaligned $percent_unaligned%","Multiple alignments $percent_multiple%","sequence unextractable $percent_unextractable%"],
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
777 [$counting{unique_best_alignment_count},$counting{no_single_alignment_found},$counting{unsuitable_sequence_count},$counting{genomic_sequence_could_not_be_extracted_count}],
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
778 );
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
779
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
780 $graph->set(
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
781 start_angle => 180,
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
782 '3d' => 0,
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
783 label => 'Alignment stats (single-end)',
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
784 suppress_angle => 2, # Only label slices of sufficient size
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
785 transparent => 0,
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
786 dclrs => [ qw(red lorange dgreen cyan) ],
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
787 ) or die $graph->error;
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
788
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
789 my $gd = $graph->plot(\@aln_stats) or die $graph->error;
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
790
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
791 open (PIE,'>',$pie_chart) or die "Failed to write to file for alignments pie chart: $!\n\n";
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
792 binmode PIE;
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
793 print PIE $gd->png;
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
794 }
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
795
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
796 warn "====================\nBismark run complete\n====================\n\n";
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
797
0
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
798 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
799
3
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
800
0
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
801 sub print_final_analysis_report_paired_ends{
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
802 my ($C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2) = @_;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
803 ### All sequences from the original sequence file have been analysed now, therefore deleting temporary C->T or G->A infiles
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
804 if ($directional){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
805 if ($G_to_A_infile_2){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
806 my $deletion_successful = unlink "$temp_dir$C_to_T_infile_1","$temp_dir$G_to_A_infile_2";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
807 if ($deletion_successful == 2){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
808 warn "\nSuccessfully deleted the temporary files $temp_dir$C_to_T_infile_1 and $temp_dir$G_to_A_infile_2\n\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
809 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
810 else{
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
811 warn "Could not delete temporary files $temp_dir$C_to_T_infile_1 and $temp_dir$G_to_A_infile_2 properly: $!\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
812 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
813 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
814 else{ # for paired-end FastQ infiles with Bowtie1 there is only one file to delete
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
815 my $deletion_successful = unlink "$temp_dir$C_to_T_infile_1";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
816 if ($deletion_successful == 1){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
817 warn "\nSuccessfully deleted the temporary file $temp_dir$C_to_T_infile_1\n\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
818 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
819 else{
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
820 warn "Could not delete temporary file $temp_dir$C_to_T_infile_1 properly: $!\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
821 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
822 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
823 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
824 else{
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
825 if ($G_to_A_infile_2 and $C_to_T_infile_2){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
826 my $deletion_successful = unlink "$temp_dir$C_to_T_infile_1","$temp_dir$G_to_A_infile_1","$temp_dir$C_to_T_infile_2","$temp_dir$G_to_A_infile_2";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
827 if ($deletion_successful == 4){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
828 warn "\nSuccessfully deleted the temporary files $temp_dir$C_to_T_infile_1, $temp_dir$G_to_A_infile_1, $temp_dir$C_to_T_infile_2 and $temp_dir$G_to_A_infile_2\n\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
829 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
830 else{
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
831 warn "Could not delete temporary files properly: $!\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
832 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
833 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
834 else{ # for paired-end FastQ infiles with Bowtie1 there are only two files to delete
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
835 my $deletion_successful = unlink "$temp_dir$C_to_T_infile_1","$temp_dir$G_to_A_infile_1";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
836 if ($deletion_successful == 2){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
837 warn "\nSuccessfully deleted the temporary files $temp_dir$C_to_T_infile_1 and $temp_dir$G_to_A_infile_1\n\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
838 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
839 else{
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
840 warn "Could not delete temporary files properly: $!\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
841 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
842 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
843 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
844
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
845 ### printing a final report for the alignment procedure
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
846 warn "Final Alignment report\n",'='x22,"\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
847 print REPORT "Final Alignment report\n",'='x22,"\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
848 # foreach my $index (0..$#fhs){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
849 # print "$fhs[$index]->{name}\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
850 # print "$fhs[$index]->{seen}\talignments on the correct strand in total\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
851 # print "$fhs[$index]->{wrong_strand}\talignments were discarded (nonsensical alignments)\n\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
852 # }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
853
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
854 ### printing a final report for the methylation call procedure
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
855 warn "Sequence pairs analysed in total:\t$counting{sequences_count}\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
856 print REPORT "Sequence pairs analysed in total:\t$counting{sequences_count}\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
857
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
858 my $percent_alignable_sequence_pairs;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
859 if ($counting{sequences_count} == 0){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
860 $percent_alignable_sequence_pairs = 0;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
861 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
862 else{
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
863 $percent_alignable_sequence_pairs = sprintf ("%.1f",$counting{unique_best_alignment_count}*100/$counting{sequences_count});
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
864 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
865 print "Number of paired-end alignments with a unique best hit:\t$counting{unique_best_alignment_count}\nMapping efficiency:\t${percent_alignable_sequence_pairs}%\n\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
866 print REPORT "Number of paired-end alignments with a unique best hit:\t$counting{unique_best_alignment_count}\nMapping efficiency:\t${percent_alignable_sequence_pairs}% \n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
867
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
868 print "Sequence pairs with no alignments under any condition:\t$counting{no_single_alignment_found}\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
869 print "Sequence pairs did not map uniquely:\t$counting{unsuitable_sequence_count}\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
870 print "Sequence pairs which were discarded because genomic sequence could not be extracted:\t$counting{genomic_sequence_could_not_be_extracted_count}\n\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
871 print "Number of sequence pairs with unique best (first) alignment came from the bowtie output:\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
872 print join ("\n","CT/GA/CT:\t$counting{CT_GA_CT_count}\t((converted) top strand)","GA/CT/CT:\t$counting{GA_CT_CT_count}\t(complementary to (converted) top strand)","GA/CT/GA:\t$counting{GA_CT_GA_count}\t(complementary to (converted) bottom strand)","CT/GA/GA:\t$counting{CT_GA_GA_count}\t((converted) bottom strand)"),"\n\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
873
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
874
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
875 print REPORT "Sequence pairs with no alignments under any condition:\t$counting{no_single_alignment_found}\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
876 print REPORT "Sequence pairs did not map uniquely:\t$counting{unsuitable_sequence_count}\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
877 print REPORT "Sequence pairs which were discarded because genomic sequence could not be extracted:\t$counting{genomic_sequence_could_not_be_extracted_count}\n\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
878 print REPORT "Number of sequence pairs with unique best (first) alignment came from the bowtie output:\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
879 print REPORT join ("\n","CT/GA/CT:\t$counting{CT_GA_CT_count}\t((converted) top strand)","GA/CT/CT:\t$counting{GA_CT_CT_count}\t(complementary to (converted) top strand)","GA/CT/GA:\t$counting{GA_CT_GA_count}\t(complementary to (converted) bottom strand)","CT/GA/GA:\t$counting{CT_GA_GA_count}\t((converted) bottom strand)"),"\n\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
880 ### detailed information about Cs analysed
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
881
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
882 if ($directional){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
883 print "Number of alignments to (merely theoretical) complementary strands being rejected in total:\t$counting{alignments_rejected_count}\n\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
884 print REPORT "Number of alignments to (merely theoretical) complementary strands being rejected in total:\t$counting{alignments_rejected_count}\n\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
885 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
886
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
887 warn "Final Cytosine Methylation Report\n",'='x33,"\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
888 print REPORT "Final Cytosine Methylation Report\n",'='x33,"\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
889
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
890 my $total_number_of_C = $counting{total_meCHG_count}+ $counting{total_meCHH_count}+$counting{total_meCpG_count}+$counting{total_unmethylated_CHG_count}+$counting{total_unmethylated_CHH_count}+$counting{total_unmethylated_CpG_count};
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
891 warn "Total number of C's analysed:\t$total_number_of_C\n\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
892 warn "Total methylated C's in CpG context:\t$counting{total_meCpG_count}\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
893 warn "Total methylated C's in CHG context:\t$counting{total_meCHG_count}\n";
3
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
894 warn "Total methylated C's in CHH context:\t$counting{total_meCHH_count}\n";
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
895 if ($bowtie2){
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
896 warn "Total methylated C's in Unknown context:\t$counting{total_meC_unknown_count}\n";
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
897 }
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
898 warn "\n";
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
899
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
900 warn "Total unmethylated C's in CpG context:\t$counting{total_unmethylated_CpG_count}\n";
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
901 warn "Total unmethylated C's in CHG context:\t$counting{total_unmethylated_CHG_count}\n";
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
902 warn "Total unmethylated C's in CHH context:\t$counting{total_unmethylated_CHH_count}\n";
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
903 if ($bowtie2){
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
904 warn "Total unmethylated C's in Unknown context:\t$counting{total_unmethylated_C_unknown_count}\n";
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
905 }
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
906 warn "\n";
0
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
907
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
908 print REPORT "Total number of C's analysed:\t$total_number_of_C\n\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
909 print REPORT "Total methylated C's in CpG context:\t$counting{total_meCpG_count}\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
910 print REPORT "Total methylated C's in CHG context:\t$counting{total_meCHG_count}\n";
3
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
911 print REPORT "Total methylated C's in CHH context:\t$counting{total_meCHH_count}\n";
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
912 if ($bowtie2){
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
913 print REPORT "Total methylated C's in Unknown context:\t$counting{total_meC_unknown_count}\n\n";
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
914 }
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
915 print REPORT "\n";
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
916
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
917 print REPORT "Total unmethylated C's in CpG context:\t$counting{total_unmethylated_CpG_count}\n";
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
918 print REPORT "Total unmethylated C's in CHG context:\t$counting{total_unmethylated_CHG_count}\n";
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
919 print REPORT "Total unmethylated C's in CHH context:\t$counting{total_unmethylated_CHH_count}\n";
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
920 if ($bowtie2){
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
921 print REPORT "Total unmethylated C's in Unknown context:\t$counting{total_unmethylated_C_unknown_count}\n\n";
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
922 }
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
923 print REPORT "\n";
0
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
924
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
925 my $percent_meCHG;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
926 if (($counting{total_meCHG_count}+$counting{total_unmethylated_CHG_count}) > 0){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
927 $percent_meCHG = sprintf("%.1f",100*$counting{total_meCHG_count}/($counting{total_meCHG_count}+$counting{total_unmethylated_CHG_count}));
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
928 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
929
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
930 my $percent_meCHH;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
931 if (($counting{total_meCHH_count}+$counting{total_unmethylated_CHH_count}) > 0){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
932 $percent_meCHH = sprintf("%.1f",100*$counting{total_meCHH_count}/($counting{total_meCHH_count}+$counting{total_unmethylated_CHH_count}));
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
933 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
934
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
935 my $percent_meCpG;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
936 if (($counting{total_meCpG_count}+$counting{total_unmethylated_CpG_count}) > 0){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
937 $percent_meCpG = sprintf("%.1f",100*$counting{total_meCpG_count}/($counting{total_meCpG_count}+$counting{total_unmethylated_CpG_count}));
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
938 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
939
3
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
940 my $percent_meC_unknown;
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
941 if (($counting{total_meC_unknown_count}+$counting{total_unmethylated_C_unknown_count}) > 0){
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
942 $percent_meC_unknown = sprintf("%.1f",100*$counting{total_meC_unknown_count}/($counting{total_meC_unknown_count}+$counting{total_unmethylated_C_unknown_count}));
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
943 }
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
944
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
945
0
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
946 ### printing methylated CpG percentage if applicable
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
947 if ($percent_meCpG){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
948 warn "C methylated in CpG context:\t${percent_meCpG}%\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
949 print REPORT "C methylated in CpG context:\t${percent_meCpG}%\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
950 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
951 else{
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
952 warn "Can't determine percentage of methylated Cs in CpG context if value was 0\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
953 print REPORT "Can't determine percentage of methylated Cs in CpG context if value was 0\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
954 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
955
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
956 ### printing methylated C percentage in CHG context if applicable
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
957 if ($percent_meCHG){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
958 warn "C methylated in CHG context:\t${percent_meCHG}%\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
959 print REPORT "C methylated in CHG context:\t${percent_meCHG}%\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
960 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
961 else{
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
962 warn "Can't determine percentage of methylated Cs in CHG context if value was 0\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
963 print REPORT "Can't determine percentage of methylated Cs in CHG context if value was 0\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
964 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
965
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
966 ### printing methylated C percentage in CHH context if applicable
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
967 if ($percent_meCHH){
3
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
968 warn "C methylated in CHH context:\t${percent_meCHH}%\n";
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
969 print REPORT "C methylated in CHH context:\t${percent_meCHH}%\n";
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
970 }
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
971 else{
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
972 warn "Can't determine percentage of methylated Cs in CHH context if value was 0\n";
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
973 print REPORT "Can't determine percentage of methylated Cs in CHH context if value was 0\n";
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
974 }
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
975
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
976 ### printing methylated C percentage (Unknown C context) if applicable
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
977 if ($bowtie2){
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
978 if ($percent_meC_unknown){
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
979 warn "C methylated in unknown context (CN or CHN):\t${percent_meC_unknown}%\n";
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
980 print REPORT "C methylated in unknown context (CN or CHN):\t${percent_meC_unknown}%\n";
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
981 }
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
982 else{
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
983 warn "Can't determine percentage of methylated Cs in unknown context (CN or CHN) if value was 0\n";
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
984 print REPORT "Can't determine percentage of methylated Cs in unknown context (CN or CHN) if value was 0\n";
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
985 }
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
986 }
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
987 print REPORT "\n\n";
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
988 warn "\n\n";
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
989
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
990
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
991 ############################################################################################################################################
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
992 ### create pie-chart with mapping stats
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
993 ###########################################################################################################################################
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
994
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
995 my $filename;
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
996 if ($pbat){
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
997 $filename = $G_to_A_infile_1;
0
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
998 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
999 else{
3
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
1000 $filename = $C_to_T_infile_1;
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
1001 }
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
1002
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
1003 my $pie_chart = (split (/\//,$filename))[-1]; # extracting the filename if a full path was specified
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
1004 $pie_chart =~ s/gz$//;
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
1005 $pie_chart =~ s/_C_to_T.fastq$//;
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
1006 $pie_chart =~ s/_G_to_A.fastq$//;
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
1007 ### special format for gzipped PE Bowtie1 files
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
1008 $pie_chart =~ s/\.CT_plus_GA\.fastq\.$//;
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
1009 $pie_chart =~ s/\.GA_plus_CT\.fastq\.$//;
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
1010
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
1011 if ($prefix){
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
1012 # prefix is now being prepended to the temp files already
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
1013 # $pie_chart = "$prefix.$pie_chart";
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
1014 }
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
1015 $pie_chart = "${output_dir}${pie_chart}_bismark_PE.alignment_overview.png";
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
1016
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
1017 #Check whether the module GD::Graph is installed
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
1018 my $gd_graph_installed = 0;
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
1019 eval{
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
1020 require GD::Graph::pie;
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
1021 GD::Graph::pie->import();
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
1022 };
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
1023
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
1024 unless($@) {
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
1025 $gd_graph_installed = 1;
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
1026 }
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
1027 else{
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
1028 warn "Perl module GD::Graph::pie is not installed, skipping graphical alignment summary\n";
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
1029 sleep(2);
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
1030 }
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
1031
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
1032 if ($gd_graph_installed){
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
1033 warn "Generating pie chart\n\n";
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
1034 sleep(1);
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
1035 my $graph = GD::Graph::pie->new(600,600);
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
1036
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
1037 my $percent_unaligned;
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
1038 my $percent_multiple;
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
1039 my $percent_unextractable;
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
1040
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
1041 if ($counting{sequences_count}){
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
1042 $percent_unaligned = sprintf ("%.1f",$counting{no_single_alignment_found}*100/$counting{sequences_count});
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
1043 $percent_multiple = sprintf ("%.1f",$counting{unsuitable_sequence_count}*100/$counting{sequences_count});
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
1044 $percent_unextractable = sprintf ("%.1f",$counting{genomic_sequence_could_not_be_extracted_count}*100/$counting{sequences_count});
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
1045 }
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
1046 else{
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
1047 $percent_unaligned = $percent_multiple = $percent_unextractable = 'N/A';
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
1048 }
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
1049
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
1050 my @aln_stats = (
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
1051 ["Uniquely aligned pairs $percent_alignable_sequence_pairs%","Unaligned $percent_unaligned%","Multiple alignments $percent_multiple%","sequence unextractable $percent_unextractable%"],
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
1052 [$counting{unique_best_alignment_count},$counting{no_single_alignment_found},$counting{unsuitable_sequence_count},$counting{genomic_sequence_could_not_be_extracted_count}],
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
1053 );
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
1054
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
1055 # push @{$mbias_read1[0]},$pos;
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
1056
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
1057 $graph->set(
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
1058 start_angle => 180,
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
1059 '3d' => 0,
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
1060 label => 'Alignment stats (paired-end)',
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
1061 suppress_angle => 2, # Only label slices of sufficient size
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
1062 transparent => 0,
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
1063 dclrs => [ qw(red lorange dgreen cyan) ],
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
1064 ) or die $graph->error;
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
1065
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
1066 my $gd = $graph->plot(\@aln_stats) or die $graph->error;
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
1067
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
1068 open (PIE,'>',$pie_chart) or die "Failed to write to file for alignments pie chart: $!\n\n";
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
1069 binmode PIE;
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
1070 print PIE $gd->png;
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
1071 }
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
1072
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
1073 warn "====================\nBismark run complete\n====================\n\n";
0
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1074
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1075 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1076
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
### Reads a single-end FastA sequence file two lines at a time (header line, sequence line) and passes
### every read on to the Bowtie 1 or Bowtie 2 result checker, which needs the actual read sequence to
### compare it against the genomic sequence in order to make a methylation call.
###
### Arguments:
###   $sequence_file - FastA file to read; a name ending in .gz is decompressed through zcat
###   $C_to_T_infile - handed on unchanged to print_final_analysis_report_single_end
###   $G_to_A_infile - handed on unchanged to print_final_analysis_report_single_end
###
### Reads the file-level settings $skip, $upto, $bowtie2, $ambiguous and $unmapped, increments
### $counting{sequences_count} for every read that is processed, and prints reads to the AMBIG or
### UNMAPPED filehandles depending on the value returned by the result checker.
### The value of the final call to print_final_analysis_report_single_end is the return value.
sub process_single_end_fastA_file_for_methylation_call{
  my ($sequence_file,$C_to_T_infile,$G_to_A_infile) = @_;

  ### gzipped version of the infile
  if ($sequence_file =~ /\.gz$/){
    # list-form pipe open: the filename reaches zcat as one argument and is never parsed by a shell,
    # so names containing spaces or shell metacharacters are handled safely
    open (IN,'-|','zcat',$sequence_file) or die "Failed to open zcat pipe to $sequence_file: $!\n";
  }
  else{
    # explicit read mode so that characters in the filename are never interpreted as an open mode
    open (IN,'<',$sequence_file) or die "Failed to read from $sequence_file: $!\n";
  }

  my $count = 0;

  warn "\nReading in the sequence file $sequence_file\n";
  while (1) {
    # last if ($counting{sequences_count} > 100);
    my $identifier = <IN>;
    my $sequence   = <IN>;

    # readline returns undef at the end of the input; testing definedness (rather than truth) keeps a
    # final line consisting of a single '0' without a trailing newline from being mistaken for EOF
    last unless (defined $identifier and defined $sequence);

    $identifier = fix_IDs($identifier); # this is to avoid problems with truncated read ID when they contain white spaces

    ++$count;

    # $count covers every record read, including skipped ones; $counting{sequences_count} only the processed ones
    if ($skip){
      next unless ($count > $skip);
    }
    if ($upto){
      last if ($count > $upto);
    }

    $counting{sequences_count}++;
    if ($counting{sequences_count}%1000000==0) {
      warn "Processed $counting{sequences_count} sequences so far\n";
    }
    chomp $sequence;
    chomp $identifier;

    $identifier =~ s/^>//; # deletes the > at the beginning of FastA headers

    my $return;
    if ($bowtie2){
      $return = check_bowtie_results_single_end_bowtie2 (uc $sequence,$identifier);
    }
    else{
      $return = check_bowtie_results_single_end(uc $sequence,$identifier); # default Bowtie 1
    }

    # an undefined or empty return value is normalised to 0 so the numeric comparisons below do not warn
    $return ||= 0;

    if ($ambiguous and $return == 2){
      # print the sequence to ambiguous.out if --ambiguous was specified
      print AMBIG ">$identifier\n";
      print AMBIG "$sequence\n";
    }
    elsif ($unmapped and $return == 1){
      # print the sequence to <unmapped.out> file if --un was specified
      print UNMAPPED ">$identifier\n";
      print UNMAPPED "$sequence\n";
    }
  }

  # Release the input handle (and reap the zcat child, if any). The status is deliberately not checked:
  # when --upto stops the loop early zcat is terminated mid-stream and close legitimately reports failure.
  close IN;

  print "Processed $counting{sequences_count} sequences in total\n\n";

  print_final_analysis_report_single_end($C_to_T_infile,$G_to_A_infile);

}
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1149
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
### Reads a single-end FastQ sequence file four lines at a time (header, sequence, second header,
### quality string) and passes every read on to the Bowtie 1 or Bowtie 2 result checker, which needs
### the actual read sequence to compare it against the genomic sequence in order to make a methylation call.
###
### Arguments:
###   $sequence_file - FastQ file to read; a name ending in .gz is decompressed through zcat
###   $C_to_T_infile - handed on unchanged to print_final_analysis_report_single_end
###   $G_to_A_infile - handed on unchanged to print_final_analysis_report_single_end
###
### Reads the file-level settings $skip, $upto, $bowtie2, $ambiguous and $unmapped, increments
### $counting{sequences_count} for every read that is processed, and prints reads to the AMBIG or
### UNMAPPED filehandles depending on the value returned by the result checker.
### The value of the final call to print_final_analysis_report_single_end is the return value.
sub process_single_end_fastQ_file_for_methylation_call{
  my ($sequence_file,$C_to_T_infile,$G_to_A_infile) = @_;

  ### gzipped version of the infile
  if ($sequence_file =~ /\.gz$/){
    # list-form pipe open: the filename reaches zcat as one argument and is never parsed by a shell,
    # so names containing spaces or shell metacharacters are handled safely
    open (IN,'-|','zcat',$sequence_file) or die "Failed to open zcat pipe to $sequence_file: $!\n";
  }
  else{
    # explicit read mode so that characters in the filename are never interpreted as an open mode
    open (IN,'<',$sequence_file) or die "Failed to read from $sequence_file: $!\n";
  }

  my $count = 0;

  warn "\nReading in the sequence file $sequence_file\n";
  while (1) {
    my $identifier    = <IN>;
    my $sequence      = <IN>;
    my $identifier_2  = <IN>;
    my $quality_value = <IN>;

    # readline returns undef at the end of the input; testing definedness (rather than truth) keeps the
    # last record from being dropped when its quality line is the single character '0' with no newline
    last unless (defined $identifier and defined $sequence and defined $identifier_2 and defined $quality_value);

    $identifier = fix_IDs($identifier); # this is to avoid problems with truncated read ID when they contain white spaces

    ++$count;

    # $count covers every record read, including skipped ones; $counting{sequences_count} only the processed ones
    if ($skip){
      next unless ($count > $skip);
    }
    if ($upto){
      last if ($count > $upto);
    }

    $counting{sequences_count}++;

    if ($counting{sequences_count}%1000000==0) {
      warn "Processed $counting{sequences_count} sequences so far\n";
    }
    chomp $sequence;
    chomp $identifier;
    chomp $quality_value;
    # $identifier_2 is intentionally left un-chomped: it is printed back verbatim below

    $identifier =~ s/^\@//; # deletes the @ at the beginning of Illumina FastQ headers

    my $return;
    if ($bowtie2){
      $return = check_bowtie_results_single_end_bowtie2 (uc $sequence,$identifier,$quality_value);
    }
    else{
      $return = check_bowtie_results_single_end(uc $sequence,$identifier,$quality_value); # default Bowtie 1
    }

    # an undefined or empty return value is normalised to 0 so the numeric comparisons below do not warn
    $return ||= 0;

    if ($ambiguous and $return == 2){
      # print the sequence to ambiguous.out if --ambiguous was specified
      print AMBIG "\@$identifier\n";
      print AMBIG "$sequence\n";
      print AMBIG $identifier_2;
      print AMBIG "$quality_value\n";
    }
    elsif ($unmapped and $return == 1){
      # print the sequence to <unmapped.out> file if --un was specified
      print UNMAPPED "\@$identifier\n";
      print UNMAPPED "$sequence\n";
      print UNMAPPED $identifier_2;
      print UNMAPPED "$quality_value\n";
    }
  }

  # Release the input handle (and reap the zcat child, if any). The status is deliberately not checked:
  # when --upto stops the loop early zcat is terminated mid-stream and close legitimately reports failure.
  close IN;

  print "Processed $counting{sequences_count} sequences in total\n\n";

  print_final_analysis_report_single_end($C_to_T_infile,$G_to_A_infile);

}
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1229
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1230 sub process_fastA_files_for_paired_end_methylation_calls{
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1231 my ($sequence_file_1,$sequence_file_2,$C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2) = @_;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1232 ### Processing the two FastA sequence files; we need the actual sequences of both reads to compare them against the genomic sequence in order to
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1233 ### make a methylation call. The sequence idetifier per definition needs to be the same for a sequence pair used for paired-end mapping.
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1234 ### Now reading in the sequence files sequence by sequence and see if the current sequences produced an alignment to one (or both) of the
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1235 ### converted genomes (either the C->T or G->A version)
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1236
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1237 ### gzipped version of the infiles
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1238 if ($sequence_file_1 =~ /\.gz$/ and $sequence_file_2 =~ /\.gz$/){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1239 open (IN1,"zcat $sequence_file_1 |") or die "Failed to open zcat pipe to $sequence_file_1 $!\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1240 open (IN2,"zcat $sequence_file_2 |") or die "Failed to open zcat pipe to $sequence_file_2 $!\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1241 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1242 else{
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1243 open (IN1,$sequence_file_1) or die $!;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1244 open (IN2,$sequence_file_2) or die $!;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1245 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1246
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1247 warn "\nReading in the sequence files $sequence_file_1 and $sequence_file_2\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1248 ### Both files are required to have the exact same number of sequences, therefore we can process the sequences jointly one by one
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1249
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1250 my $count = 0;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1251
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1252 while (1) {
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1253 # reading from the first input file
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1254 my $identifier_1 = <IN1>;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1255 my $sequence_1 = <IN1>;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1256 # reading from the second input file
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1257 my $identifier_2 = <IN2>;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1258 my $sequence_2 = <IN2>;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1259 last unless ($identifier_1 and $sequence_1 and $identifier_2 and $sequence_2);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1260
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1261 $identifier_1 = fix_IDs($identifier_1); # this is to avoid problems with truncated read ID when they contain white spaces
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1262 $identifier_2 = fix_IDs($identifier_2);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1263
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1264 ++$count;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1265
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1266 if ($skip){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1267 next unless ($count > $skip);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1268 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1269 if ($upto){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1270 last if ($count > $upto);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1271 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1272
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1273 $counting{sequences_count}++;
3
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
1274 if ($counting{sequences_count}%1000000==0) {
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
1275 warn "Processed $counting{sequences_count} sequence pairs so far\n";
0
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1276 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1277 my $orig_identifier_1 = $identifier_1;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1278 my $orig_identifier_2 = $identifier_2;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1279
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1280 chomp $sequence_1;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1281 chomp $identifier_1;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1282 chomp $sequence_2;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1283 chomp $identifier_2;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1284
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1285 $identifier_1 =~ s/^>//; # deletes the > at the beginning of FastA headers
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1286
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1287 my $return;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1288 if ($bowtie2){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1289 $return = check_bowtie_results_paired_ends_bowtie2 (uc$sequence_1,uc$sequence_2,$identifier_1);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1290 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1291 else{
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1292 $return = check_bowtie_results_paired_ends (uc$sequence_1,uc$sequence_2,$identifier_1);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1293 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1294
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1295 unless ($return){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1296 $return = 0;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1297 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1298
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1299 # print the sequences to ambiguous_1 and _2 if --ambiguous was specified
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1300 if ($ambiguous and $return == 2){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1301 print AMBIG_1 $orig_identifier_1;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1302 print AMBIG_1 "$sequence_1\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1303 print AMBIG_2 $orig_identifier_2;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1304 print AMBIG_2 "$sequence_2\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1305 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1306
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1307 # print the sequences to unmapped_1.out and unmapped_2.out if --un was specified
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1308 elsif ($unmapped and $return == 1){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1309 print UNMAPPED_1 $orig_identifier_1;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1310 print UNMAPPED_1 "$sequence_1\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1311 print UNMAPPED_2 $orig_identifier_2;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1312 print UNMAPPED_2 "$sequence_2\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1313 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1314 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1315
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1316 warn "Processed $counting{sequences_count} sequences in total\n\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1317
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1318 close OUT or die $!;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1319
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1320 print_final_analysis_report_paired_ends($C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1321
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1322 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1323
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
sub process_fastQ_files_for_paired_end_methylation_calls{
  my ($sequence_file_1,$sequence_file_2,$C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2) = @_;
  ### Walks through the two FastQ files of a paired-end run in lock-step, one read pair at a time, and hands every pair to
  ### check_bowtie_results_paired_ends (or the _bowtie2 variant if $bowtie2 is set) so that a methylation call can be made.
  ###
  ### Arguments:
  ###   $sequence_file_1, $sequence_file_2  - FastQ files for read 1 and read 2 (plain text or gzipped, detected by a .gz suffix)
  ###   $C_to_T_infile_1 ... $G_to_A_infile_2 - not used here directly, only passed on to print_final_analysis_report_paired_ends
  ###
  ### Uses the file-level settings $skip, $upto, $bowtie2, $ambiguous and $unmapped, updates $counting{sequences_count},
  ### writes to the AMBIG_1/2 and UNMAPPED_1/2 filehandles when requested, and finally closes OUT.
  ### The sequence identifier per definition needs to be the same for a sequence pair used for paired-end alignments.

  ### Each input file is inspected on its own, so a pair consisting of one gzipped and one plain file is read correctly.
  ### The list form of the pipe open runs zcat without a shell, thus file names containing spaces or shell
  ### metacharacters are passed through verbatim instead of being interpreted.
  if ($sequence_file_1 =~ /\.gz$/){
    open (IN1,'-|','zcat',$sequence_file_1) or die "Failed to open zcat pipe to $sequence_file_1 $!\n";
  }
  else{
    open (IN1,'<',$sequence_file_1) or die "Failed to read from $sequence_file_1: $!\n";
  }

  if ($sequence_file_2 =~ /\.gz$/){
    open (IN2,'-|','zcat',$sequence_file_2) or die "Failed to open zcat pipe to $sequence_file_2 $!\n";
  }
  else{
    open (IN2,'<',$sequence_file_2) or die "Failed to read from $sequence_file_2: $!\n";
  }

  my $count = 0;

  warn "\nReading in the sequence files $sequence_file_1 and $sequence_file_2\n";
  ### Both files are required to have the exact same number of sequences, therefore we can process the sequences jointly one by one
  while (1) {
    # reading one 4-line FastQ record from the first input file
    my $identifier_1 = <IN1>;
    my $sequence_1 = <IN1>;
    my $ident_1 = <IN1>;         # only needed for writing the read back out (--ambiguous / --un)
    my $quality_value_1 = <IN1>;
    # reading one 4-line FastQ record from the second input file
    my $identifier_2 = <IN2>;
    my $sequence_2 = <IN2>;
    my $ident_2 = <IN2>;         # only needed for writing the read back out (--ambiguous / --un)
    my $quality_value_2 = <IN2>;

    ### readline returns undef at the end of the input; testing for definedness (rather than truth) makes sure a final
    ### line consisting of just '0' without a trailing newline is not mistaken for the end of the file
    last unless (defined $identifier_1 and defined $sequence_1 and defined $quality_value_1
                 and defined $identifier_2 and defined $sequence_2 and defined $quality_value_2);

    $identifier_1 = fix_IDs($identifier_1); # this is to avoid problems with truncated read ID when they contain white spaces
    $identifier_2 = fix_IDs($identifier_2);

    ++$count;

    ### $count includes skipped pairs, $counting{sequences_count} below only the pairs that are actually analysed
    if ($skip){
      next unless ($count > $skip);
    }
    if ($upto){
      last if ($count > $upto);
    }

    $counting{sequences_count}++;
    if ($counting{sequences_count}%1000000==0) {
      warn "Processed $counting{sequences_count} sequence pairs so far\n";
    }

    ### copies taken before chomp/trimming; they are printed below without an added newline
    my $orig_identifier_1 = $identifier_1;
    my $orig_identifier_2 = $identifier_2;

    chomp $sequence_1;
    chomp $identifier_1;
    chomp $sequence_2;
    chomp $identifier_2;
    chomp $quality_value_1;
    chomp $quality_value_2;

    $identifier_1 =~ s/^\@//;  # deletes the @ at the beginning of the FastQ ID

    my $return;
    if ($bowtie2){
      $return = check_bowtie_results_paired_ends_bowtie2 (uc$sequence_1,uc$sequence_2,$identifier_1,$quality_value_1,$quality_value_2);
    }
    else{
      $return = check_bowtie_results_paired_ends (uc$sequence_1,uc$sequence_2,$identifier_1,$quality_value_1,$quality_value_2);
    }

    ### an empty/undefined return is normalised to 0 so that the numeric comparisons below do not warn
    unless ($return){
      $return = 0;
    }

    # print the sequences to ambiguous_1 and _2 if --ambiguous was specified (return value 2)
    if ($ambiguous and $return == 2){
      # seq_1
      print AMBIG_1 $orig_identifier_1;
      print AMBIG_1 "$sequence_1\n";
      print AMBIG_1 $ident_1;
      print AMBIG_1 "$quality_value_1\n";
      # seq_2
      print AMBIG_2 $orig_identifier_2;
      print AMBIG_2 "$sequence_2\n";
      print AMBIG_2 $ident_2;
      print AMBIG_2 "$quality_value_2\n";
    }

    # print the sequences to unmapped_1.out and unmapped_2.out if --un was specified (return value 1)
    elsif ($unmapped and $return == 1){
      # seq_1
      print UNMAPPED_1 $orig_identifier_1;
      print UNMAPPED_1 "$sequence_1\n";
      print UNMAPPED_1 $ident_1;
      print UNMAPPED_1 "$quality_value_1\n";
      # seq_2
      print UNMAPPED_2 $orig_identifier_2;
      print UNMAPPED_2 "$sequence_2\n";
      print UNMAPPED_2 $ident_2;
      print UNMAPPED_2 "$quality_value_2\n";
    }
  }

  warn "Processed $counting{sequences_count} sequences in total\n\n";

  ### Releasing the input handles (and reaping zcat). The return value is deliberately not checked: if the loop was left
  ### early because of --upto, zcat gets terminated by SIGPIPE and close would report a failure that is not an error here.
  close IN1;
  close IN2;

  close OUT or die $!;

  print_final_analysis_report_paired_ends($C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2);

}
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1435
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1436 sub check_bowtie_results_single_end{
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1437 my ($sequence,$identifier,$quality_value) = @_;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1438
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1439 unless ($quality_value){ # FastA sequences get assigned a quality value of Phred 40 throughout
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1440 $quality_value = 'I'x(length$sequence);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1441 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1442
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1443 my %mismatches = ();
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1444 ### reading from the bowtie output files to see if this sequence aligned to a bisulfite converted genome
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1445 foreach my $index (0..$#fhs){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1446
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1447 ### skipping this index if the last alignment has been set to undefined already (i.e. end of bowtie output)
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1448 next unless ($fhs[$index]->{last_line} and defined $fhs[$index]->{last_seq_id});
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1449 ### if the sequence we are currently looking at produced an alignment we are doing various things with it
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1450 if ($fhs[$index]->{last_seq_id} eq $identifier) {
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1451 ###############################################################
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1452 ### STEP I Now processing the alignment stored in last_line ###
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1453 ###############################################################
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1454 my $valid_alignment_found_1 = decide_whether_single_end_alignment_is_valid($index,$identifier);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1455 ### sequences can fail at this point if there was only 1 seq in the wrong orientation, or if there were 2 seqs, both in the wrong orientation
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1456 ### we only continue to extract useful information about this alignment if 1 was returned
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1457 if ($valid_alignment_found_1 == 1){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1458 ### Bowtie outputs which made it this far are in the correct orientation, so we can continue to analyse the alignment itself
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1459 ### need to extract the chromosome number from the bowtie output (which is either XY_cf (complete forward) or XY_cr (complete reverse)
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1460 my ($id,$strand,$mapped_chromosome,$position,$bowtie_sequence,$mismatch_info) = (split (/\t/,$fhs[$index]->{last_line},-1))[0,1,2,3,4,7];
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1461
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1462 unless($mismatch_info){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1463 $mismatch_info = '';
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1464 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1465
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1466 chomp $mismatch_info;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1467 my $chromosome;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1468 if ($mapped_chromosome =~ s/_(CT|GA)_converted$//){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1469 $chromosome = $mapped_chromosome;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1470 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1471 else{
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1472 die "Chromosome number extraction failed for $mapped_chromosome\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1473 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1474 ### Now extracting the number of mismatches to the converted genome
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1475 my $number_of_mismatches;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1476 if ($mismatch_info eq ''){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1477 $number_of_mismatches = 0;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1478 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1479 elsif ($mismatch_info =~ /^\d/){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1480 my @mismatches = split (/,/,$mismatch_info);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1481 $number_of_mismatches = scalar @mismatches;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1482 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1483 else{
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1484 die "Something weird is going on with the mismatch field:\t>>> $mismatch_info <<<\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1485 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1486 ### creating a composite location variable from $chromosome and $position and storing the alignment information in a temporary hash table
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1487 my $alignment_location = join (":",$chromosome,$position);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1488 ### If a sequence aligns to exactly the same location twice the sequence does either not contain any C or G, or all the Cs (or Gs on the reverse
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1489 ### strand) were methylated and therefore protected. It is not needed to overwrite the same positional entry with a second entry for the same
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1490 ### location (the genomic sequence extraction and methylation would not be affected by this, only the thing which would change is the index
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1491 ### number for the found alignment)
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1492 unless (exists $mismatches{$number_of_mismatches}->{$alignment_location}){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1493 $mismatches{$number_of_mismatches}->{$alignment_location}->{seq_id}=$id;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1494 $mismatches{$number_of_mismatches}->{$alignment_location}->{bowtie_sequence}=$bowtie_sequence;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1495 $mismatches{$number_of_mismatches}->{$alignment_location}->{index}=$index;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1496 $mismatches{$number_of_mismatches}->{$alignment_location}->{chromosome}=$chromosome;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1497 $mismatches{$number_of_mismatches}->{$alignment_location}->{position}=$position;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1498 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1499 $number_of_mismatches = undef;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1500 ##################################################################################################################################################
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1501 ### STEP II Now reading in the next line from the bowtie filehandle. The next alignment can either be a second alignment of the same sequence or a
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1502 ### new sequence. In either case we will store the next line in @fhs ->{last_line}. In case the alignment is already the next entry, a 0 will
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1503 ### be returned as $valid_alignment_found and it will then be processed in the next round only.
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1504 ##################################################################################################################################################
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1505 my $newline = $fhs[$index]->{fh}-> getline();
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1506 if ($newline){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1507 my ($seq_id) = split (/\t/,$newline);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1508 $fhs[$index]->{last_seq_id} = $seq_id;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1509 $fhs[$index]->{last_line} = $newline;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1510 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1511 else {
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1512 # assigning undef to last_seq_id and last_line and jumping to the next index (end of bowtie output)
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1513 $fhs[$index]->{last_seq_id} = undef;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1514 $fhs[$index]->{last_line} = undef;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1515 next;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1516 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1517 my $valid_alignment_found_2 = decide_whether_single_end_alignment_is_valid($index,$identifier);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1518 ### we only continue to extract useful information about this second alignment if 1 was returned
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1519 if ($valid_alignment_found_2 == 1){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1520 ### If the second Bowtie output made it this far it is in the correct orientation, so we can continue to analyse the alignment itself
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1521 ### need to extract the chromosome number from the bowtie output (which is either XY_cf (complete forward) or XY_cr (complete reverse)
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1522 my ($id,$strand,$mapped_chromosome,$position,$bowtie_sequence,$mismatch_info) = (split (/\t/,$fhs[$index]->{last_line},-1))[0,1,2,3,4,7];
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1523 unless($mismatch_info){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1524 $mismatch_info = '';
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1525 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1526 chomp $mismatch_info;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1527
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1528 my $chromosome;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1529 if ($mapped_chromosome =~ s/_(CT|GA)_converted$//){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1530 $chromosome = $mapped_chromosome;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1531 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1532 else{
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1533 die "Chromosome number extraction failed for $mapped_chromosome\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1534 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1535
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1536 ### Now extracting the number of mismatches to the converted genome
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1537 my $number_of_mismatches;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1538 if ($mismatch_info eq ''){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1539 $number_of_mismatches = 0;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1540 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1541 elsif ($mismatch_info =~ /^\d/){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1542 my @mismatches = split (/,/,$mismatch_info);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1543 $number_of_mismatches = scalar @mismatches;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1544 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1545 else{
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1546 die "Something weird is going on with the mismatch field\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1547 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1548 ### creating a composite location variable from $chromosome and $position and storing the alignment information in a temporary hash table
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1549 ### extracting the chromosome number from the bowtie output (see above)
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1550 my $alignment_location = join (":",$chromosome,$position);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1551 ### In the special case that two differently converted sequences align against differently converted genomes, but to the same position
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1552 ### with the same number of mismatches (or perfect matches), the chromosome, position and number of mismatches are the same. In this
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1553 ### case we are not writing the same entry out a second time.
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1554 unless (exists $mismatches{$number_of_mismatches}->{$alignment_location}){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1555 $mismatches{$number_of_mismatches}->{$alignment_location}->{seq_id}=$id;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1556 $mismatches{$number_of_mismatches}->{$alignment_location}->{bowtie_sequence}=$bowtie_sequence;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1557 $mismatches{$number_of_mismatches}->{$alignment_location}->{index}=$index;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1558 $mismatches{$number_of_mismatches}->{$alignment_location}->{chromosome}=$chromosome;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1559 $mismatches{$number_of_mismatches}->{$alignment_location}->{position}=$position;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1560 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1561 ####################################################################################################################################
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1562 #### STEP III Now reading in one more line which has to be the next alignment to be analysed. Adding it to @fhs ->{last_line} ###
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1563 ####################################################################################################################################
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1564 $newline = $fhs[$index]->{fh}-> getline();
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1565 if ($newline){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1566 my ($seq_id) = split (/\t/,$newline);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1567 die "The same seq ID occurred more than twice in a row\n" if ($seq_id eq $identifier);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1568 $fhs[$index]->{last_seq_id} = $seq_id;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1569 $fhs[$index]->{last_line} = $newline;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1570 next;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1571 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1572 else {
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1573 # assigning undef to last_seq_id and last_line and jumping to the next index (end of bowtie output)
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1574 $fhs[$index]->{last_seq_id} = undef;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1575 $fhs[$index]->{last_line} = undef;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1576 next;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1577 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1578 ### still within the 2nd sequence in correct orientation found
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1579 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1580 ### still within the 1st sequence in correct orientation found
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1581 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1582 ### still within the if (last_seq_id eq identifier) condition
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1583 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1584 ### still within foreach index loop
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1585 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1586 ### if there was not a single alignment found for a certain sequence we will continue with the next sequence in the sequence file
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1587 unless(%mismatches){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1588 $counting{no_single_alignment_found}++;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1589 if ($unmapped){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1590 return 1; ### We will print this sequence out as unmapped sequence if --un unmapped.out has been specified
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1591 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1592 else{
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1593 return;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1594 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1595 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1596 #######################################################################################################################################################
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1597 #######################################################################################################################################################
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1598 ### We are now looking if there is a unique best alignment for a certain sequence. This means we are sorting in ascending order and look at the ###
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1599 ### sequence with the lowest amount of mismatches. If there is only one single best position we are going to store the alignment information in the ###
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1600 ### meth_call variables, if there are multiple hits with the same amount of (lowest) mismatches we are discarding the sequence altogether ###
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1601 #######################################################################################################################################################
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1602 #######################################################################################################################################################
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1603 ### Going to use the variable $sequence_fails as a 'memory' if a sequence could not be aligned uniquely (set to 1 then)
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1604 my $sequence_fails = 0;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1605 ### Declaring an empty hash reference which will store all information we need for the methylation call
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1606 my $methylation_call_params; # hash reference!
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1607 ### sorting in ascending order
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1608 foreach my $mismatch_number (sort {$a<=>$b} keys %mismatches){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1609
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1610 ### if there is only 1 entry in the hash with the lowest number of mismatches we accept it as the best alignment
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1611 if (scalar keys %{$mismatches{$mismatch_number}} == 1){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1612 for my $unique_best_alignment (keys %{$mismatches{$mismatch_number}}){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1613 $methylation_call_params->{$identifier}->{bowtie_sequence} = $mismatches{$mismatch_number}->{$unique_best_alignment}->{bowtie_sequence};
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1614 $methylation_call_params->{$identifier}->{chromosome} = $mismatches{$mismatch_number}->{$unique_best_alignment}->{chromosome};
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1615 $methylation_call_params->{$identifier}->{position} = $mismatches{$mismatch_number}->{$unique_best_alignment}->{position};
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1616 $methylation_call_params->{$identifier}->{index} = $mismatches{$mismatch_number}->{$unique_best_alignment}->{index};
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1617 $methylation_call_params->{$identifier}->{number_of_mismatches} = $mismatch_number;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1618 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1619 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1620 elsif (scalar keys %{$mismatches{$mismatch_number}} == 3){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1621 ### If there are 3 sequences with the same number of lowest mismatches we can discriminate 2 cases: (i) all 3 alignments are unique best hits and
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1622 ### come from different alignment processes (== indices) or (ii) one sequence alignment (== index) will give a unique best alignment, whereas a
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1623 ### second one will produce 2 (or potentially many) alignments for the same sequence but in a different conversion state or against a different genome
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1624 ### version (or both). This becomes especially relevant for highly converted sequences in which all Cs have been converted to Ts in the bisulfite
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1625 ### reaction. E.g.
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1626 ### CAGTCACGCGCGCGCG will become
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1627 ### TAGTTATGTGTGTGTG in the CT transformed version, which will ideally still give the correct alignment in the CT->CT alignment condition.
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1628 ### If the same read will then become G->A transformed as well however, the resulting sequence will look different and potentially behave
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1629 ### differently in a GA->GA alignment and this depends on the methylation state of the original sequence!:
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1630 ### G->A conversion:
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1631 ### highly methylated: CAATCACACACACACA
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1632 ### highly converted : TAATTATATATATATA <== this sequence has a reduced complexity (only 2 bases left and not 3), and it is more likely to produce
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1633 ### an alignment with a low complexity genomic region than the one above. This would normally lead to the entire sequence being kicked out as
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1634 ### there will be 3 alignments with the same number of lowest mismatches!! This in turn means that highly methylated and thereby not converted
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1635 ### sequences are more likely to pass the alignment step, thereby creating a bias for methylated reads compared to their non-methylated counterparts.
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1636 ### We do not want any bias, whatsoever. Therefore if we have 1 sequence producing a unique best alignment and the second and third conditions
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1637 ### producing alignments only after performing an additional (theoretical) conversion we want to keep the best alignment with the lowest number of
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1638 ### additional transliterations performed. Thus we want to have a look at the level of complexity of the sequences producing the alignment.
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1639 ### In the above example the number of transliterations required to transform the actual sequence
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1640 ### to the C->T version would be TAGTTATGTGTGTGTG -> TAGTTATGTGTGTGTG = 0; (assuming this gives the correct alignment)
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1641 ### in the G->A case it would be TAGTTATGTGTGTGTG -> TAATTATATATATATA = 6; (assuming this gives multiple wrong alignments)
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1642 ### if the sequence giving a unique best alignment required a lower number of transliterations than the second best sequence yielding alignments
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1643 ### while requiring a much higher number of transliterations, we are going to accept the unique best alignment with the lowest number of performed
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1644 ### transliterations. As a threshold which does scale we will start with the number of transliterations of the lowest best match x 2 must still be
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1645 ### smaller than the number of transliterations of the second best sequence. Everything else will be flagged with $sequence_fails = 1 and discarded.
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1646 my @three_candidate_seqs;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1647 foreach my $composite_location (keys (%{$mismatches{$mismatch_number}}) ){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1648 my $transliterations_performed;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1649 if ($mismatches{$mismatch_number}->{$composite_location}->{index} == 0 or $mismatches{$mismatch_number}->{$composite_location}->{index} == 1){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1650 $transliterations_performed = determine_number_of_transliterations_performed($sequence,'CT');
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1651 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1652 elsif ($mismatches{$mismatch_number}->{$composite_location}->{index} == 2 or $mismatches{$mismatch_number}->{$composite_location}->{index} == 3){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1653 $transliterations_performed = determine_number_of_transliterations_performed($sequence,'GA');
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1654 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1655 else{
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1656 die "unexpected index number range $!\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1657 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1658 push @three_candidate_seqs,{
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1659 index =>$mismatches{$mismatch_number}->{$composite_location}->{index},
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1660 bowtie_sequence => $mismatches{$mismatch_number}->{$composite_location}->{bowtie_sequence},
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1661 mismatch_number => $mismatch_number,
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1662 chromosome => $mismatches{$mismatch_number}->{$composite_location}->{chromosome},
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1663 position => $mismatches{$mismatch_number}->{$composite_location}->{position},
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1664 seq_id => $mismatches{$mismatch_number}->{$composite_location}->{seq_id},
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1665 transliterations_performed => $transliterations_performed,
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1666 };
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1667 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1668 ### sorting in ascending order for the lowest number of transliterations performed
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1669 @three_candidate_seqs = sort {$a->{transliterations_performed} <=> $b->{transliterations_performed}} @three_candidate_seqs;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1670 my $first_array_element = $three_candidate_seqs[0]->{transliterations_performed};
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1671 my $second_array_element = $three_candidate_seqs[1]->{transliterations_performed};
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1672 my $third_array_element = $three_candidate_seqs[2]->{transliterations_performed};
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1673 # print "$first_array_element\t$second_array_element\t$third_array_element\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1674 if (($first_array_element*2) < $second_array_element){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1675 $counting{low_complexity_alignments_overruled_count}++;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1676 ### taking the index with the unique best hit and over ruling low complexity alignments with 2 hits
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1677 $methylation_call_params->{$identifier}->{bowtie_sequence} = $three_candidate_seqs[0]->{bowtie_sequence};
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1678 $methylation_call_params->{$identifier}->{chromosome} = $three_candidate_seqs[0]->{chromosome};
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1679 $methylation_call_params->{$identifier}->{position} = $three_candidate_seqs[0]->{position};
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1680 $methylation_call_params->{$identifier}->{index} = $three_candidate_seqs[0]->{index};
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1681 $methylation_call_params->{$identifier}->{number_of_mismatches} = $mismatch_number;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1682 # print "Overruled low complexity alignments! Using $first_array_element and disregarding $second_array_element and $third_array_element\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1683 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1684 else{
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1685 $sequence_fails = 1;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1686 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1687 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1688 else{
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1689 $sequence_fails = 1;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1690 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1691 ### after processing the alignment with the lowest number of mismatches we exit
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1692 last;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1693 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1694 ### skipping the sequence completely if there were multiple alignments with the same amount of lowest mismatches found at different positions
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1695 if ($sequence_fails == 1){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1696 $counting{unsuitable_sequence_count}++;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1697 if ($ambiguous){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1698 return 2; # => exits to next sequence, and prints it out to multiple_alignments.out if --ambiguous has been specified
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1699 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1700 if ($unmapped){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1701 return 1; # => exits to next sequence, and prints it out to unmapped.out if --un has been specified
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1702 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1703 else{
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1704 return 0; # => exits to next sequence (default)
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1705 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1706 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1707
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1708 ### --DIRECTIONAL
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1709 ### If the option --directional has been specified the user wants to consider only alignments to the original top strand or the original bottom strand. We will therefore
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1710 ### discard all alignments to strands complementary to the original strands, as they should not exist in reality due to the library preparation protocol
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1711 if ($directional){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1712 if ( ($methylation_call_params->{$identifier}->{index} == 2) or ($methylation_call_params->{$identifier}->{index} == 3) ){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1713 # warn "Alignment rejected! (index was: $methylation_call_params->{$identifier}->{index})\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1714 $counting{alignments_rejected_count}++;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1715 return 0;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1716 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1717 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1718
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1719 ### If the sequence has not been rejected so far it will have a unique best alignment
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1720 $counting{unique_best_alignment_count}++;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1721 if ($pbat){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1722 extract_corresponding_genomic_sequence_single_end_pbat($identifier,$methylation_call_params);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1723 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1724 else{
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1725 extract_corresponding_genomic_sequence_single_end($identifier,$methylation_call_params);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1726 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1727
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1728 ### check test to see if the genomic sequence we extracted has the same length as the observed sequence+2, and only then we perform the methylation call
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1729 if (length($methylation_call_params->{$identifier}->{unmodified_genomic_sequence}) != length($sequence)+2){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1730 warn "Chromosomal sequence could not be extracted for\t$identifier\t$methylation_call_params->{$identifier}->{chromosome}\t$methylation_call_params->{$identifier}->{position}\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1731 $counting{genomic_sequence_could_not_be_extracted_count}++;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1732 return 0;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1733 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1734
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1735 ### otherwise we are set to perform the actual methylation call
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1736 $methylation_call_params->{$identifier}->{methylation_call} = methylation_call($identifier,$sequence,$methylation_call_params->{$identifier}->{unmodified_genomic_sequence},$methylation_call_params->{$identifier}->{read_conversion});
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1737
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1738 print_bisulfite_mapping_result_single_end($identifier,$sequence,$methylation_call_params,$quality_value);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1739 return 0; ## otherwise 1 will be returned by default, which would print the sequence to unmapped.out
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1740 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1741
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1742 sub check_bowtie_results_single_end_bowtie2{
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1743 my ($sequence,$identifier,$quality_value) = @_;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1744
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1745
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1746 unless ($quality_value){ # FastA sequences get assigned a quality value of Phred 40 throughout
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1747 $quality_value = 'I'x(length$sequence);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1748 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1749
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1750 # as of version Bowtie 2 2.0.0 beta7, when input reads are unpaired, Bowtie 2 no longer removes the trailing /1 or /2 from the read name.
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1751 # $identifier =~ s/\/[1234567890]+$//; # some sequencers don't just have /1 or /2 at the end of read IDs
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1752 # print "sequence $sequence\nid $identifier\nquality: '$quality_value'\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1753
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1754 my $alignment_ambiguous = 0;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1755
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1756 my %alignments = ();
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1757
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1758 ### reading from the Bowtie 2 output filehandles
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1759 foreach my $index (0..$#fhs){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1760 # print "Index: $index\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1761 # print "$fhs[$index]->{last_line}\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1762 # print "$fhs[$index]->{last_seq_id}\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1763 # sleep (1);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1764 ### skipping this index if the last alignment has been set to undefined already (i.e. end of bowtie output)
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1765 next unless ($fhs[$index]->{last_line} and defined $fhs[$index]->{last_seq_id});
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1766
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1767 ### if the sequence we are currently looking at produced an alignment we are doing various things with it
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1768 # print "last seq id: $fhs[$index]->{last_seq_id} and identifier: $identifier\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1769
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1770 if ($fhs[$index]->{last_seq_id} eq $identifier) {
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1771 # SAM format specifications for Bowtie 2
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1772 # (1) Name of read that aligned
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1773 # (2) Sum of all applicable flags. Flags relevant to Bowtie are:
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1774 # 1 The read is one of a pair
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1775 # 2 The alignment is one end of a proper paired-end alignment
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1776 # 4 The read has no reported alignments
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1777 # 8 The read is one of a pair and has no reported alignments
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1778 # 16 The alignment is to the reverse reference strand
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1779 # 32 The other mate in the paired-end alignment is aligned to the reverse reference strand
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1780 # 64 The read is mate 1 in a pair
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1781 # 128 The read is mate 2 in a pair
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1782 # 256 The read has multiple mapping states
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1783 # (3) Name of reference sequence where alignment occurs (unmapped reads have a *)
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1784 # (4) 1-based offset into the forward reference strand where leftmost character of the alignment occurs (0 for unmapped reads)
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1785 # (5) Mapping quality (255 means MAPQ is not available)
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1786 # (6) CIGAR string representation of alignment (* if unavailable)
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1787 # (7) Name of reference sequence where mate's alignment occurs. Set to = if the mate's reference sequence is the same as this alignment's, or * if there is no mate.
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1788 # (8) 1-based offset into the forward reference strand where leftmost character of the mate's alignment occurs. Offset is 0 if there is no mate.
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1789 # (9) Inferred fragment size. Size is negative if the mate's alignment occurs upstream of this alignment. Size is 0 if there is no mate.
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1790 # (10) Read sequence (reverse-complemented if aligned to the reverse strand)
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1791 # (11) ASCII-encoded read qualities (reverse-complemented if the read aligned to the reverse strand). The encoded quality values are on the Phred quality scale and the encoding is ASCII-offset by 33 (ASCII char !), similarly to a FASTQ file.
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1792 # (12) Optional fields. Fields are tab-separated. bowtie2 outputs zero or more of these optional fields for each alignment, depending on the type of the alignment:
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1793 # AS:i:<N> Alignment score. Can be negative. Can be greater than 0 in --local mode (but not in --end-to-end mode). Only present if SAM record is for an aligned read.
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1794 # XS:i:<N> Alignment score for second-best alignment. Can be negative. Can be greater than 0 in --local mode (but not in --end-to-end mode). Only present if the SAM record is for an aligned read and more than one alignment was found for the read.
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1795 # YS:i:<N> Alignment score for opposite mate in the paired-end alignment. Only present if the SAM record is for a read that aligned as part of a paired-end alignment.
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1796 # XN:i:<N> The number of ambiguous bases in the reference covering this alignment. Only present if SAM record is for an aligned read.
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1797 # XM:i:<N> The number of mismatches in the alignment. Only present if SAM record is for an aligned read.
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1798 # XO:i:<N> The number of gap opens, for both read and reference gaps, in the alignment. Only present if SAM record is for an aligned read.
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1799 # XG:i:<N> The number of gap extensions, for both read and reference gaps, in the alignment. Only present if SAM record is for an aligned read.
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1800 # NM:i:<N> The edit distance; that is, the minimal number of one-nucleotide edits (substitutions, insertions and deletions) needed to transform the read string into the reference string. Only present if SAM record is for an aligned read.
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1801 # YF:Z:<N> String indicating reason why the read was filtered out. See also: Filtering. Only appears for reads that were filtered out.
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1802 # MD:Z:<S> A string representation of the mismatched reference bases in the alignment. See SAM format specification for details. Only present if SAM record is for an aligned read.
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1803
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1804 my ($id,$flag,$mapped_chromosome,$position,$mapping_quality,$cigar,$bowtie_sequence,$qual) = (split (/\t/,$fhs[$index]->{last_line}))[0,1,2,3,4,5,9,10];
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1805
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1806 ### If a sequence has no reported alignments there will be a single output line with a bit-wise flag value of 4. We can store the next alignment and move on to the next Bowtie 2 instance
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1807 if ($flag == 4){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1808 ## reading in the next alignment, which must be the next sequence
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1809 my $newline = $fhs[$index]->{fh}-> getline();
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1810 if ($newline){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1811 chomp $newline;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1812 my ($seq_id) = split (/\t/,$newline);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1813 $fhs[$index]->{last_seq_id} = $seq_id;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1814 $fhs[$index]->{last_line} = $newline;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1815 if ($seq_id eq $identifier){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1816 die "Sequence with ID $identifier did not produce any alignment, but next seq-ID was also $fhs[$index]->{last_seq_id}!\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1817 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1818 next; # next instance
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1819 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1820 else{
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1821 # assigning undef to last_seq_id and last_line and jumping to the next index (end of Bowtie 2 output)
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1822 $fhs[$index]->{last_seq_id} = undef;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1823 $fhs[$index]->{last_line} = undef;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1824 next;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1825 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1826 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1827
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1828 # if there are one or more proper alignments we can extract the chromosome number
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1829 my $chromosome;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1830 if ($mapped_chromosome =~ s/_(CT|GA)_converted$//){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1831 $chromosome = $mapped_chromosome;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1832 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1833 else{
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1834 die "Chromosome number extraction failed for $mapped_chromosome\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1835 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1836
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1837 ### We will use the optional field to determine the best alignment. Later on we extract the number of mismatches and/or indels from the CIGAR string
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1838 my ($alignment_score,$second_best,$MD_tag);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1839 my @fields = split (/\t/,$fhs[$index]->{last_line});
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1840
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1841 foreach (11..$#fields){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1842 if ($fields[$_] =~ /AS:i:(.*)/){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1843 $alignment_score = $1;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1844 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1845 elsif ($fields[$_] =~ /XS:i:(.*)/){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1846 $second_best = $1;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1847 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1848 elsif ($fields[$_] =~ /MD:Z:(.*)/){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1849 $MD_tag = $1;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1850 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1851 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1852
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1853 # warn "First best alignment_score is: '$alignment_score'\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1854 # warn "MD tag is: '$MD_tag'\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1855 die "Failed to extract alignment score ($alignment_score) and MD tag ($MD_tag)!\n" unless (defined $alignment_score and defined $MD_tag);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1856
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1857 if (defined $second_best){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1858 # warn "second best alignment_score is: '$second_best'\n\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1859
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1860 # If the first alignment score is the same as the alignment score of the second best hit we are going to boot this sequence altogether
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1861 if ($alignment_score == $second_best){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1862 $alignment_ambiguous = 1;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1863 ## need to read and discard all additional ambiguous reads until we reach the next sequence
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1864 until ($fhs[$index]->{last_seq_id} ne $identifier){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1865 my $newline = $fhs[$index]->{fh}-> getline();
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1866 if ($newline){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1867 chomp $newline;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1868 my ($seq_id) = split (/\t/,$newline);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1869 $fhs[$index]->{last_seq_id} = $seq_id;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1870 $fhs[$index]->{last_line} = $newline;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1871 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1872 else{
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1873 # assigning undef to last_seq_id and last_line and jumping to the next index (end of Bowtie 2 output)
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1874 $fhs[$index]->{last_seq_id} = undef;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1875 $fhs[$index]->{last_line} = undef;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1876 last; # break free in case we have reached the end of the alignment output
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1877 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1878 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1879 # warn "Index: $index\tThe current Seq-ID is $identifier, skipped all ambiguous sequences until the next ID which is: $fhs[$index]->{last_seq_id}\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1880 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1881 else{ # the next best alignment has a lower alignment score than the current read, so we can safely store the current alignment
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1882
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1883 my $alignment_location = join (":",$chromosome,$position);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1884
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1885 ### If a sequence aligns to exactly the same location with a perfect match twice the sequence does either not contain any C or G, or all the Cs (or Gs on the reverse
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1886 ### strand) were methylated and therefore protected. Alternatively it will align better in one condition than in the other. In any case, it is not needed to overwrite
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1887 ### the same positional entry with a second entry for the same location, as the genomic sequence extraction and methylation call would not be affected by this. The only
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1888 ### thing which would change is the index number for the found alignment). We will continue to assign these alignments to the first indexes 0 and 1, i.e. OT and OB
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1889
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1890 unless (exists $alignments{$alignment_location}){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1891 $alignments{$alignment_location}->{seq_id} = $id;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1892 $alignments{$alignment_location}->{alignment_score} = $alignment_score;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1893 $alignments{$alignment_location}->{bowtie_sequence} = $bowtie_sequence;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1894 $alignments{$alignment_location}->{index} = $index;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1895 $alignments{$alignment_location}->{chromosome} = $chromosome;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1896 $alignments{$alignment_location}->{position} = $position;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1897 $alignments{$alignment_location}->{CIGAR} = $cigar;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1898 $alignments{$alignment_location}->{MD_tag} = $MD_tag;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1899 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1900
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1901 ### now reading and discarding all (inferior) alignments of this sequencing read until we hit the next sequence
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1902 until ($fhs[$index]->{last_seq_id} ne $identifier){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1903 my $newline = $fhs[$index]->{fh}-> getline();
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1904 if ($newline){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1905 chomp $newline;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1906 my ($seq_id) = split (/\t/,$newline);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1907 $fhs[$index]->{last_seq_id} = $seq_id;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1908 $fhs[$index]->{last_line} = $newline;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1909 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1910 else{
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1911 # assigning undef to last_seq_id and last_line and jumping to the next index (end of Bowtie 2 output)
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1912 $fhs[$index]->{last_seq_id} = undef;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1913 $fhs[$index]->{last_line} = undef;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1914 last; # break free in case we have reached the end of the alignment output
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1915 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1916 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1917 # warn "Index: $index\tThe current Seq-ID is $identifier, skipped all ambiguous sequences until the next ID which is: $fhs[$index]->{last_seq_id}\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1918 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1919 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1920 else{ # there is no second best hit, so we can just store this one and read in the next sequence
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1921
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1922 my $alignment_location = join (":",$chromosome,$position);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1923
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1924 ### If a sequence aligns to exactly the same location with a perfect match twice the sequence does either not contain any C or G, or all the Cs (or Gs on the reverse
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1925 ### strand) were methylated and therefore protected. Alternatively it will align better in one condition than in the other. In any case, it is not needed to overwrite
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1926 ### the same positional entry with a second entry for the same location, as the genomic sequence extraction and methylation call would not be affected by this. The only
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1927 ### thing which would change is the index number for the found alignment). We will continue to assign these alignments to the first indexes 0 and 1, i.e. OT and OB
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1928
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1929 unless (exists $alignments{$alignment_location}){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1930 $alignments{$alignment_location}->{seq_id} = $id;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1931 $alignments{$alignment_location}->{alignment_score} = $alignment_score;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1932 $alignments{$alignment_location}->{bowtie_sequence} = $bowtie_sequence;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1933 $alignments{$alignment_location}->{index} = $index;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1934 $alignments{$alignment_location}->{chromosome} = $chromosome;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1935 $alignments{$alignment_location}->{position} = $position;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1936 $alignments{$alignment_location}->{MD_tag} = $MD_tag;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1937 $alignments{$alignment_location}->{CIGAR} = $cigar;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1938 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1939
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1940 my $newline = $fhs[$index]->{fh}-> getline();
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1941 if ($newline){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1942 chomp $newline;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1943 my ($seq_id) = split (/\t/,$newline);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1944 $fhs[$index]->{last_seq_id} = $seq_id;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1945 $fhs[$index]->{last_line} = $newline;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1946 if ($seq_id eq $identifier){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1947 die "Sequence with ID $identifier did not have a second best alignment, but next seq-ID was also $fhs[$index]->{last_seq_id}!\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1948 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1949 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1950 else{
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1951 # assigning undef to last_seq_id and last_line and jumping to the next index (end of Bowtie 2 output)
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1952 $fhs[$index]->{last_seq_id} = undef;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1953 $fhs[$index]->{last_line} = undef;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1954 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1955 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1956 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1957 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1958
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1959 ### if the read produced several ambiguous alignments we can return already now. If --ambiguous or --unmapped was specified the read sequence will be printed out.
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1960 if ($alignment_ambiguous == 1){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1961 $counting{unsuitable_sequence_count}++;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1962 ### report that the sequence has multiple hits with bitwise flag 256. We can print the sequence to the result file straight away and skip everything else
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1963 # my $ambiguous_read_output = join("\t",$identifier,'256','*','0','0','*','*','0','0',$sequence,$quality_value);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1964 # print "$ambiguous_read_output\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1965
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1966 if ($ambiguous){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1967 return 2; # => exits to next sequence, and prints it out to _ambiguous_reads.txt if '--ambiguous' was specified
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1968 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1969 elsif ($unmapped){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1970 return 1; # => exits to next sequence, and prints it out to _unmapped_reads.txt if '--unmapped' but not '--ambiguous' was specified
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1971 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1972 else{
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1973 return 0;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1974 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1975 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1976
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1977 ### if there was no alignment found for a certain sequence at all we continue with the next sequence in the sequence file
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1978 unless(%alignments){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1979 $counting{no_single_alignment_found}++;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1980 # my $unmapped_read_output = join("\t",$identifier,'4','*','0','0','*','*','0','0',$sequence,$quality_value);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1981 # print "$unmapped_read_output\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1982 if ($unmapped){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1983 return 1; # => exits to next sequence, and prints it out to _unmapped_reads.txt if '--unmapped' was specified
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1984 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1985 else{
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1986 return 0; # default
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1987 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1988 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1989
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1990 #######################################################################################################################################################
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1991
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1992 ### If the sequence was not rejected so far we are now looking if there is a unique best alignment among all alignment instances. If there is only one
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1993 ### single best position we are going to store the alignment information in the $meth_call variable. If there are multiple hits with the same (highest)
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1994 ### alignment score we are discarding the sequence altogether.
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1995 ### For end-to-end alignments the maximum alignment score can be 0, each mismatch can receive penalties up to 6, and each gap receives penalties for
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1996 ### opening (5) and extending (3 per bp) the gap.
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1997
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1998 #######################################################################################################################################################
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1999
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2000 my $methylation_call_params; # hash reference which will store all information we need for the methylation call
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2001 my $sequence_fails = 0; # Going to use $sequence_fails as a 'memory' if a sequence could not be aligned uniquely (set to 1 then)
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2002
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2003 ### print contents of %alignments for debugging
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2004 # if (scalar keys %alignments > 1){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2005 # print "\n******\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2006 # foreach my $alignment_location (sort {$a cmp $b} keys %alignments){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2007 # print "Loc: $alignment_location\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2008 # print "ID: $alignments{$alignment_location}->{seq_id}\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2009 # print "AS: $alignments{$alignment_location}->{alignment_score}\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2010 # print "Seq: $alignments{$alignment_location}->{bowtie_sequence}\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2011 # print "Index $alignments{$alignment_location}->{index}\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2012 # print "Chr: $alignments{$alignment_location}->{chromosome}\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2013 # print "pos: $alignments{$alignment_location}->{position}\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2014 # print "MD: $alignments{$alignment_location}->{MD_tag}\n\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2015 # }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2016 # print "\n******\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2017 # }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2018
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2019 ### if there is only 1 entry in the hash we accept it as the best alignment
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2020 if (scalar keys %alignments == 1){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2021 for my $unique_best_alignment (keys %alignments){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2022 $methylation_call_params->{$identifier}->{bowtie_sequence} = $alignments{$unique_best_alignment}->{bowtie_sequence};
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2023 $methylation_call_params->{$identifier}->{chromosome} = $alignments{$unique_best_alignment}->{chromosome};
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2024 $methylation_call_params->{$identifier}->{position} = $alignments{$unique_best_alignment}->{position};
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2025 $methylation_call_params->{$identifier}->{index} = $alignments{$unique_best_alignment}->{index};
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2026 $methylation_call_params->{$identifier}->{alignment_score} = $alignments{$unique_best_alignment}->{alignment_score};
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2027 $methylation_call_params->{$identifier}->{MD_tag} = $alignments{$unique_best_alignment}->{MD_tag};
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2028 $methylation_call_params->{$identifier}->{CIGAR} = $alignments{$unique_best_alignment}->{CIGAR};
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2029 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2030 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2031
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2032 ### otherwise we are going to find out if there is a best match among the multiple alignments, or whether there are 2 or more equally good alignments (in which case
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2033 ### we boot the sequence altogether
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2034 elsif (scalar keys %alignments >= 2 and scalar keys %alignments <= 4){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2035 my $best_alignment_score;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2036 my $best_alignment_location;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2037 foreach my $alignment_location (sort {$alignments{$b}->{alignment_score} <=> $alignments{$a}->{alignment_score}} keys %alignments){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2038 # print "$alignments{$alignment_location}->{alignment_score}\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2039 unless (defined $best_alignment_score){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2040 $best_alignment_score = $alignments{$alignment_location}->{alignment_score};
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2041 $best_alignment_location = $alignment_location;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2042 # print "setting best alignment score: $best_alignment_score\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2043 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2044 else{
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2045 ### if the second best alignment has the same alignment score as the first one, the sequence will get booted
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2046 if ($alignments{$alignment_location}->{alignment_score} == $best_alignment_score){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2047 # warn "Same alignment score, the sequence will get booted!\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2048 $sequence_fails = 1;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2049 last; # exiting after the second alignment since we know that the sequence has ambiguous alignments
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2050 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2051 ### else we are going to store the best alignment for further processing
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2052 else{
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2053 $methylation_call_params->{$identifier}->{bowtie_sequence} = $alignments{$best_alignment_location}->{bowtie_sequence};
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2054 $methylation_call_params->{$identifier}->{chromosome} = $alignments{$best_alignment_location}->{chromosome};
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2055 $methylation_call_params->{$identifier}->{position} = $alignments{$best_alignment_location}->{position};
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2056 $methylation_call_params->{$identifier}->{index} = $alignments{$best_alignment_location}->{index};
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2057 $methylation_call_params->{$identifier}->{alignment_score} = $alignments{$best_alignment_location}->{alignment_score};
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2058 $methylation_call_params->{$identifier}->{MD_tag} = $alignments{$best_alignment_location}->{MD_tag};
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2059 $methylation_call_params->{$identifier}->{CIGAR} = $alignments{$best_alignment_location}->{CIGAR};
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2060 last; # exiting after processing the second alignment since the sequence produced a unique best alignment
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2061 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2062 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2063 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2064 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2065 else{
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2066 die "There are too many potential hits for this sequence (1-4 expected, but found: ",scalar keys %alignments,")\n";;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2067 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2068
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2069 ### skipping the sequence completely if there were multiple alignments with the same best alignment score at different positions
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2070 if ($sequence_fails == 1){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2071 $counting{unsuitable_sequence_count}++;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2072
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2073 ### report that the sequence has multiple hits with bitwise flag 256. We can print the sequence to the result file straight away and skip everything else
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2074 # my $ambiguous_read_output = join("\t",$identifier,'256','*','0','0','*','*','0','0',$sequence,$quality_value);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2075 # print OUT "$ambiguous_read_output\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2076
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2077 if ($ambiguous){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2078 return 2; # => exits to next sequence, and prints it out (in FastQ format) to _ambiguous_reads.txt if '--ambiguous' was specified
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2079 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2080 elsif ($unmapped){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2081 return 1; # => exits to next sequence, and prints it out (in FastQ format) to _unmapped_reads.txt if '--unmapped' but not '--ambiguous' was specified
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2082 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2083 else{
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2084 return 0; # => exits to next sequence (default)
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2085 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2086 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2087
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2088 ### --DIRECTIONAL
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2089 ### If the option --directional has been specified the user wants to consider only alignments to the original top strand or the original bottom strand. We will therefore
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2090 ### discard all alignments to strands complementary to the original strands, as they should not exist in reality due to the library preparation protocol
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2091 if ($directional){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2092 if ( ($methylation_call_params->{$identifier}->{index} == 2) or ($methylation_call_params->{$identifier}->{index} == 3) ){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2093 # warn "Alignment rejected! (index was: $methylation_call_params->{$identifier}->{index})\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2094 $counting{alignments_rejected_count}++;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2095 return 0;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2096 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2097 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2098
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2099 ### If the sequence has not been rejected so far it has a unique best alignment
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2100 $counting{unique_best_alignment_count}++;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2101
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2102 ### Now we need to extract a genomic sequence that exactly corresponds to the reported alignment. This potentially means that we need to deal with insertions or deletions as well
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2103 extract_corresponding_genomic_sequence_single_end_bowtie2 ($identifier,$methylation_call_params);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2104
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2105 ### check test to see if the genomic sequence we extracted has the same length as the observed sequence+2, and only then we perform the methylation call
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2106 if (length($methylation_call_params->{$identifier}->{unmodified_genomic_sequence}) != length($sequence)+2){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2107 warn "Chromosomal sequence could not be extracted for\t$identifier\t$methylation_call_params->{$identifier}->{chromosome}\t$methylation_call_params->{$identifier}->{position}\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2108 $counting{genomic_sequence_could_not_be_extracted_count}++;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2109 return 0;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2110 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2111
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2112
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2113 ### otherwise we are set to perform the actual methylation call
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2114 $methylation_call_params->{$identifier}->{methylation_call} = methylation_call($identifier,$sequence,$methylation_call_params->{$identifier}->{unmodified_genomic_sequence},$methylation_call_params->{$identifier}->{read_conversion});
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2115 print_bisulfite_mapping_result_single_end_bowtie2 ($identifier,$sequence,$methylation_call_params,$quality_value);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2116 return 0; ## if a sequence got this far we do not want to print it to unmapped or ambiguous.out
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2117 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2118
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2119
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
### Counts the bases of a read that are affected by a bisulfite read conversion.
###
### Arguments:
###   $sequence        - the read sequence; only the local copy is inspected, the caller's string is never changed
###   $read_conversion - the conversion mode, either 'CT' (upper-case Cs are counted) or 'GA' (upper-case Gs are counted)
###
### Returns the number of matching characters in $sequence (0 if there are none).
### Dies if $read_conversion is undefined or is neither 'CT' nor 'GA'.
sub determine_number_of_transliterations_performed{
  my ($sequence,$read_conversion) = @_;

  # An undefined conversion mode is turned into a printable placeholder so that the string comparisons
  # below do not emit 'uninitialized value' warnings before we reach the die statement
  my $mode = defined $read_conversion ? $read_conversion : 'undef';

  # tr/// with an empty replacement list and no modifiers only counts the matching characters,
  # it does not alter the string
  my $number_of_transliterations;
  if ($mode eq 'CT'){
    $number_of_transliterations = ($sequence =~ tr/C//);
  }
  elsif ($mode eq 'GA'){
    $number_of_transliterations = ($sequence =~ tr/G//);
  }
  else{
    # $! is deliberately not part of the message: no system call failed here, so it would only
    # report a stale and unrelated OS error. The offending value is reported instead
    die "Read conversion mode of the read was not specified (got: '$mode')\n";
  }
  return $number_of_transliterations;
}
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2134
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
sub decide_whether_single_end_alignment_is_valid{
  ### Decides whether the Bowtie 1 alignment buffered in $fhs[$index] can be used for the read $identifier.
  ###
  ### Arguments:    $index      - position in @fhs of the alignment output to look at
  ###               $identifier - ID of the read which is currently being processed
  ### Returns:      1 if last_line (either the buffered one or a line freshly read in here) belongs to $identifier and
  ###               ensure_sensical_alignment_orientation_single_end returned 1 for it, and 0 in all other cases
  ### Side effects: reads up to two further lines from $fhs[$index]->{fh}; whenever a line was read in, last_seq_id and
  ###               last_line are overwritten with it (both are set to undef once the end of the output is reached)
  ### Dies if the orientation check returns something other than 1 or 0, or if a third line in a row has the same seq ID
  my ($index,$identifier) = @_;

  ### small helper so that last_seq_id and last_line are always updated together
  my $store_entry = sub {
    my ($seq_id,$line) = @_;
    $fhs[$index]->{last_seq_id} = $seq_id;
    $fhs[$index]->{last_line} = $line;
  };

  # extracting from Bowtie 1 format
  my ($id,$strand) = (split (/\t/,$fhs[$index]->{last_line}))[0,1];

  ### the sequence stored in @fhs as last_line is already the next sequence to be analysed -> analyse next round
  return 0 unless (($id eq $fhs[$index]->{last_seq_id}) and ($id eq $identifier));

  ### checking the orientation of the alignment. We need to discriminate between 8 different conditions, however only 4 of them are theoretically
  ### sensible alignments
  my $orientation = ensure_sensical_alignment_orientation_single_end ($index,$strand);
  return 1 if ($orientation == 1); ### 1st possibility for a sequence to pass
  die "The orientation of the alignment must be either correct or incorrect\n" unless ($orientation == 0);

  ### The alignment was in the wrong orientation so we need to read in a new line
  my $newline = $fhs[$index]->{fh}->getline();
  unless ($newline){
    # assigning undef to last_seq_id and last_line (end of bowtie output)
    $store_entry->(undef,undef);
    return 0; # not processing anything as the alignment currently stored in last_line was in the wrong orientation
  }
  ($id,$strand) = (split (/\t/,$newline))[0,1];

  ### the sequence we just read in is already the next sequence to be analysed -> store it in @fhs
  unless ($id eq $identifier){
    $store_entry->($id,$newline);
    return 0; # processing the new alignment result only in the next round
  }

  ### the new line is a second alignment of the same sequence, so we are checking the orientation again
  $orientation = ensure_sensical_alignment_orientation_single_end ($index,$strand);
  if ($orientation == 1){
    $store_entry->($id,$newline);
    return 1; ### 2nd possibility for a sequence to pass
  }
  die "The orientation of the alignment must be either correct or incorrect\n" unless ($orientation == 0);

  ### The alignment was in the wrong orientation again so we need to read in yet another new line and store it in @fhs
  $newline = $fhs[$index]->{fh}->getline();
  unless ($newline){
    # assigning undef to last_seq_id and last_line (end of bowtie output)
    $store_entry->(undef,undef);
    return 0; # not processing anything as the alignment currently stored in last_line was in the wrong orientation
  }

  ### check if the next line still has the same seq ID (must not happen), and if not overwrite the current seq-ID and bowtie output with
  ### the same fields of the just read next entry. $! is not printed here as no system call failed, it would only add an unrelated error text
  my ($seq_id) = split (/\t/,$newline);
  die "Same seq ID 3 or more times in a row!(should be 2 max)" if ($seq_id eq $identifier);
  $store_entry->($seq_id,$newline);
  return 0; # not processing anything this round as both alignments of the current sequence were in the wrong orientation
}
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2212 #########################
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2213 ### BOWTIE 1 | PAIRED-END
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2214 #########################
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2215
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2216 sub check_bowtie_results_paired_ends{
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2217 my ($sequence_1,$sequence_2,$identifier,$quality_value_1,$quality_value_2) = @_;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2218
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2219 ### quality values are not given for FastA files, so they are initialised with a Phred quality of 40
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2220 unless ($quality_value_1){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2221 $quality_value_1 = 'I'x(length$sequence_1);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2222 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2223 unless ($quality_value_2){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2224 $quality_value_2 = 'I'x(length$sequence_2);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2225 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2226
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2227 # warn "$identifier\n$fhs[0]->{last_seq_id}\n$fhs[1]->{last_seq_id}\n$fhs[2]->{last_seq_id}\n$fhs[3]->{last_seq_id}\n\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2228 # sleep (1);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2229 my %mismatches = ();
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2230 ### reading from the bowtie output files to see if this sequence pair aligned to a bisulfite converted genome
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2231
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2232
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2233 ### for paired end reads we are reporting alignments to the OT strand first (index 0), then the OB strand (index 3!!), similiar to the single end way.
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2234 ### alignments to the complementary strands are reported afterwards (CTOT got index 1, and CTOB got index 2).
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2235 ### This is needed so that alignments which either contain no single C or G or reads which contain only protected Cs are reported to the original strands (OT and OB)
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2236 ### Before the complementary strands. Remember that it does not make any difference for the methylation calls, but it will matter if alignment to the complementary
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2237 ### strands are not being reported by specifying --directional
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2238
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2239 foreach my $index (0,3,1,2){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2240 ### skipping this index if the last alignment has been set to undefined already (i.e. end of bowtie output)
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2241 next unless ($fhs[$index]->{last_line_1} and $fhs[$index]->{last_line_2} and defined $fhs[$index]->{last_seq_id});
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2242 ### if the sequence pair we are currently looking at produced an alignment we are doing various things with it
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2243 if ($fhs[$index]->{last_seq_id} eq $identifier) {
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2244 # print "$identifier\n$fhs[$index]->{last_seq_id}\n\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2245
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2246 ##################################################################################
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2247 ### STEP I Processing the entry which is stored in last_line_1 and last_line_2 ###
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2248 ##################################################################################
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2249 my $valid_alignment_found = decide_whether_paired_end_alignment_is_valid($index,$identifier);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2250 ### sequences can fail at this point if there was only 1 alignment in the wrong orientation, or if there were 2 aligments both in the wrong
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2251 ### orientation. We only continue to extract useful information about this alignment if 1 was returned
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2252 if ($valid_alignment_found == 1){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2253 ### Bowtie outputs which made it this far are in the correct orientation, so we can continue to analyse the alignment itself.
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2254 ### we store the useful information in %mismatches
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2255 my ($id_1,$strand_1,$mapped_chromosome_1,$position_1,$bowtie_sequence_1,$mismatch_info_1) = (split (/\t/,$fhs[$index]->{last_line_1},-1))[0,1,2,3,4,7];
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2256 my ($id_2,$strand_2,$mapped_chromosome_2,$position_2,$bowtie_sequence_2,$mismatch_info_2) = (split (/\t/,$fhs[$index]->{last_line_2},-1))[0,1,2,3,4,7];
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2257 chomp $mismatch_info_1;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2258 chomp $mismatch_info_2;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2259
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2260 ### need to extract the chromosome number from the bowtie output (which is either XY_CT_converted or XY_GA_converted
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2261 my ($chromosome_1,$chromosome_2);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2262 if ($mapped_chromosome_1 =~ s/_(CT|GA)_converted$//){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2263 $chromosome_1 = $mapped_chromosome_1;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2264 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2265 else{
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2266 die "Chromosome number extraction failed for $mapped_chromosome_1\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2267 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2268 if ($mapped_chromosome_2 =~ s/_(CT|GA)_converted$//){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2269 $chromosome_2 = $mapped_chromosome_2;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2270 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2271 else{
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2272 die "Chromosome number extraction failed for $mapped_chromosome_2\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2273 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2274
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2275 ### Now extracting the number of mismatches to the converted genome
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2276 my $number_of_mismatches_1;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2277 my $number_of_mismatches_2;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2278 if ($mismatch_info_1 eq ''){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2279 $number_of_mismatches_1 = 0;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2280 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2281 elsif ($mismatch_info_1 =~ /^\d/){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2282 my @mismatches = split (/,/,$mismatch_info_1);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2283 $number_of_mismatches_1 = scalar @mismatches;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2284 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2285 else{
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2286 die "Something weird is going on with the mismatch field\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2287 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2288 if ($mismatch_info_2 eq ''){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2289 $number_of_mismatches_2 = 0;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2290 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2291 elsif ($mismatch_info_2 =~ /^\d/){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2292 my @mismatches = split (/,/,$mismatch_info_2);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2293 $number_of_mismatches_2 = scalar @mismatches;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2294 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2295 else{
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2296 die "Something weird is going on with the mismatch field\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2297 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2298 ### To decide whether a sequence pair has a unique best alignment we will look at the lowest sum of mismatches from both alignments
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2299 my $sum_of_mismatches = $number_of_mismatches_1+$number_of_mismatches_2;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2300 ### creating a composite location variable from $chromosome and $position and storing the alignment information in a temporary hash table
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2301 die "Position 1 is higher than position 2" if ($position_1 > $position_2);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2302 die "Paired-end alignments need to be on the same chromosome\n" unless ($chromosome_1 eq $chromosome_2);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2303 my $alignment_location = join(":",$chromosome_1,$position_1,$position_2);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2304 ### If a sequence aligns to exactly the same location twice the sequence does either not contain any C or G, or all the Cs (or Gs on the reverse
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2305 ### strand) were methylated and therefore protected. It is not needed to overwrite the same positional entry with a second entry for the same
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2306 ### location (the genomic sequence extraction and methylation would not be affected by this, only the thing which would change is the index
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2307 ### number for the found alignment)
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2308 unless (exists $mismatches{$sum_of_mismatches}->{$alignment_location}){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2309 $mismatches{$sum_of_mismatches}->{$alignment_location}->{seq_id}=$id_1; # either is fine
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2310 $mismatches{$sum_of_mismatches}->{$alignment_location}->{bowtie_sequence_1}=$bowtie_sequence_1;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2311 $mismatches{$sum_of_mismatches}->{$alignment_location}->{bowtie_sequence_2}=$bowtie_sequence_2;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2312 $mismatches{$sum_of_mismatches}->{$alignment_location}->{index}=$index;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2313 $mismatches{$sum_of_mismatches}->{$alignment_location}->{chromosome}=$chromosome_1; # either is fine
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2314 $mismatches{$sum_of_mismatches}->{$alignment_location}->{start_seq_1}=$position_1;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2315 $mismatches{$sum_of_mismatches}->{$alignment_location}->{start_seq_2}=$position_2;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2316 $mismatches{$sum_of_mismatches}->{$alignment_location}->{number_of_mismatches_1} = $number_of_mismatches_1;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2317 $mismatches{$sum_of_mismatches}->{$alignment_location}->{number_of_mismatches_2} = $number_of_mismatches_2;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2318 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2319 ###################################################################################################################################################
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2320 ### STEP II Now reading in the next 2 lines from the bowtie filehandle. If there are 2 next lines in the alignments filehandle it can either ###
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2321 ### be a second alignment of the same sequence pair or a new sequence pair. In any case we will just add it to last_line_1 and last_line _2. ###
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2322 ### If it is the alignment of the next sequence pair, 0 will be returned as $valid_alignment_found, so it will not be processed any further in ###
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2323 ### this round ###
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2324 ###################################################################################################################################################
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2325 my $newline_1 = $fhs[$index]->{fh}-> getline();
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2326 my $newline_2 = $fhs[$index]->{fh}-> getline();
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2327
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2328 if ($newline_1 and $newline_2){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2329 my ($seq_id_1) = split (/\t/,$newline_1);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2330 my ($seq_id_2) = split (/\t/,$newline_2);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2331
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2332 if ($seq_id_1 =~ s/\/1$//){ # removing the read /1 tag
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2333 $fhs[$index]->{last_seq_id} = $seq_id_1;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2334 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2335 elsif ($seq_id_2 =~ s/\/1$//){ # removing the read /1 tag
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2336 $fhs[$index]->{last_seq_id} = $seq_id_2;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2337 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2338 else{
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2339 die "Either read 1 or read 2 needs to end on '/1'\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2340 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2341
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2342 $fhs[$index]->{last_line_1} = $newline_1;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2343 $fhs[$index]->{last_line_2} = $newline_2;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2344 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2345 else {
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2346 # assigning undef to last_seq_id and both last_lines and jumping to the next index (end of bowtie output)
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2347 $fhs[$index]->{last_seq_id} = undef;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2348 $fhs[$index]->{last_line_1} = undef;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2349 $fhs[$index]->{last_line_2} = undef;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2350 next; # jumping to the next index
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2351 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2352 ### Now processing the entry we just stored in last_line_1 and last_line_2
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2353 $valid_alignment_found = decide_whether_paired_end_alignment_is_valid($index,$identifier);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2354 ### only processing the alignment further if 1 was returned. 0 will be returned either if the alignment is already the next sequence pair to
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2355 ### be analysed or if it was a second alignment of the current sequence pair but in the wrong orientation
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2356 if ($valid_alignment_found == 1){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2357 ### we store the useful information in %mismatches
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2358 ($id_1,$strand_1,$mapped_chromosome_1,$position_1,$bowtie_sequence_1,$mismatch_info_1) = (split (/\t/,$fhs[$index]->{last_line_1}))[0,1,2,3,4,7];
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2359 ($id_2,$strand_2,$mapped_chromosome_2,$position_2,$bowtie_sequence_2,$mismatch_info_2) = (split (/\t/,$fhs[$index]->{last_line_2}))[0,1,2,3,4,7];
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2360 chomp $mismatch_info_1;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2361 chomp $mismatch_info_2;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2362 ### need to extract the chromosome number from the bowtie output (which is either _CT_converted or _GA_converted)
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2363 if ($mapped_chromosome_1 =~ s/_(CT|GA)_converted$//){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2364 $chromosome_1 = $mapped_chromosome_1;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2365 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2366 else{
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2367 die "Chromosome number extraction failed for $mapped_chromosome_1\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2368 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2369 if ($mapped_chromosome_2 =~ s/_(CT|GA)_converted$//){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2370 $chromosome_2 = $mapped_chromosome_2;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2371 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2372 else{
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2373 die "Chromosome number extraction failed for $mapped_chromosome_2\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2374 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2375
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2376 $number_of_mismatches_1='';
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2377 $number_of_mismatches_2='';
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2378 ### Now extracting the number of mismatches to the converted genome
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2379 if ($mismatch_info_1 eq ''){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2380 $number_of_mismatches_1 = 0;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2381 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2382 elsif ($mismatch_info_1 =~ /^\d/){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2383 my @mismatches = split (/,/,$mismatch_info_1);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2384 $number_of_mismatches_1 = scalar @mismatches;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2385 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2386 else{
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2387 die "Something weird is going on with the mismatch field\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2388 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2389 if ($mismatch_info_2 eq ''){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2390 $number_of_mismatches_2 = 0;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2391 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2392 elsif ($mismatch_info_2 =~ /^\d/){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2393 my @mismatches = split (/,/,$mismatch_info_2);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2394 $number_of_mismatches_2 = scalar @mismatches;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2395 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2396 else{
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2397 die "Something weird is going on with the mismatch field\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2398 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2399 ### To decide whether a sequence pair has a unique best alignment we will look at the lowest sum of mismatches from both alignments
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2400 $sum_of_mismatches = $number_of_mismatches_1+$number_of_mismatches_2;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2401 ### creating a composite location variable from $chromosome and $position and storing the alignment information in a temporary hash table
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2402 die "position 1 is greater than position 2" if ($position_1 > $position_2);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2403 die "Paired-end alignments need to be on the same chromosome\n" unless ($chromosome_1 eq $chromosome_2);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2404 $alignment_location = join(":",$chromosome_1,$position_1,$position_2);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2405 ### If a sequence aligns to exactly the same location twice the sequence does either not contain any C or G, or all the Cs (or Gs on the reverse
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2406 ### strand) were methylated and therefore protected. It is not needed to overwrite the same positional entry with a second entry for the same
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2407 ### location (the genomic sequence extraction and methylation would not be affected by this, only the thing which would change is the index
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2408 ### number for the found alignment)
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2409 unless (exists $mismatches{$sum_of_mismatches}->{$alignment_location}){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2410 $mismatches{$sum_of_mismatches}->{$alignment_location}->{seq_id}=$id_1; # either is fine
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2411 $mismatches{$sum_of_mismatches}->{$alignment_location}->{bowtie_sequence_1}=$bowtie_sequence_1;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2412 $mismatches{$sum_of_mismatches}->{$alignment_location}->{bowtie_sequence_2}=$bowtie_sequence_2;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2413 $mismatches{$sum_of_mismatches}->{$alignment_location}->{index}=$index;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2414 $mismatches{$sum_of_mismatches}->{$alignment_location}->{chromosome}=$chromosome_1; # either is fine
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2415 $mismatches{$sum_of_mismatches}->{$alignment_location}->{start_seq_1}=$position_1;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2416 $mismatches{$sum_of_mismatches}->{$alignment_location}->{start_seq_2}=$position_2;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2417 $mismatches{$sum_of_mismatches}->{$alignment_location}->{number_of_mismatches_1} = $number_of_mismatches_1;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2418 $mismatches{$sum_of_mismatches}->{$alignment_location}->{number_of_mismatches_2} = $number_of_mismatches_2;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2419 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2420 ###############################################################################################################################################
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2421 ### STEP III Now reading in two more lines. These have to be the next entry and we will just assign them to last_line_1 and last_line_2     ###
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2422 ###############################################################################################################################################
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2423 $newline_1 = $fhs[$index]->{fh}-> getline();
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2424 $newline_2 = $fhs[$index]->{fh}-> getline();
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2425
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2426 if ($newline_1 and $newline_2){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2427 my ($seq_id_1) = split (/\t/,$newline_1);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2428 my ($seq_id_2) = split (/\t/,$newline_2);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2429
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2430 if ($seq_id_1 =~ s/\/1$//){ # removing the read /1 tag
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2431 $fhs[$index]->{last_seq_id} = $seq_id_1;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2432 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2433 if ($seq_id_2 =~ s/\/1$//){ # removing the read /1 tag
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2434 $fhs[$index]->{last_seq_id} = $seq_id_2;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2435 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2436 $fhs[$index]->{last_line_1} = $newline_1;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2437 $fhs[$index]->{last_line_2} = $newline_2;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2438 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2439 else {
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2440 # assigning undef to last_seq_id and both last_lines and jumping to the next index (end of bowtie output)
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2441 $fhs[$index]->{last_seq_id} = undef;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2442 $fhs[$index]->{last_line_1} = undef;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2443 $fhs[$index]->{last_line_2} = undef;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2444 next; # jumping to the next index
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2445 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2446 ### within the 2nd sequence pair alignment in correct orientation found
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2447 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2448 ### within the 1st sequence pair alignment in correct orientation found
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2449 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2450 ### still within the (last_seq_id eq identifier) condition
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2451 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2452 ### still within foreach index loop
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2453 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2454 ### if there was no single alignment found for a certain sequence we will continue with the next sequence in the sequence file
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2455 unless(%mismatches){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2456 $counting{no_single_alignment_found}++;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2457 return 1; ### We will print this sequence out as unmapped sequence if --un unmapped.out has been specified
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2458 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2459 ### Going to use the variable $sequence_pair_fails as a 'memory' if a sequence could not be aligned uniquely (set to 1 then)
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2460 my $sequence_pair_fails = 0;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2461 ### Declaring an empty hash reference which will store all information we need for the methylation call
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2462 my $methylation_call_params; # hash reference!
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2463 ### We are now looking if there is a unique best alignment for a certain sequence. This means we are sorting in ascending order and look at the
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2464 ### sequence with the lowest amount of mismatches. If there is only one single best position we are going to store the alignment information in the
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2465 ### meth_call variables, if there are multiple hits with the same amount of (lowest) mismatches we are discarding the sequence altogether
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2466 foreach my $mismatch_number (sort {$a<=>$b} keys %mismatches){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2467 #dev print "Number of mismatches: $mismatch_number\t$identifier\t$sequence_1\t$sequence_2\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2468 foreach my $entry (keys (%{$mismatches{$mismatch_number}}) ){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2469 #dev print "$mismatch_number\t$entry\t$mismatches{$mismatch_number}->{$entry}->{index}\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2470 # print join("\t",$mismatch_number,$mismatches{$mismatch_number}->{$entry}->{seq_id},$sequence,$mismatches{$mismatch_number}->{$entry}->{bowtie_sequence},$mismatches{$mismatch_number}->{$entry}->{chromosome},$mismatches{$mismatch_number}->{$entry}->{position},$mismatches{$mismatch_number}->{$entry}->{index}),"\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2471 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2472 if (scalar keys %{$mismatches{$mismatch_number}} == 1){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2473 # print "Unique best alignment for sequence pair $sequence_1\t$sequence_1\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2474 for my $unique_best_alignment (keys %{$mismatches{$mismatch_number}}){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2475 $methylation_call_params->{$identifier}->{seq_id} = $identifier;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2476 $methylation_call_params->{$identifier}->{bowtie_sequence_1} = $mismatches{$mismatch_number}->{$unique_best_alignment}->{bowtie_sequence_1};
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2477 $methylation_call_params->{$identifier}->{bowtie_sequence_2} = $mismatches{$mismatch_number}->{$unique_best_alignment}->{bowtie_sequence_2};
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2478 $methylation_call_params->{$identifier}->{chromosome} = $mismatches{$mismatch_number}->{$unique_best_alignment}->{chromosome};
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2479 $methylation_call_params->{$identifier}->{start_seq_1} = $mismatches{$mismatch_number}->{$unique_best_alignment}->{start_seq_1};
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2480 $methylation_call_params->{$identifier}->{start_seq_2} = $mismatches{$mismatch_number}->{$unique_best_alignment}->{start_seq_2};
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2481 $methylation_call_params->{$identifier}->{alignment_end} = ($mismatches{$mismatch_number}->{$unique_best_alignment}->{start_seq_2}+length($mismatches{$mismatch_number}->{$unique_best_alignment}->{bowtie_sequence_2}));
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2482 $methylation_call_params->{$identifier}->{index} = $mismatches{$mismatch_number}->{$unique_best_alignment}->{index};
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2483 $methylation_call_params->{$identifier}->{number_of_mismatches_1} = $mismatches{$mismatch_number}->{$unique_best_alignment}->{number_of_mismatches_1};
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2484 $methylation_call_params->{$identifier}->{number_of_mismatches_2} = $mismatches{$mismatch_number}->{$unique_best_alignment}->{number_of_mismatches_2};
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2485 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2486 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2487 else{
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2488 $sequence_pair_fails = 1;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2489 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2490 ### after processing the alignment with the lowest number of mismatches we exit
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2491 last;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2492 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2493 ### skipping the sequence completely if there were multiple alignments with the same amount of lowest mismatches found at different positions
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2494 if ($sequence_pair_fails == 1){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2495 $counting{unsuitable_sequence_count}++;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2496 if ($ambiguous){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2497 return 2; # => exits to next sequence pair, and prints both seqs out to multiple_alignments_1 and -2 if --ambiguous has been specified
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2498 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2499 if ($unmapped){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2500 return 1; # => exits to next sequence pair, and prints both seqs out to unmapped_1 and _2 if --un has been specified
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2501 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2502 else{
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2503 return 0; # => exits to next sequence (default)
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2504 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2505 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2506
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2507 ### --DIRECTIONAL
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2508 ### If the option --directional has been specified the user wants to consider only alignments to the original top strand or the original bottom strand. We will therefore
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2509 ### discard all alignments to strands complementary to the original strands, as they should not exist in reality due to the library preparation protocol
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2510 if ($directional){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2511 if ( ($methylation_call_params->{$identifier}->{index} == 1) or ($methylation_call_params->{$identifier}->{index} == 2) ){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2512 # warn "Alignment rejected! (index was: $methylation_call_params->{$identifier}->{index})\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2513 $counting{alignments_rejected_count}++;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2514 return 0;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2515 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2516 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2517
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2518 ### If the sequence has not been rejected so far it does have a unique best alignment
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2519 $counting{unique_best_alignment_count}++;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2520 extract_corresponding_genomic_sequence_paired_ends($identifier,$methylation_call_params);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2521
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2522 ### check test to see if the genomic sequences we extracted has the same length as the observed sequences +2, and only then we perform the methylation call
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2523 if (length($methylation_call_params->{$identifier}->{unmodified_genomic_sequence_1}) != length($sequence_1)+2){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2524 warn "Chromosomal sequence could not be extracted for\t$identifier\t$methylation_call_params->{$identifier}->{chromosome}\t$methylation_call_params->{$identifier}->{start_seq_1}\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2525 $counting{genomic_sequence_could_not_be_extracted_count}++;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2526 return 0;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2527 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2528 if (length($methylation_call_params->{$identifier}->{unmodified_genomic_sequence_2}) != length($sequence_2)+2){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2529 warn "Chromosomal sequence could not be extracted for\t$identifier\t$methylation_call_params->{$identifier}->{chromosome}\t$methylation_call_params->{$identifier}->{start_seq_2}\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2530 $counting{genomic_sequence_could_not_be_extracted_count}++;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2531 return 0;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2532 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2533
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2534 ### otherwise we are set to perform the actual methylation call
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2535 $methylation_call_params->{$identifier}->{methylation_call_1} = methylation_call($identifier,$sequence_1,$methylation_call_params->{$identifier}->{unmodified_genomic_sequence_1},$methylation_call_params->{$identifier}->{read_conversion_1});
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2536 $methylation_call_params->{$identifier}->{methylation_call_2} = methylation_call($identifier,$sequence_2,$methylation_call_params->{$identifier}->{unmodified_genomic_sequence_2},$methylation_call_params->{$identifier}->{read_conversion_2});
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2537
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2538 print_bisulfite_mapping_results_paired_ends($identifier,$sequence_1,$sequence_2,$methylation_call_params,$quality_value_1,$quality_value_2);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2539 return 0; ## otherwise 1 will be returned by default, which would print the sequence pair to unmapped_1 and _2
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2540 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2541
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2542 #########################
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2543 ### BOWTIE 2 | PAIRED-END
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2544 #########################
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2545
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2546 sub check_bowtie_results_paired_ends_bowtie2{
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2547 my ($sequence_1,$sequence_2,$identifier,$quality_value_1,$quality_value_2) = @_;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2548
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2549 ### quality values are not given for FastA files, so they are initialised with a Phred quality of 40
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2550 unless ($quality_value_1){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2551 $quality_value_1 = 'I'x(length$sequence_1);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2552 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2553
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2554 unless ($quality_value_2){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2555 $quality_value_2 = 'I'x(length$sequence_2);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2556 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2557
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2558
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2559 # print "$identifier\n$fhs[0]->{last_seq_id}\n$fhs[1]->{last_seq_id}\n$fhs[2]->{last_seq_id}\n$fhs[3]->{last_seq_id}\n\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2560
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2561
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2562 my %alignments;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2563 my $alignment_ambiguous = 0;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2564
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2565 ### reading from the Bowtie 2 output filehandles
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2566
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2567 ### for paired end reads we are reporting alignments to the OT strand first (index 0), then the OB strand (index 3!!), similar to the single end way.
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2568 ### alignments to the complementary strands are reported afterwards (CTOT got index 1, and CTOB got index 2).
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2569 ### This is needed so that alignments which either contain no single C or G or reads which contain only protected Cs are reported to the original strands (OT and OB)
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2570 ### Before the complementary strands. Remember that it does not make any difference for the methylation calls, but it will matter if alignments to the complementary
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2571 ### strands are not being reported when '--directional' is specified
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2572
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2573 foreach my $index (0,3,1,2){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2574 ### skipping this index if the last alignment has been set to undefined already (i.e. end of bowtie output)
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2575 next unless ($fhs[$index]->{last_line_1} and $fhs[$index]->{last_line_2} and defined $fhs[$index]->{last_seq_id});
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2576
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2577 ### if the sequence pair we are currently looking at produced an alignment we are doing various things with it
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2578 if ($fhs[$index]->{last_seq_id} eq $identifier) {
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2579
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2580 my ($id_1,$flag_1,$mapped_chromosome_1,$position_1,$mapping_quality_1,$cigar_1,$bowtie_sequence_1,$qual_1) = (split (/\t/,$fhs[$index]->{last_line_1}))[0,1,2,3,4,5,9,10];
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2581 my ($id_2,$flag_2,$mapped_chromosome_2,$position_2,$mapping_quality_2,$cigar_2,$bowtie_sequence_2,$qual_2) = (split (/\t/,$fhs[$index]->{last_line_2}))[0,1,2,3,4,5,9,10];
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2582 # print "Index: $index\t$fhs[$index]->{last_line_1}\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2583 # print "Index: $index\t$fhs[$index]->{last_line_2}\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2584 # print join ("\t",$id_1,$flag_1,$mapped_chromosome_1,$position_1,$mapping_quality_1,$cigar_1,$bowtie_sequence_1,$qual_1),"\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2585 # print join ("\t",$id_2,$flag_2,$mapped_chromosome_2,$position_2,$mapping_quality_2,$cigar_2,$bowtie_sequence_2,$qual_2),"\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2586 $id_1 =~ s/\/1$//;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2587 $id_2 =~ s/\/2$//;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2588
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2589 # SAM format specifications for Bowtie 2
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2590 # (1) Name of read that aligned
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2591 # (2) Sum of all applicable flags. Flags relevant to Bowtie are:
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2592 # 1 The read is one of a pair
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2593 # 2 The alignment is one end of a proper paired-end alignment
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2594 # 4 The read has no reported alignments
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2595 # 8 The read is one of a pair and has no reported alignments
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2596 # 16 The alignment is to the reverse reference strand
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2597 # 32 The other mate in the paired-end alignment is aligned to the reverse reference strand
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2598 # 64 The read is mate 1 in a pair
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2599 # 128 The read is mate 2 in a pair
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2600 # 256 The read has multiple mapping states
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2601 # (3) Name of reference sequence where alignment occurs (unmapped reads have a *)
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2602 # (4) 1-based offset into the forward reference strand where leftmost character of the alignment occurs (0 for unmapped reads)
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2603 # (5) Mapping quality (255 means MAPQ is not available)
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2604 # (6) CIGAR string representation of alignment (* if unavailable)
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2605 # (7) Name of reference sequence where mate's alignment occurs. Set to = if the mate's reference sequence is the same as this alignment's, or * if there is no mate.
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2606 # (8) 1-based offset into the forward reference strand where leftmost character of the mate's alignment occurs. Offset is 0 if there is no mate.
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2607 # (9) Inferred fragment size. Size is negative if the mate's alignment occurs upstream of this alignment. Size is 0 if there is no mate.
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2608 # (10) Read sequence (reverse-complemented if aligned to the reverse strand)
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2609 # (11) ASCII-encoded read qualities (reverse-complemented if the read aligned to the reverse strand). The encoded quality values are on the Phred quality scale and the encoding is ASCII-offset by 33 (ASCII char !), similarly to a FASTQ file.
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2610 # (12) Optional fields. Fields are tab-separated. bowtie2 outputs zero or more of these optional fields for each alignment, depending on the type of the alignment:
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2611 # AS:i:<N> Alignment score. Can be negative. Can be greater than 0 in --local mode (but not in --end-to-end mode). Only present if SAM record is for an aligned read.
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2612 # XS:i:<N> Alignment score for second-best alignment. Can be negative. Can be greater than 0 in --local mode (but not in --end-to-end mode). Only present if the SAM record is for an aligned read and more than one alignment was found for the read.
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2613 # YS:i:<N> Alignment score for opposite mate in the paired-end alignment. Only present if the SAM record is for a read that aligned as part of a paired-end alignment.
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2614 # XN:i:<N> The number of ambiguous bases in the reference covering this alignment. Only present if SAM record is for an aligned read.
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2615 # XM:i:<N> The number of mismatches in the alignment. Only present if SAM record is for an aligned read.
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2616 # XO:i:<N> The number of gap opens, for both read and reference gaps, in the alignment. Only present if SAM record is for an aligned read.
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2617 # XG:i:<N> The number of gap extensions, for both read and reference gaps, in the alignment. Only present if SAM record is for an aligned read.
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2618 # NM:i:<N> The edit distance; that is, the minimal number of one-nucleotide edits (substitutions, insertions and deletions) needed to transform the read string into the reference string. Only present if SAM record is for an aligned read.
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2619 # YF:Z:<N> String indicating reason why the read was filtered out. See also: Filtering. Only appears for reads that were filtered out.
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2620 # MD:Z:<S> A string representation of the mismatched reference bases in the alignment. See SAM format specification for details. Only present if SAM record is for an aligned read.
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2621
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2622 ### If a sequence has no reported alignments there will be a single output line per sequence with a bit-wise flag value of 77 for read 1 (1+4+8+64), or 141 for read 2 (1+4+8+128).
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2623 ### We can store the next alignment and move on to the next Bowtie 2 instance
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2624 if ($flag_1 == 77 and $flag_2 == 141){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2625 ## reading in the next alignment, which must be the next sequence
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2626 my $newline_1 = $fhs[$index]->{fh}-> getline();
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2627 my $newline_2 = $fhs[$index]->{fh}-> getline();
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2628
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2629 if ($newline_1 and $newline_2){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2630 chomp $newline_1;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2631 chomp $newline_2;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2632 my ($seq_id_1) = split (/\t/,$newline_1);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2633 my ($seq_id_2) = split (/\t/,$newline_2);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2634 $seq_id_1 =~ s/\/1$//;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2635 $seq_id_2 =~ s/\/2$//;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2636 $fhs[$index]->{last_seq_id} = $seq_id_1;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2637 $fhs[$index]->{last_line_1} = $newline_1;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2638 $fhs[$index]->{last_line_2} = $newline_2;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2639
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2640 # print "current sequence ($identifier) did not map, reading in next sequence\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2641 # print "$index\t$fhs[$index]->{last_seq_id}\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2642 # print "$index\t$fhs[$index]->{last_line_1}\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2643 # print "$index\t$fhs[$index]->{last_line_2}\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2644 next; # next instance
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2645 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2646 else{
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2647 # assigning undef to last_seq_id and last_line and jumping to the next index (end of Bowtie 2 output)
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2648 $fhs[$index]->{last_seq_id} = undef;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2649 $fhs[$index]->{last_line_1} = undef;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2650 $fhs[$index]->{last_line_2} = undef;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2651 next;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2652 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2653 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2654
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2655 ### If there are one or more proper alignments we can extract the chromosome number
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2656 my ($chromosome_1,$chromosome_2);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2657 if ($mapped_chromosome_1 =~ s/_(CT|GA)_converted$//){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2658 $chromosome_1 = $mapped_chromosome_1;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2659 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2660 else{
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2661 die "Chromosome number extraction failed for $mapped_chromosome_1\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2662 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2663 if ($mapped_chromosome_2 =~ s/_(CT|GA)_converted$//){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2664 $chromosome_2 = $mapped_chromosome_2;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2665 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2666 else{
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2667 die "Chromosome number extraction failed for $mapped_chromosome_2\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2668 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2669
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2670 die "Paired-end alignments need to be on the same chromosome\n" unless ($chromosome_1 eq $chromosome_2);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2671
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2672 ### We will use the optional fields to determine the best alignments. Later on we extract the number of mismatches and/or indels from the CIGAR string
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2673 my ($alignment_score_1,$alignment_score_2,$second_best_1,$second_best_2,$MD_tag_1,$MD_tag_2);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2674
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2675 my @fields_1 = split (/\t/,$fhs[$index]->{last_line_1});
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2676 my @fields_2 = split (/\t/,$fhs[$index]->{last_line_2});
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2677
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2678 foreach (11..$#fields_1){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2679 if ($fields_1[$_] =~ /AS:i:(.*)/){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2680 $alignment_score_1 = $1;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2681 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2682 elsif ($fields_1[$_] =~ /XS:i:(.*)/){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2683 $second_best_1 = $1;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2684 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2685 elsif ($fields_1[$_] =~ /MD:Z:(.*)/){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2686 $MD_tag_1 = $1;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2687 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2688 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2689
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2690 foreach (11..$#fields_2){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2691 if ($fields_2[$_] =~ /AS:i:(.*)/){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2692 $alignment_score_2 = $1;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2693 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2694 elsif ($fields_2[$_] =~ /XS:i:(.*)/){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2695 $second_best_2 = $1;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2696 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2697 elsif ($fields_2[$_] =~ /MD:Z:(.*)/){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2698 $MD_tag_2 = $1;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2699 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2700 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2701
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2702 die "Failed to extract alignment score 1 ($alignment_score_1) and MD tag ($MD_tag_1)!\nlast alignment 1: $fhs[$index]->{last_line_1}\nlast alignment 2: $fhs[$index]->{last_line_2}\n" unless (defined $alignment_score_1 and defined $MD_tag_1);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2703 die "Failed to extract alignment score 2 ($alignment_score_2) and MD tag ($MD_tag_2)!\nlast alignment 1: $fhs[$index]->{last_line_1}\nlast alignment 2: $fhs[$index]->{last_line_2}\n" unless (defined $alignment_score_2 and defined $MD_tag_2);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2704
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2705 # warn "First read 1 alignment score is: '$alignment_score_1'\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2706 # warn "First read 2 alignment score is: '$alignment_score_2'\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2707 # warn "MD tag 1 is: '$MD_tag_1'\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2708 # warn "MD tag 2 is: '$MD_tag_2'\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2709
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2710 ### To decide whether a sequence pair has a unique best alignment we will look at the highest sum of alignment scores from both alignments
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2711 my $sum_of_alignment_scores_1 = $alignment_score_1 + $alignment_score_2 ;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2712 # print "sum of alignment scores: $sum_of_alignment_scores_1\n\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2713
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2714 if (defined $second_best_1 and defined $second_best_2){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2715 my $sum_of_alignment_scores_second_best = $second_best_1 + $second_best_2;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2716 # warn "Second best alignment_score_1 is: '$second_best_1'\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2717 # warn "Second best alignment_score_2 is: '$second_best_2'\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2718 # warn "Second best alignment sum of alignment scores is: '$sum_of_alignment_scores_second_best'\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2719
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2720 # If the sum of the alignment scores (AS) of both reads of this pair is the same as the sum of their second best alignment scores (XS), the alignment is ambiguous and we are going to boot this sequence pair altogether
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2721 if ($sum_of_alignment_scores_1 == $sum_of_alignment_scores_second_best){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2722 $alignment_ambiguous = 1;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2723 # print "This read will be chucked (AS==XS detected)!\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2724
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2725 ## need to read and discard all additional ambiguous reads until we reach the next sequence
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2726 until ($fhs[$index]->{last_seq_id} ne $identifier){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2727 my $newline_1 = $fhs[$index]->{fh}-> getline();
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2728 my $newline_2 = $fhs[$index]->{fh}-> getline();
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2729 if ($newline_1 and $newline_2){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2730 chomp $newline_1;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2731 chomp $newline_2;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2732 my ($seq_id_1) = split (/\t/,$newline_1);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2733 my ($seq_id_2) = split (/\t/,$newline_2);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2734 $seq_id_1 =~ s/\/1$//;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2735 $seq_id_2 =~ s/\/2$//;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2736 # print "New Seq IDs:\t$seq_id_1\t$seq_id_2\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2737
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2738 $fhs[$index]->{last_seq_id} = $seq_id_1;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2739 $fhs[$index]->{last_line_1} = $newline_1;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2740 $fhs[$index]->{last_line_2} = $newline_2;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2741 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2742 else{
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2743 # assigning undef to last_seq_id and last_line and jumping to the next index (end of Bowtie 2 output)
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2744 $fhs[$index]->{last_seq_id} = undef;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2745 $fhs[$index]->{last_line_1} = undef;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2746 $fhs[$index]->{last_line_2} = undef;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2747 last; # break free if the end of the alignment output was reached
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2748 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2749 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2750 # if ($fhs[$index]->{last_seq_id}){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2751 # warn "Index: $index\tThis Seq-ID is $identifier, skipped all ambiguous sequences until the next ID which is: $fhs[$index]->{last_seq_id}\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2752 # }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2753 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2754 else{ # the next best alignment has a lower alignment score than the current read, so we can safely store the current alignment
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2755
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2756 my $alignment_location;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2757 if ($position_1 <= $position_2){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2758 $alignment_location = join(":",$chromosome_1,$position_1,$position_2);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2759 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2760 elsif($position_2 < $position_1){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2761 $alignment_location = join(":",$chromosome_1,$position_2,$position_1);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2762 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2763
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2764 ### If a sequence aligns to exactly the same location twice, the sequence either does not contain any C or G, or all the Cs (or Gs on the reverse
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2765 ### strand) were methylated and therefore protected. Alternatively it will align better in one condition than in the other. In any case, it is not needed to overwrite
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2766 ### the same positional entry with a second entry for the same location, as the genomic sequence extraction and methylation call would not be affected by this. The only
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2767 ### thing which would change is the index number for the found alignment. We will continue to assign these alignments to the first indexes 0 and 3, i.e. OT and OB
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2768
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2769 unless (exists $alignments{$alignment_location}){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2770 $alignments{$alignment_location}->{seq_id} = $id_1;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2771 $alignments{$alignment_location}->{alignment_score_1} = $alignment_score_1;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2772 $alignments{$alignment_location}->{alignment_score_2} = $alignment_score_2;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2773 $alignments{$alignment_location}->{sum_of_alignment_scores} = $sum_of_alignment_scores_1;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2774 $alignments{$alignment_location}->{bowtie_sequence_1} = $bowtie_sequence_1;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2775 $alignments{$alignment_location}->{bowtie_sequence_2} = $bowtie_sequence_2;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2776 $alignments{$alignment_location}->{index} = $index;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2777 $alignments{$alignment_location}->{chromosome} = $chromosome_1; # either is fine
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2778 $alignments{$alignment_location}->{position_1} = $position_1;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2779 $alignments{$alignment_location}->{position_2} = $position_2;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2780 $alignments{$alignment_location}->{mismatch_info_1} = $MD_tag_1;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2781 $alignments{$alignment_location}->{mismatch_info_2} = $MD_tag_2;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2782 $alignments{$alignment_location}->{CIGAR_1} = $cigar_1;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2783 $alignments{$alignment_location}->{CIGAR_2} = $cigar_2;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2784 $alignments{$alignment_location}->{flag_1} = $flag_1;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2785 $alignments{$alignment_location}->{flag_2} = $flag_2;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2786 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2787 # warn "added best of several alignments to \%alignments hash\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2788
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2789 ### now reading and discarding all (inferior) alignments of this read pair until we hit the next sequence
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2790 until ($fhs[$index]->{last_seq_id} ne $identifier){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2791 my $newline_1 = $fhs[$index]->{fh}-> getline();
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2792 my $newline_2 = $fhs[$index]->{fh}-> getline();
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2793 if ($newline_1 and $newline_2){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2794 chomp $newline_1;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2795 chomp $newline_2;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2796 my ($seq_id_1) = split (/\t/,$newline_1);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2797 my ($seq_id_2) = split (/\t/,$newline_2);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2798 $seq_id_1 =~ s/\/1$//;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2799 $seq_id_2 =~ s/\/2$//;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2800 # print "New Seq IDs:\t$seq_id_1\t$seq_id_2\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2801
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2802 $fhs[$index]->{last_seq_id} = $seq_id_1;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2803 $fhs[$index]->{last_line_1} = $newline_1;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2804 $fhs[$index]->{last_line_2} = $newline_2;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2805 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2806 else{
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2807 # assigning undef to last_seq_id and last_line_1 and _2 and jumping to the next index (end of Bowtie 2 output)
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2808 $fhs[$index]->{last_seq_id} = undef;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2809 $fhs[$index]->{last_line_1} = undef;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2810 $fhs[$index]->{last_line_2} = undef;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2811 last; # break free if the end of the alignment output was reached
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2812 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2813 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2814 # if($fhs[$index]->{last_seq_id}){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2815 # warn "Index: $index\tThis Seq-ID is $identifier, skipped all other alignments until the next ID was reached which is: $fhs[$index]->{last_seq_id}\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2816 # }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2817 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2818 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2819 else{ # there is no second best hit, so we can just store this one and read in the next sequence
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2820
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2821 my $alignment_location = join(":",$chromosome_1,$position_1,$position_2);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2822 # print "$alignment_location\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2823 ### If a sequence aligns to exactly the same location with a perfect match twice, the sequence either does not contain any C or G, or all the Cs (or Gs on the reverse
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2824 ### strand) were methylated and therefore protected. Alternatively it will align better in one condition than in the other. In any case, it is not needed to overwrite
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2825 ### the same positional entry with a second entry for the same location, as the genomic sequence extraction and methylation call would not be affected by this. The only
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2826 ### thing which would change is the index number for the found alignment. We will continue to assign these alignments to the first indexes 0 and 3, i.e. OT and OB
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2827
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2828 unless (exists $alignments{$alignment_location}){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2829 $alignments{$alignment_location}->{seq_id} = $id_1;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2830 $alignments{$alignment_location}->{alignment_score_1} = $alignment_score_1;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2831 $alignments{$alignment_location}->{alignment_score_2} = $alignment_score_2;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2832 $alignments{$alignment_location}->{sum_of_alignment_scores} = $sum_of_alignment_scores_1;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2833 $alignments{$alignment_location}->{bowtie_sequence_1} = $bowtie_sequence_1;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2834 $alignments{$alignment_location}->{bowtie_sequence_2} = $bowtie_sequence_2;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2835 $alignments{$alignment_location}->{index} = $index;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2836 $alignments{$alignment_location}->{chromosome} = $chromosome_1; # either is fine
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2837 $alignments{$alignment_location}->{position_1} = $position_1;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2838 $alignments{$alignment_location}->{position_2} = $position_2;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2839 $alignments{$alignment_location}->{mismatch_info_1} = $MD_tag_1;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2840 $alignments{$alignment_location}->{mismatch_info_2} = $MD_tag_2;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2841 $alignments{$alignment_location}->{CIGAR_1} = $cigar_1;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2842 $alignments{$alignment_location}->{CIGAR_2} = $cigar_2;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2843 $alignments{$alignment_location}->{flag_1} = $flag_1;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2844 $alignments{$alignment_location}->{flag_2} = $flag_2;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2845 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2846
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2847 # warn "added unique alignment to \%alignments hash\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2848
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2849 # Now reading and storing the next read pair
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2850 my $newline_1 = $fhs[$index]->{fh}-> getline();
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2851 my $newline_2 = $fhs[$index]->{fh}-> getline();
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2852 if ($newline_1 and $newline_2){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2853 chomp $newline_1;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2854 chomp $newline_2;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2855 # print "$newline_1\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2856 # print "$newline_2\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2857 my ($seq_id_1) = split (/\t/,$newline_1);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2858 my ($seq_id_2) = split (/\t/,$newline_2);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2859 $seq_id_1 =~ s/\/1$//;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2860 $seq_id_2 =~ s/\/2$//;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2861 # print "New Seq IDs:\t$seq_id_1\t$seq_id_2\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2862
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2863 $fhs[$index]->{last_seq_id} = $seq_id_1;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2864 $fhs[$index]->{last_line_1} = $newline_1;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2865 $fhs[$index]->{last_line_2} = $newline_2;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2866
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2867 if ($seq_id_1 eq $identifier){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2868 die "Sequence with ID $identifier did not have a second best alignment, but next seq-ID was also $fhs[$index]->{last_seq_id}!\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2869 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2870 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2871 else{
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2872 # assigning undef to last_seq_id and last_line_1 and _2 and jumping to the next index (end of Bowtie 2 output)
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2873 $fhs[$index]->{last_seq_id} = undef;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2874 $fhs[$index]->{last_line_1} = undef;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2875 $fhs[$index]->{last_line_2} = undef;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2876 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2877 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2878 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2879 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2880
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2881 ### if the read produced several ambiguous alignments for a single instance of Bowtie 2 we can return already now. If --ambiguous was specified the read sequence will be printed out in FastQ format
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2882 if ($alignment_ambiguous == 1){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2883 $counting{unsuitable_sequence_count}++;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2884 ### report that the sequence pair has multiple hits with bitwise flag 256. We can print the sequence to the result file straight away and skip everything else
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2885 # my $ambiguous_read_1 = join("\t",$identifier.'/1','256','*','0','0','*','*','0','0',$sequence_1,$quality_value_1);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2886 # my $ambiguous_read_2 = join("\t",$identifier.'/2','256','*','0','0','*','*','0','0',$sequence_2,$quality_value_2);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2887 # print "$ambiguous_read_1\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2888 # print "$ambiguous_read_2\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2889
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2890 if ($ambiguous){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2891 return 2; # => exits to next sequence pair, and prints it out to _ambiguous_reads_1.txt and _ambiguous_reads_2.txt if '--ambiguous' was specified
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2892 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2893 elsif ($unmapped){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2894 return 1; # => exits to next sequence pair, and prints it out to _unmapped_reads_1.txt and _unmapped_reads_2.txt if '--unmapped' but not '--ambiguous' was specified
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2895 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2896 else{
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2897 return 0;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2898 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2899 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2900
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2901 ### if no alignment was found for a certain sequence at all we continue with the next sequence in the sequence file
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2902 unless (%alignments){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2903 $counting{no_single_alignment_found}++;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2904
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2905 # my $unmapped_read_1 = join("\t",$identifier.'/1','77','*','0','0','*','*','0','0',$sequence_1,$quality_value_1);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2906 # my $unmapped_read_2 = join("\t",$identifier.'/2','141','*','0','0','*','*','0','0',$sequence_2,$quality_value_2);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2907 # print "$unmapped_read_1\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2908 # print "$unmapped_read_2\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2909 if ($unmapped){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2910 return 1; # => exits to next sequence pair, and prints it out to _unmapped_reads_1.txt and _unmapped_read_2.txt if '--unmapped' was specified
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2911 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2912 else{
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2913 return 0;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2914 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2915 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2916
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2917 #######################################################################################################################################################
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2918
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2919 ### If the sequence pair was not rejected so far we are now looking if there is a unique best alignment among all alignment instances. If there is only one
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2920 ### single best position we are going to store the alignment information in the $meth_call variable. If there are multiple hits with the same (highest)
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2921 ### alignment score we are discarding the sequence pair altogether.
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2922 ### For end-to-end alignments the maximum alignment score is 0, each mismatch receives a penalty of 6, and each gap receives penalties for opening (5)
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2923 ### and extending (3 per bp) the gap.
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2924
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2925 #######################################################################################################################################################
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2926
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2927 ### Declaring an empty hash reference which will store all information we need for the methylation call
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2928 my $methylation_call_params; # hash reference
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2929 my $sequence_pair_fails = 0; # using $sequence_pair_fails as a 'memory' if a sequence could not be aligned uniquely (set to 1 then)
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2930
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2931 ### print contents of %alignments for debugging
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2932 ## if (scalar keys %alignments >= 1){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2933 # print "\n******\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2934 # foreach my $alignment_location (sort {$a cmp $b} keys %alignments){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2935 # print "Loc: $alignment_location\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2936 # print "ID: $alignments{$alignment_location}->{seq_id}\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2937 # print "AS_1: $alignments{$alignment_location}->{alignment_score_1}\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2938 # print "AS_2: $alignments{$alignment_location}->{alignment_score_2}\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2939 # print "Seq_1: $alignments{$alignment_location}->{bowtie_sequence_1}\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2940 # print "Seq_2: $alignments{$alignment_location}->{bowtie_sequence_2}\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2941 # print "Index $alignments{$alignment_location}->{index}\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2942 # print "Chr: $alignments{$alignment_location}->{chromosome}\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2943 # print "Pos_1: $alignments{$alignment_location}->{position_1}\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2944 # print "Pos_2: $alignments{$alignment_location}->{position_2}\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2945 # print "CIGAR_1: $alignments{$alignment_location}->{CIGAR_1}\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2946 # print "CIGAR_2: $alignments{$alignment_location}->{CIGAR_2}\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2947 # print "MD_1: $alignments{$alignment_location}->{mismatch_info_1}\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2948 # print "MD_2: $alignments{$alignment_location}->{mismatch_info_2}\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2949 # print "Flag 1: $alignments{$alignment_location}->{flag_1}\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2950 # print "Flag 2: $alignments{$alignment_location}->{flag_2}\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2951 # }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2952 # print "\n******\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2953 # }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2954
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2955 ### if there is only 1 entry in the %alignments hash we accept it as the best alignment
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2956 if (scalar keys %alignments == 1){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2957 for my $unique_best_alignment (keys %alignments){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2958 $methylation_call_params->{$identifier}->{bowtie_sequence_1} = $alignments{$unique_best_alignment}->{bowtie_sequence_1};
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2959 $methylation_call_params->{$identifier}->{bowtie_sequence_2} = $alignments{$unique_best_alignment}->{bowtie_sequence_2};
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2960 $methylation_call_params->{$identifier}->{chromosome} = $alignments{$unique_best_alignment}->{chromosome};
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2961 $methylation_call_params->{$identifier}->{position_1} = $alignments{$unique_best_alignment}->{position_1};
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2962 $methylation_call_params->{$identifier}->{position_2} = $alignments{$unique_best_alignment}->{position_2};
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2963 $methylation_call_params->{$identifier}->{index} = $alignments{$unique_best_alignment}->{index};
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2964 $methylation_call_params->{$identifier}->{alignment_score_1} = $alignments{$unique_best_alignment}->{alignment_score_1};
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2965 $methylation_call_params->{$identifier}->{alignment_score_2} = $alignments{$unique_best_alignment}->{alignment_score_2};
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2966 $methylation_call_params->{$identifier}->{sum_of_alignment_scores} = $alignments{$unique_best_alignment}->{sum_of_alignment_scores};
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2967 $methylation_call_params->{$identifier}->{mismatch_info_1} = $alignments{$unique_best_alignment}->{mismatch_info_1};
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2968 $methylation_call_params->{$identifier}->{mismatch_info_2} = $alignments{$unique_best_alignment}->{mismatch_info_2};
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2969 $methylation_call_params->{$identifier}->{CIGAR_1} = $alignments{$unique_best_alignment}->{CIGAR_1};
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2970 $methylation_call_params->{$identifier}->{CIGAR_2} = $alignments{$unique_best_alignment}->{CIGAR_2};
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2971 $methylation_call_params->{$identifier}->{flag_1} = $alignments{$unique_best_alignment}->{flag_1};
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2972 $methylation_call_params->{$identifier}->{flag_2} = $alignments{$unique_best_alignment}->{flag_2};
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2973 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2974 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2975
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2976 ### otherwise we are going to find out if there is a best match among the multiple alignments, or whether there are 2 or more equally good alignments (in which case
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2977 ### we boot the sequence pair altogether)
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2978 elsif (scalar keys %alignments >= 2 and scalar keys %alignments <= 4){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2979 my $best_sum_of_alignment_scores;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2980 my $best_alignment_location;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2981 foreach my $alignment_location (sort {$alignments{$b}->{sum_of_alignment_scores} <=> $alignments{$a}->{sum_of_alignment_scores}} keys %alignments){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2982 # print "$alignments{$alignment_location}->{sum_of_alignment_scores}\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2983 unless (defined $best_sum_of_alignment_scores){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2984 $best_sum_of_alignment_scores = $alignments{$alignment_location}->{sum_of_alignment_scores};
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2985 $best_alignment_location = $alignment_location;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2986 # print "setting best alignment score to: $best_sum_of_alignment_scores\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2987 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2988 else{
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2989 ### if the second best alignment has the same sum of alignment scores as the first one, the sequence pair will get booted
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2990 if ($alignments{$alignment_location}->{sum_of_alignment_scores} == $best_sum_of_alignment_scores){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2991 # warn "Same sum of alignment scores for 2 different alignments, the sequence pair will get booted!\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2992 $sequence_pair_fails = 1;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2993 last; # exiting since we know that the sequence has ambiguous alignments
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2994 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2995 ### else we are going to store the best alignment for further processing
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2996 else{
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2997 $methylation_call_params->{$identifier}->{bowtie_sequence_1} = $alignments{$best_alignment_location}->{bowtie_sequence_1};
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2998 $methylation_call_params->{$identifier}->{bowtie_sequence_2} = $alignments{$best_alignment_location}->{bowtie_sequence_2};
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2999 $methylation_call_params->{$identifier}->{chromosome} = $alignments{$best_alignment_location}->{chromosome};
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3000 $methylation_call_params->{$identifier}->{position_1} = $alignments{$best_alignment_location}->{position_1};
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3001 $methylation_call_params->{$identifier}->{position_2} = $alignments{$best_alignment_location}->{position_2};
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3002 $methylation_call_params->{$identifier}->{index} = $alignments{$best_alignment_location}->{index};
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3003 $methylation_call_params->{$identifier}->{alignment_score_1} = $alignments{$best_alignment_location}->{alignment_score_1};
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3004 $methylation_call_params->{$identifier}->{alignment_score_2} = $alignments{$best_alignment_location}->{alignment_score_2};
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3005 $methylation_call_params->{$identifier}->{sum_of_alignment_scores} = $alignments{$best_alignment_location}->{sum_of_alignment_scores};
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3006 $methylation_call_params->{$identifier}->{mismatch_info_1} = $alignments{$best_alignment_location}->{mismatch_info_1};
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3007 $methylation_call_params->{$identifier}->{mismatch_info_2} = $alignments{$best_alignment_location}->{mismatch_info_2};
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3008 $methylation_call_params->{$identifier}->{CIGAR_1} = $alignments{$best_alignment_location}->{CIGAR_1};
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3009 $methylation_call_params->{$identifier}->{CIGAR_2} = $alignments{$best_alignment_location}->{CIGAR_2};
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3010 $methylation_call_params->{$identifier}->{flag_1} = $alignments{$best_alignment_location}->{flag_1};
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3011 $methylation_call_params->{$identifier}->{flag_2} = $alignments{$best_alignment_location}->{flag_2};
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3012 last; # exiting since the sequence produced a unique best alignment
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3013 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3014 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3015 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3016 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3017 else{
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3018 die "There are too many potential hits for this sequence pair (1-4 expected, but found: '",scalar keys %alignments,"')\n";;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3019 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3020
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3021 ### skipping the sequence completely if there were multiple alignments with the same best sum of alignment scores at different positions
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3022 if ($sequence_pair_fails == 1){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3023 $counting{unsuitable_sequence_count}++;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3024
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3025 ### report that the sequence has multiple hits with bitwise flag 256. We can print the sequence to the result file straight away and skip everything else
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3026 # my $ambiguous_read_1 = join("\t",$identifier.'/1','256','*','0','0','*','*','0','0',$sequence_1,$quality_value_1);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3027 # my $ambiguous_read_2 = join("\t",$identifier.'/2','256','*','0','0','*','*','0','0',$sequence_2,$quality_value_2);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3028 # print "$ambiguous_read_1\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3029 # print "$ambiguous_read_2\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3030
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3031 if ($ambiguous){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3032 return 2; # => exits to next sequence pair, and prints it out (in FastQ format) to _ambiguous_reads_1.txt and _ambiguous_reads_2.txt if '--ambiguous' was specified
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3033 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3034 elsif ($unmapped){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3035 return 1; # => exits to next sequence pair, and prints it out (in FastQ format) to _unmapped_reads_1.txt and _unmapped_reads_2.txt if '--unmapped' but not '--ambiguous' was specified
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3036 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3037 else{
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3038 return 0; # => exits to next sequence pair (default)
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3039 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3040 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3041
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3042 ### --DIRECTIONAL
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3043 ### If the option --directional has been specified the user wants to consider only alignments to the original top strand or the original bottom strand. We will therefore
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3044 ### discard all alignments to strands complementary to the original strands, as they should not exist in reality due to the library preparation protocol
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3045 if ($directional){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3046 if ( ($methylation_call_params->{$identifier}->{index} == 1) or ($methylation_call_params->{$identifier}->{index} == 2) ){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3047 # warn "Alignment rejected! (index was: $methylation_call_params->{$identifier}->{index})\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3048 $counting{alignments_rejected_count}++;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3049 return 0;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3050 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3051 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3052
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3053 ### If the sequence pair has not been rejected so far it does have a unique best alignment
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3054 $counting{unique_best_alignment_count}++;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3055 extract_corresponding_genomic_sequence_paired_ends_bowtie2($identifier,$methylation_call_params);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3056
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3057 ### check to see if the genomic sequences we extracted have the same length as the observed sequences +2, and only then perform the methylation call
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3058 if (length($methylation_call_params->{$identifier}->{unmodified_genomic_sequence_1}) != length($sequence_1)+2){
3
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
3059 warn "Chromosomal sequence could not be extracted for\t$identifier\t$methylation_call_params->{$identifier}->{chromosome}\t$methylation_call_params->{$identifier}->{position_1}\n";
0
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3060 $counting{genomic_sequence_could_not_be_extracted_count}++;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3061 return 0;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3062 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3063 if (length($methylation_call_params->{$identifier}->{unmodified_genomic_sequence_2}) != length($sequence_2)+2){
3
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
3064 warn "Chromosomal sequence could not be extracted for\t$identifier\t$methylation_call_params->{$identifier}->{chromosome}\t$methylation_call_params->{$identifier}->{position_2}\n";
0
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3065 $counting{genomic_sequence_could_not_be_extracted_count}++;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3066 return 0;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3067 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3068
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3069 ### now we are set to perform the actual methylation call
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3070 $methylation_call_params->{$identifier}->{methylation_call_1} = methylation_call($identifier,$sequence_1,$methylation_call_params->{$identifier}->{unmodified_genomic_sequence_1},$methylation_call_params->{$identifier}->{read_conversion_1});
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3071 $methylation_call_params->{$identifier}->{methylation_call_2} = methylation_call($identifier,$sequence_2,$methylation_call_params->{$identifier}->{unmodified_genomic_sequence_2},$methylation_call_params->{$identifier}->{read_conversion_2});
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3072 # print "$methylation_call_params->{$identifier}->{read_conversion_2}\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3073 # print " $sequence_2\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3074 # print "$methylation_call_params->{$identifier}->{unmodified_genomic_sequence_2}\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3075 # print " $methylation_call_params->{$identifier}->{methylation_call_2}\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3076
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3077 print_bisulfite_mapping_results_paired_ends_bowtie2($identifier,$sequence_1,$sequence_2,$methylation_call_params,$quality_value_1,$quality_value_2);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3078 return 0; ## otherwise 1 will be returned by default, which would print the sequence pair to unmapped_1 and _2
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3079 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3080
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3081 ###
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3082
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
### Decides whether the paired-end alignment currently buffered for filehandle $index (stored in
### $fhs[$index]->{last_line_1} and ->{last_line_2}) belongs to the sequence pair $identifier and
### was reported in a sensible orientation.
###
### Arguments:
###   $index      - position of the alignment filehandle record in @fhs
###   $identifier - ID of the sequence pair being analysed (without the trailing /1 read tag)
###
### Returns:
###   1 - last_line_1 and last_line_2 of $fhs[$index] hold an alignment of $identifier in a valid orientation
###   0 - there is nothing to process for $identifier from this filehandle in this round
###
### Side effects:
###   Whenever an alignment is found to be in the wrong orientation the next 2 lines are read from
###   $fhs[$index]->{fh}, and last_seq_id, last_line_1 and last_line_2 are updated accordingly (they are
###   all set to undef once the end of the bowtie output has been reached).
###
### Dies if the orientation check returns anything other than 0 or 1, if neither read ID of a newly read
### pair ends on /1, or if the same sequence ID is found a third time in a row.
sub decide_whether_paired_end_alignment_is_valid{
  my ($index,$identifier) = @_;

  ### only the read ID and the strand of both buffered lines are needed for the decision
  my ($id_1,$strand_1) = (split (/\t/,$fhs[$index]->{last_line_1},-1))[0,1];
  my ($id_2,$strand_2) = (split (/\t/,$fhs[$index]->{last_line_2},-1))[0,1];
  my $seq_id_1 = $id_1;
  my $seq_id_2 = $id_2;
  $seq_id_1 =~ s/\/1$//; # removing the read /1
  $seq_id_2 =~ s/\/1$//; # removing the read /1

  ### the sequence pair stored in @fhs as last_line_1 and last_line_2 is already the next sequence pair to be analysed -> analyse next round
  unless ($seq_id_1 eq $identifier or $seq_id_2 eq $identifier){
    return 0;
  }

  ### checking the orientation of the alignment. We need to discriminate between 8 different conditions, however only 4 of them are theoretically
  ### sensible alignments
  my $orientation = ensure_sensical_alignment_orientation_paired_ends ($index,$id_1,$strand_1,$id_2,$strand_2);
  if ($orientation == 1){
    return 1; ### 1st possibility for A SEQUENCE-PAIR TO PASS
  }
  unless ($orientation == 0){
    die "The orientation of the alignment must be either correct or incorrect\n";
  }

  ### The alignment was in the wrong orientation, so we need to read in two new lines
  my $newline_1 = $fhs[$index]->{fh}->getline();
  my $newline_2 = $fhs[$index]->{fh}->getline();
  unless ($newline_1 and $newline_2){
    ### end of bowtie output; nothing is processed as the buffered alignment was in the wrong orientation
    _buffer_paired_end_alignment($index,undef,undef,undef);
    return 0;
  }

  ### extract the read IDs and strands again (from $newline_1 and $newline_2 this time)
  ($id_1,$strand_1) = (split (/\t/,$newline_1))[0,1];
  ($id_2,$strand_2) = (split (/\t/,$newline_2))[0,1];
  my ($seqid,$other_id) = _strip_read_1_tag_paired_ends($id_1,$id_2);

  ### the sequence pair we just read in is already the next sequence pair to be analysed -> store it in @fhs
  unless ($seqid eq $identifier or $other_id eq $identifier){
    _buffer_paired_end_alignment($index,$seqid,$newline_1,$newline_2);
    return 0; # processing the new alignment result only in the next round
  }

  ### the next entry is still the correct sequence, checking orientation again
  $orientation = ensure_sensical_alignment_orientation_paired_ends ($index,$id_1,$strand_1,$id_2,$strand_2);
  if ($orientation == 1){
    ### Writing the current sequence to last_line_1 and last_line_2
    _buffer_paired_end_alignment($index,$seqid,$newline_1,$newline_2);
    return 1; ### 2nd possibility for a SEQUENCE-PAIR TO PASS
  }
  unless ($orientation == 0){
    die "The orientation of the alignment must be either correct or incorrect\n";
  }

  ### The alignment was in the wrong orientation again, so we need to read in yet another 2 new lines and store them in @fhs (this must be
  ### the next entry)
  $newline_1 = $fhs[$index]->{fh}->getline();
  $newline_2 = $fhs[$index]->{fh}->getline();
  unless ($newline_1 and $newline_2){
    ### end of bowtie output; nothing is processed as both alignments of this sequence pair were in the wrong orientation
    _buffer_paired_end_alignment($index,undef,undef,undef);
    return 0;
  }

  ($id_1) = split (/\t/,$newline_1);
  ($id_2) = split (/\t/,$newline_2);
  ($seqid) = _strip_read_1_tag_paired_ends($id_1,$id_2);

  ### the next 2 lines must not have the same seq ID again; otherwise they become the entry to be looked at in the next round
  die "Same seq ID 3 or more times in a row!(should be 2 max)" if ($seqid eq $identifier);
  _buffer_paired_end_alignment($index,$seqid,$newline_1,$newline_2);
  return 0; # not processing anything this round as the alignments seen for this sequence pair were in the wrong orientation
}

### Private helper: of the two read IDs of an alignment pair, finds the one ending on /1 (the first ID is
### tried first) and removes that tag. Returns the stripped ID followed by the other, untouched ID.
### Dies if neither of the two IDs ends on /1.
sub _strip_read_1_tag_paired_ends{
  my ($id_1,$id_2) = @_;
  if ($id_1 =~ s/\/1$//){ # removing the read /1 tag
    return ($id_1,$id_2);
  }
  if ($id_2 =~ s/\/1$//){ # removing the read /1 tag
    return ($id_2,$id_1);
  }
  die "One of the two reads needs to end on /1!!";
}

### Private helper: stores a sequence ID and its two alignment lines as the buffered entry of $fhs[$index]
### (passing undef for all three marks the end of the bowtie output).
sub _buffer_paired_end_alignment{
  my ($index,$seq_id,$line_1,$line_2) = @_;
  $fhs[$index]->{last_seq_id} = $seq_id;
  $fhs[$index]->{last_line_1} = $line_1;
  $fhs[$index]->{last_line_2} = $line_2;
  return;
}
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3203
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3204 ### EXTRACT GENOMIC SEQUENCE | BOWTIE 1 | PAIRED-END
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3205
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3206 sub extract_corresponding_genomic_sequence_paired_ends {
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3207 my ($sequence_identifier,$methylation_call_params) = @_;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3208 ### A bisulfite sequence pair for 1 location in the genome can theoretically be on any of the 4 possible converted strands. We are also giving the
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3209 ### sequence a 'memory' of the conversion we are expecting which we will need later for the methylation call
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3210 my $alignment_read_1;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3211 my $alignment_read_2;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3212 my $read_conversion_info_1;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3213 my $read_conversion_info_2;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3214 my $genome_conversion;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3215
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3216 ### Now extracting the same sequence from the mouse genomic sequence, +2 extra bases at one of the ends so that we can also make a CpG, CHG or CHH methylation call
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3217 ### if the C happens to be at the first or last position of the actually observed sequence
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3218 my $non_bisulfite_sequence_1;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3219 my $non_bisulfite_sequence_2;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3220
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3221 ### all alignments reported by bowtie have the + alignment first and the - alignment as the second one irrespective of whether read 1 or read 2 was
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3222 ### the + alignment. We however always read in sequences read 1 then read 2, so if read 2 is the + alignment we need to swap the extracted genomic
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3223 ### sequences around!
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3224 ### results from CT converted read 1 plus GA converted read 2 vs. CT converted genome (+/- orientation alignments are reported only)
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3225 if ($methylation_call_params->{$sequence_identifier}->{index} == 0){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3226 ### [Index 0, sequence originated from (converted) forward strand]
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3227 $counting{CT_GA_CT_count}++;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3228 $alignment_read_1 = '+';
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3229 $alignment_read_2 = '-';
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3230 $read_conversion_info_1 = 'CT';
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3231 $read_conversion_info_2 = 'GA';
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3232 $genome_conversion = 'CT';
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3233 ### SEQUENCE 1 (this is always the forward hit, in this case it is read 1)
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3234 ### for hits on the forward strand we need to capture 2 extra bases at the 3' end
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3235
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3236 $non_bisulfite_sequence_1 = substr ($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}},$methylation_call_params->{$sequence_identifier}->{start_seq_1},length($methylation_call_params->{$sequence_identifier}->{bowtie_sequence_1})+2); ##CHH change
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3237
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3238 ### SEQUENCE 2 (this will always be on the reverse strand, in this case it is read 2)
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3239 ### As the second conversion is GA we need to capture 1 base 3', so that it is a 5' base after reverse complementation
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3240 if (length($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}}) > $methylation_call_params->{$sequence_identifier}->{start_seq_2}+length($methylation_call_params->{$sequence_identifier}->{bowtie_sequence_2})+1){ ## CHH change to +1
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3241
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3242 $non_bisulfite_sequence_2 = substr ($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}},($methylation_call_params->{$sequence_identifier}->{start_seq_2}),length($methylation_call_params->{$sequence_identifier}->{bowtie_sequence_2})+2);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3243 ### the reverse strand sequence needs to be reverse complemented
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3244 $non_bisulfite_sequence_2 = reverse_complement($non_bisulfite_sequence_2);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3245 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3246 else{
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3247 $non_bisulfite_sequence_2 = '';
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3248 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3249 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3250
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3251 ### results from GA converted read 1 plus CT converted read 2 vs. GA converted genome (+/- orientation alignments are reported only)
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3252 elsif ($methylation_call_params->{$sequence_identifier}->{index} == 1){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3253 ### [Index 1, sequence originated from complementary to (converted) reverse strand]
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3254 $counting{GA_CT_GA_count}++;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3255 $alignment_read_1 = '+';
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3256 $alignment_read_2 = '-';
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3257 $read_conversion_info_1 = 'GA';
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3258 $read_conversion_info_2 = 'CT';
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3259 $genome_conversion = 'GA';
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3260
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3261 ### SEQUENCE 1 (this is always the forward hit, in this case it is read 1)
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3262 ### as we need to make the methylation call for the base 5' of the first base (GA conversion!) we need to capture 2 extra bases at the 5' end
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3263 if ($methylation_call_params->{$sequence_identifier}->{start_seq_1}-1 > 0){ ## CHH change to -1
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3264 $non_bisulfite_sequence_1 = substr ($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}},$methylation_call_params->{$sequence_identifier}->{start_seq_1}-2,length($methylation_call_params->{$sequence_identifier}->{bowtie_sequence_1})+2); ### CHH change to -2/+2
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3265 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3266 else{
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3267 $non_bisulfite_sequence_1 = '';
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3268 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3269
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3270 ### SEQUENCE 2 (this will always be on the reverse strand, in this case it is read 2)
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3271 ### As we are doing a CT comparison for the reverse strand we are taking 2 bases extra at the 5' end, so it is a 3' base after reverse complementation
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3272 $non_bisulfite_sequence_2 = substr ($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}},($methylation_call_params->{$sequence_identifier}->{start_seq_2})-2,length($methylation_call_params->{$sequence_identifier}->{bowtie_sequence_2})+2); ### CHH change to -2/+2
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3273 ### the reverse strand sequence needs to be reverse complemented
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3274 $non_bisulfite_sequence_2 = reverse_complement($non_bisulfite_sequence_2);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3275 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3276
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3277 ### results from GA converted read 1 plus CT converted read 2 vs. CT converted genome (-/+ orientation alignments are reported only)
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3278 elsif ($methylation_call_params->{$sequence_identifier}->{index} == 2){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3279 ### [Index 2, sequence originated from the complementary to (converted) forward strand]
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3280 $counting{GA_CT_CT_count}++;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3281 $alignment_read_1 = '-';
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3282 $alignment_read_2 = '+';
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3283 $read_conversion_info_1 = 'GA';
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3284 $read_conversion_info_2 = 'CT';
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3285 $genome_conversion = 'CT';
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3286
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3287 ### Here we switch the sequence information round!! non_bisulfite_sequence_1 will later correspond to the read 1!!!!
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3288 ### SEQUENCE 1 (this is always the forward hit, in this case it is READ 2), read 1 is in - orientation on the reverse strand
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3289 ### As read 1 is GA converted we need to capture 2 extra 3' bases which will be 2 extra 5' base after reverse complementation
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3290 $non_bisulfite_sequence_1 = substr ($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}},($methylation_call_params->{$sequence_identifier}->{start_seq_2}),length($methylation_call_params->{$sequence_identifier}->{bowtie_sequence_2})+2); ### CHH change to +2
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3291 ### the reverse strand sequence needs to be reverse complemented
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3292 $non_bisulfite_sequence_1 = reverse_complement($non_bisulfite_sequence_1);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3293
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3294 ### SEQUENCE 2 (this will always be on the reverse strand, in this case it is READ 1)
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3295 ### non_bisulfite_sequence_2 will later correspond to the read 2!!!!
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3296 ### Read 2 is CT converted so we need to capture 2 extra 3' bases
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3297 if (length($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}}) > ($methylation_call_params->{$sequence_identifier}->{start_seq_1})+length($methylation_call_params->{$sequence_identifier}->{bowtie_sequence_1})+1){ ## CHH change to +1
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3298 $non_bisulfite_sequence_2 = substr ($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}},($methylation_call_params->{$sequence_identifier}->{start_seq_1}),length($methylation_call_params->{$sequence_identifier}->{bowtie_sequence_1})+2); ## CHH changed from +1 to +2
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3299 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3300 else{
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3301 $non_bisulfite_sequence_2 = '';
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3302 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3303 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3304
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3305 ### results from CT converted read 1 plus GA converted read 2 vs. GA converted genome (-/+ orientation alignments are reported only)
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3306 elsif ($methylation_call_params->{$sequence_identifier}->{index} == 3){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3307 ### [Index 3, sequence originated from the (converted) reverse strand]
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3308 $counting{CT_GA_GA_count}++;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3309 $alignment_read_1 = '-';
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3310 $alignment_read_2 = '+';
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3311 $read_conversion_info_1 = 'CT';
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3312 $read_conversion_info_2 = 'GA';
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3313 $genome_conversion = 'GA';
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3314
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3315 ### Here we switch the sequence information round!! non_bisulfite_sequence_1 will later correspond to the read 1!!!!
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3316 ### SEQUENCE 1 (this is always the forward hit, in this case it is READ 2), read 1 is in - orientation on the reverse strand
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3317 ### As read 1 is CT converted we need to capture 2 extra 5' bases which will be 2 extra 3' base after reverse complementation
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3318 if ( ($methylation_call_params->{$sequence_identifier}->{start_seq_2}-1) > 0){ ## CHH changed to -1
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3319 $non_bisulfite_sequence_1 = substr ($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}},($methylation_call_params->{$sequence_identifier}->{start_seq_2})-2,length($methylation_call_params->{$sequence_identifier}->{bowtie_sequence_2})+2); ### CHH changed to -2/+2
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3320 ### the reverse strand sequence needs to be reverse complemented
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3321 $non_bisulfite_sequence_1 = reverse_complement($non_bisulfite_sequence_1);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3322 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3323 else{
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3324 $non_bisulfite_sequence_1 = '';
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3325 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3326
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3327 ### SEQUENCE 2 (this will always be on the reverse strand, in this case it is READ 1)
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3328 ### non_bisulfite_sequence_2 will later correspond to the read 2!!!!
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3329 ### Read 2 is GA converted so we need to capture 2 extra 5' bases
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3330 $non_bisulfite_sequence_2 = substr ($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}},($methylation_call_params->{$sequence_identifier}->{start_seq_1})-2,length($methylation_call_params->{$sequence_identifier}->{bowtie_sequence_1})+2); ### CHH changed to -2/+2
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3331 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3332 else{
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3333 die "Too many bowtie result filehandles\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3334 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3335 ### the alignment_strand information is needed to determine which strand of the genomic sequence we are comparing the read against,
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3336 ### the read_conversion information is needed to know whether we are looking for C->T or G->A substitutions
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3337
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3338 $methylation_call_params->{$sequence_identifier}->{alignment_read_1} = $alignment_read_1;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3339 $methylation_call_params->{$sequence_identifier}->{alignment_read_2} = $alignment_read_2;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3340 $methylation_call_params->{$sequence_identifier}->{genome_conversion} = $genome_conversion;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3341 $methylation_call_params->{$sequence_identifier}->{read_conversion_1} = $read_conversion_info_1;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3342 $methylation_call_params->{$sequence_identifier}->{read_conversion_2} = $read_conversion_info_2;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3343 $methylation_call_params->{$sequence_identifier}->{unmodified_genomic_sequence_1} = $non_bisulfite_sequence_1;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3344 $methylation_call_params->{$sequence_identifier}->{unmodified_genomic_sequence_2} = $non_bisulfite_sequence_2;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3345 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3346
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3347 ### EXTRACT GENOMIC SEQUENCE BOWTIE 2 | PAIRED-END
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3348
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3349 sub extract_corresponding_genomic_sequence_paired_ends_bowtie2{
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3350 my ($sequence_identifier,$methylation_call_params) = @_;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3351 ### A bisulfite sequence pair for 1 location in the genome can theoretically be on any of the 4 possible converted strands. We are also giving the
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3352 ### sequence a 'memory' of the conversion we are expecting which we will need later for the methylation call
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3353
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3354 my $cigar_1 = $methylation_call_params->{$sequence_identifier}->{CIGAR_1};
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3355 my $cigar_2 = $methylation_call_params->{$sequence_identifier}->{CIGAR_2};
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3356 my $flag_1 = $methylation_call_params->{$sequence_identifier}->{flag_1};
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3357 my $flag_2 = $methylation_call_params->{$sequence_identifier}->{flag_2};
3
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
3358 # print "$cigar_1\t$cigar_2\t$flag_1\t$flag_2\n";
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
3359 # sleep(10);
0
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3360 ### We are now extracting the corresponding genomic sequence, +2 extra bases at the end (or start) so that we can also make a CpG methylation call and
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3361 ### in addition make differential calls for Cs in CHG or CHH context if the C happens to be at the last (or first) position of the actually observed sequence
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3362
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3363 ### the alignment_strand information is needed to determine which strand of the genomic sequence we are comparing the read against,
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3364 ### the read_conversion information is needed to know whether we are looking for C->T or G->A substitutions
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3365 my $alignment_read_1;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3366 my $alignment_read_2;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3367 my $read_conversion_info_1;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3368 my $read_conversion_info_2;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3369 my $genome_conversion;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3370
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3371 ### Now extracting the same sequence from the mouse genomic sequence, +2 extra bases at one of the ends so that we can also make a CpG, CHG or CHH methylation call
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3372 ### if the C happens to be at the last position of the actually observed sequence
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3373 my $non_bisulfite_sequence_1 = '';
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3374 my $non_bisulfite_sequence_2 = '';
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3375
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3376 ### Positions in SAM format are 1-based, so we need to subtract 1 when getting substrings
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3377 my $pos_1 = $methylation_call_params->{$sequence_identifier}->{position_1}-1;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3378 my $pos_2 = $methylation_call_params->{$sequence_identifier}->{position_2}-1;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3379
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3380 # parsing CIGAR 1 string
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3381 my @len_1 = split (/\D+/,$cigar_1); # storing the length per operation
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3382 my @ops_1 = split (/\d+/,$cigar_1); # storing the operation
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3383 shift @ops_1; # remove the empty first element
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3384 die "CIGAR 1 string contained a non-matching number of lengths and operations\n" unless (scalar @len_1 == scalar @ops_1);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3385 # parsing CIGAR 2 string
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3386 my @len_2 = split (/\D+/,$cigar_2); # storing the length per operation
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3387 my @ops_2 = split (/\d+/,$cigar_2); # storing the operation
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3388 shift @ops_2; # remove the empty first element
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3389 die "CIGAR 2 string contained a non-matching number of lengths and operations\n" unless (scalar @len_2 == scalar @ops_2);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3390
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3391 my $indels_1 = 0; # addiong these to the hemming distance value (needed for the NM field in the final SAM output
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3392 my $indels_2 = 0;
3
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
3393
0
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3394 ### Extracting read 1 genomic sequence ###
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3395
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3396 # extracting 2 additional bp at the 5' end (read 1)
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3397 if ( ($methylation_call_params->{$sequence_identifier}->{index} == 1) or ($methylation_call_params->{$sequence_identifier}->{index} == 3) ){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3398 # checking if the substring will be valid or if we can't extract the sequence because we are right at the edge of a chromosome
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3399 unless ( ($pos_1-2) > 0){# exiting with en empty genomic sequence otherwise
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3400 $methylation_call_params->{$sequence_identifier}->{unmodified_genomic_sequence_1} = $non_bisulfite_sequence_1;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3401 return;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3402 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3403 $non_bisulfite_sequence_1 .= substr ($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}},$pos_1-2,2);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3404 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3405
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3406 foreach (0..$#len_1){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3407 if ($ops_1[$_] eq 'M'){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3408 # extracting genomic sequence
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3409 $non_bisulfite_sequence_1 .= substr ($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}},$pos_1,$len_1[$_]);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3410 # warn "$non_bisulfite_sequence_1\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3411 # adjusting position
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3412 $pos_1 += $len_1[$_];
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3413 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3414 elsif ($ops_1[$_] eq 'I'){ # insertion in the read sequence
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3415 # we simply add padding Ns instead of finding genomic sequence. This will not be used to infer methylation calls
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3416 $non_bisulfite_sequence_1 .= 'N' x $len_1[$_];
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3417 # warn "$non_bisulfite_sequence_1\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3418 # position doesn't need adjusting
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3419 $indels_1 += $len_1[$_]; # adding to $indels_1 to determine the hemming distance (= single base mismatches, insertions or deletions) for the SAM output
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3420 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3421 elsif ($ops_1[$_] eq 'D'){ # deletion in the read sequence
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3422 # we do not add any genomic sequence but only adjust the position
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3423 # warn "Just adjusting the position by: ",$len_1[$_],"bp\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3424 $pos_1 += $len_1[$_];
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3425 $indels_1 += $len_1[$_]; # adding to $indels_1 to determine the hemming distance (= single base mismatches, insertions or deletions) for the SAM output
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3426 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3427 elsif($cigar_1 =~ tr/[NSHPX=]//){ # if these (for standard mapping) illegal characters exist we die
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3428 die "The CIGAR 1 string contained illegal CIGAR operations in addition to 'M', 'I' and 'D': $cigar_1\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3429 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3430 else{
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3431 die "The CIGAR 1 string contained undefined CIGAR operations in addition to 'M', 'I' and 'D': $cigar_1\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3432 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3433 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3434
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3435 ### 3' end of read 1
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3436 if ( ($methylation_call_params->{$sequence_identifier}->{index} == 0) or ($methylation_call_params->{$sequence_identifier}->{index} == 2) ){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3437 ## checking if the substring will be valid or if we can't extract the sequence because we are right at the edge of a chromosome
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3438 unless (length($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}}) >= $pos_1+2){# exiting with en empty genomic sequence otherwise
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3439 $methylation_call_params->{$sequence_identifier}->{unmodified_genomic_sequence_1} = $non_bisulfite_sequence_1;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3440 return;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3441 }
3
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
3442
0
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3443 $non_bisulfite_sequence_1 .= substr ($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}},$pos_1,2);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3444 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3445
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3446
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3447 ### Extracting read 2 genomic sequence ###
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3448
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3449 ### 5' end of read 2
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3450 if ( ($methylation_call_params->{$sequence_identifier}->{index} == 1) or ($methylation_call_params->{$sequence_identifier}->{index} == 3) ){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3451 ## checking if the substring will be valid or if we can't extract the sequence because we are right at the edge of a chromosome
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3452 unless ( ($pos_2-2) >= 0){# exiting with en empty genomic sequence otherwise
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3453 $methylation_call_params->{$sequence_identifier}->{unmodified_genomic_sequence_2} = $non_bisulfite_sequence_2;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3454 return;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3455 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3456 $non_bisulfite_sequence_2 .= substr ($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}},$pos_2-2,2);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3457 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3458
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3459 foreach (0..$#len_2){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3460 if ($ops_2[$_] eq 'M'){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3461 # extracting genomic sequence
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3462 $non_bisulfite_sequence_2 .= substr ($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}},$pos_2,$len_2[$_]);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3463 # warn "$non_bisulfite_sequence_2\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3464 # adjusting position
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3465 $pos_2 += $len_2[$_];
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3466 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3467 elsif ($ops_2[$_] eq 'I'){ # insertion in the read sequence
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3468 # we simply add padding Ns instead of finding genomic sequence. This will not be used to infer methylation calls
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3469 $non_bisulfite_sequence_2 .= 'N' x $len_2[$_];
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3470 # warn "$non_bisulfite_sequence_2\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3471 # position doesn't need adjusting
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3472 $indels_2 += $len_2[$_]; # adding to $indels_1 to determine the hemming distance (= single base mismatches, insertions or deletions) for the SAM output
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3473 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3474 elsif ($ops_2[$_] eq 'D'){ # deletion in the read sequence
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3475 # we do not add any genomic sequence but only adjust the position
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3476 # warn "Just adjusting the position by: ",$len_2[$_],"bp\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3477 $pos_2 += $len_2[$_];
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3478 $indels_2 += $len_2[$_]; # adding to $indels_1 to determine the hemming distance (= single base mismatches, insertions or deletions) for the SAM output
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3479 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3480 elsif($cigar_2 =~ tr/[NSHPX=]//){ # if these (for standard mapping) illegal characters exist we die
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3481 die "The CIGAR 2 string contained illegal CIGAR operations in addition to 'M', 'I' and 'D': $cigar_2\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3482 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3483 else{
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3484 die "The CIGAR 2 string contained undefined CIGAR operations in addition to 'M', 'I' and 'D': $cigar_2\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3485 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3486 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3487
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3488 ### 3' end of read 2
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3489 if ( ($methylation_call_params->{$sequence_identifier}->{index} == 0) or ($methylation_call_params->{$sequence_identifier}->{index} == 2) ){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3490 ## checking if the substring will be valid or if we can't extract the sequence because we are right at the edge of a chromosome
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3491 unless (length($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}}) >= $pos_2+2){# exiting with en empty genomic sequence otherwise
3
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
3492 # need to set read 1 as well now to prevent warning
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
3493 # warn "'$non_bisulfite_sequence_1'\n'$non_bisulfite_sequence_2'\n\n";
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
3494 # sleep(5);
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
3495 $methylation_call_params->{$sequence_identifier}->{unmodified_genomic_sequence_1} = $non_bisulfite_sequence_1;
0
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3496 $methylation_call_params->{$sequence_identifier}->{unmodified_genomic_sequence_2} = $non_bisulfite_sequence_2;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3497 return;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3498 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3499 $non_bisulfite_sequence_2 .= substr ($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}},$pos_2,2);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3500 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3501
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3502 ### all paired-end alignments reported by Bowtie 2 have the Read 1 alignment first and the Read 2 alignment as the second one irrespective of whether read 1 or read 2 was
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3503 ### the + alignment. We also read in sequences read 1 then read 2 so they should correspond perfectly
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3504
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3505 ### results from CT converted read 1 plus GA converted read 2 vs. CT converted genome (+/- orientation alignments are reported only)
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3506 if ($methylation_call_params->{$sequence_identifier}->{index} == 0){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3507 ### [Index 0, sequence originated from (converted) forward strand]
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3508 $counting{CT_GA_CT_count}++;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3509 $alignment_read_1 = '+';
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3510 $alignment_read_2 = '-';
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3511 $read_conversion_info_1 = 'CT';
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3512 $read_conversion_info_2 = 'GA';
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3513 $genome_conversion = 'CT';
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3514 ### Read 1 is always the forward hit
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3515 ### Read 2 will always be on the reverse strand, so it needs to be reverse complemented
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3516 $non_bisulfite_sequence_2 = reverse_complement($non_bisulfite_sequence_2);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3517 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3518
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3519 ### results from GA converted read 1 plus CT converted read 2 vs. GA converted genome (+/- orientation alignments are reported only)
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3520 elsif ($methylation_call_params->{$sequence_identifier}->{index} == 1){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3521 ### [Index 1, sequence originated from complementary to (converted) bottom strand]
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3522 $counting{GA_CT_GA_count}++;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3523 $alignment_read_1 = '+';
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3524 $alignment_read_2 = '-';
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3525 $read_conversion_info_1 = 'GA';
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3526 $read_conversion_info_2 = 'CT';
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3527 $genome_conversion = 'GA';
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3528 ### Read 1 is always the forward hit
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3529 ### Read 2 will always be on the reverse strand, so it needs to be reverse complemented
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3530 $non_bisulfite_sequence_2 = reverse_complement($non_bisulfite_sequence_2);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3531 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3532
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3533 ### results from GA converted read 1 plus CT converted read 2 vs. CT converted genome (-/+ orientation alignments are reported only)
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3534 elsif ($methylation_call_params->{$sequence_identifier}->{index} == 2){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3535 ### [Index 2, sequence originated from the complementary to (converted) top strand]
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3536 $counting{GA_CT_CT_count}++;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3537 $alignment_read_1 = '-';
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3538 $alignment_read_2 = '+';
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3539 $read_conversion_info_1 = 'GA';
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3540 $read_conversion_info_2 = 'CT';
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3541 $genome_conversion = 'CT';
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3542
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3543 ### Read 1 (the reverse strand) genomic sequence needs to be reverse complemented
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3544 $non_bisulfite_sequence_1 = reverse_complement($non_bisulfite_sequence_1);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3545 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3546
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3547 ### results from CT converted read 1 plus GA converted read 2 vs. GA converted genome (-/+ orientation alignments are reported only)
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3548 elsif ($methylation_call_params->{$sequence_identifier}->{index} == 3){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3549 ### [Index 3, sequence originated from the (converted) reverse strand]
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3550 $counting{CT_GA_GA_count}++;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3551 $alignment_read_1 = '-';
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3552 $alignment_read_2 = '+';
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3553 $read_conversion_info_1 = 'CT';
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3554 $read_conversion_info_2 = 'GA';
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3555 $genome_conversion = 'GA';
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3556 ### Read 1 (the reverse strand) genomic sequence needs to be reverse complemented
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3557 $non_bisulfite_sequence_1 = reverse_complement($non_bisulfite_sequence_1);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3558 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3559 else{
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3560 die "Too many bowtie result filehandles\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3561 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3562 ### the alignment_strand information is needed to determine which strand of the genomic sequence we are comparing the read against,
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3563 ### the read_conversion information is needed to know whether we are looking for C->T or G->A substitutions
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3564
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3565 $methylation_call_params->{$sequence_identifier}->{alignment_read_1} = $alignment_read_1;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3566 $methylation_call_params->{$sequence_identifier}->{alignment_read_2} = $alignment_read_2;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3567 $methylation_call_params->{$sequence_identifier}->{genome_conversion} = $genome_conversion;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3568 $methylation_call_params->{$sequence_identifier}->{read_conversion_1} = $read_conversion_info_1;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3569 $methylation_call_params->{$sequence_identifier}->{read_conversion_2} = $read_conversion_info_2;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3570 $methylation_call_params->{$sequence_identifier}->{unmodified_genomic_sequence_1} = $non_bisulfite_sequence_1;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3571 $methylation_call_params->{$sequence_identifier}->{unmodified_genomic_sequence_2} = $non_bisulfite_sequence_2;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3572 ## the end position of a read is stored in $pos
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3573 $methylation_call_params->{$sequence_identifier}->{end_position_1} = $pos_1;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3574 $methylation_call_params->{$sequence_identifier}->{end_position_2} = $pos_2;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3575 $methylation_call_params->{$sequence_identifier}->{indels_1} = $indels_1;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3576 $methylation_call_params->{$sequence_identifier}->{indels_2} = $indels_2;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3577 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3578
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3579 ##########################################
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3580 ### PRINT SINGLE END RESULTS: Bowtie 1 ###
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3581 ##########################################
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3582
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
sub print_bisulfite_mapping_result_single_end{
  ### Writes one aligned single-end read (Bowtie 1 mode) together with its methylation call.
  ###
  ### Arguments:
  ###   $identifier              - read ID, used as the key into $methylation_call_params
  ###   $sequence                - the read sequence
  ###   $methylation_call_params - hash reference holding the per-read alignment/methylation details
  ###   $quality_value           - quality string of the read
  ###
  ### Uses the file-level flags $phred64, $solexa and $vanilla and, in vanilla mode, the OUT filehandle.
  my ($identifier,$sequence,$methylation_call_params,$quality_value)= @_;

  ### Qualities are always written in Sanger encoding (Phred 33 scale), so other encodings are converted first
  $quality_value = $phred64 ? convert_phred64_quals_to_phred33($quality_value)
                 : $solexa  ? convert_solexa_quals_to_phred33($quality_value)
                 :            $quality_value;

  ### Bowtie 1 reports the index and not the bp position, so the start position is shifted by +1 bp.
  ### The change is made in the shared hash entry, i.e. it is also seen by the SAM output routine and by the caller.
  $methylation_call_params->{$identifier}->{position} += 1;

  ### SAM output, default since Bismark v1.0.0 (single_end_SAM_output is found at the end of the script)
  unless ($vanilla){
    return single_end_SAM_output($identifier,$sequence,$methylation_call_params,$quality_value);
  }

  ### vanilla (tab-delimited) output: one line per uniquely mapped read
  my $hit = $methylation_call_params->{$identifier};
  my @columns = (
    $identifier,
    @{$hit}{qw(alignment_strand chromosome position end_position)},
    $sequence,
    @{$hit}{qw(unmodified_genomic_sequence methylation_call read_conversion genome_conversion)},
    $quality_value,
  );
  print OUT join("\t",@columns) . "\n";
}
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3606
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3607 ##########################################
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3608 ### PRINT SINGLE END RESULTS: Bowtie 2 ###
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3609 ##########################################
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3610
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
sub print_bisulfite_mapping_result_single_end_bowtie2{
  ### Writes one aligned single-end read (Bowtie 2 mode) and its methylation call as SAM.
  ###
  ### Arguments:
  ###   $identifier              - read ID, used as the key into $methylation_call_params
  ###   $sequence                - the read sequence
  ###   $methylation_call_params - hash reference holding the per-read alignment/methylation details
  ###   $quality_value           - quality string of the read
  ###
  ### Uses the file-level flags $phred64 and $solexa.
  my ($identifier,$sequence,$methylation_call_params,$quality_value)= @_;

  ### Qualities are always written in Sanger encoding (Phred 33 scale), so other encodings are converted first
  $quality_value = $phred64 ? convert_phred64_quals_to_phred33($quality_value)
                 : $solexa  ? convert_solexa_quals_to_phred33($quality_value)
                 :            $quality_value;

  ### every mapped read goes to the SAM output file (unmapped and ambiguous reads were already printed);
  ### single_end_SAM_output is found at the end of the script
  return single_end_SAM_output($identifier,$sequence,$methylation_call_params,$quality_value);
}
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3625
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3626 ##########################################
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3627 ### PRINT PAIRED END RESULTS: Bowtie 1 ###
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3628 ##########################################
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3629
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
sub print_bisulfite_mapping_results_paired_ends{
  ### Writes one aligned read pair (Bowtie 1 mode) together with the methylation calls of both reads.
  ###
  ### Arguments:
  ###   $identifier                        - read pair ID, used as the key into $methylation_call_params
  ###   $sequence_1, $sequence_2           - the sequences of read 1 and read 2
  ###   $methylation_call_params           - hash reference holding the per-pair alignment/methylation details
  ###   $quality_value_1, $quality_value_2 - quality strings of read 1 and read 2
  ###
  ### Uses the file-level flags $phred64, $solexa and $vanilla and, in vanilla mode, the OUT filehandle.
  my ($identifier,$sequence_1,$sequence_2,$methylation_call_params,$quality_value_1,$quality_value_2)= @_;

  ### Qualities are always written in Sanger encoding (Phred 33 scale); $phred64 takes precedence over $solexa
  if ($phred64 or $solexa){
    my $to_phred33 = $phred64 ? \&convert_phred64_quals_to_phred33 : \&convert_solexa_quals_to_phred33;
    ($quality_value_1,$quality_value_2) = map { scalar $to_phred33->($_) } ($quality_value_1,$quality_value_2);
  }

  ### Bowtie 1 reports the index and not the bp position, so the start position is shifted by +1 bp
  ### (the end position is already 1-based). The change is made in the shared hash entry.
  $methylation_call_params->{$identifier}->{start_seq_1} += 1;

  ### SAM output, default since Bismark v1.0.0 (paired_end_SAM_output is found at the end of the script)
  unless ($vanilla){
    return paired_end_SAM_output($identifier,$sequence_1,$sequence_2,$methylation_call_params,$quality_value_1,$quality_value_2);
  }

  ### vanilla (tab-delimited) output: one line per aligned read pair.
  ### Only the conversion of read 1 is written (read_conversion_1).
  my $pair = $methylation_call_params->{$identifier};
  my @columns = (
    $identifier,
    @{$pair}{qw(alignment_read_1 chromosome start_seq_1 alignment_end)},
    $sequence_1,
    @{$pair}{qw(unmodified_genomic_sequence_1 methylation_call_1)},
    $sequence_2,
    @{$pair}{qw(unmodified_genomic_sequence_2 methylation_call_2 read_conversion_1 genome_conversion)},
    $quality_value_1,
    $quality_value_2,
  );
  print OUT join("\t",@columns) . "\n";
}
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3656
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3657 ##########################################
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3658 ### PRINT PAIRED END RESULTS: Bowtie 2 ###
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3659 ##########################################
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3660
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
sub print_bisulfite_mapping_results_paired_ends_bowtie2{
  ### Writes one aligned read pair (Bowtie 2 mode) and the methylation calls of both reads as SAM.
  ###
  ### Arguments:
  ###   $identifier                        - read pair ID, used as the key into $methylation_call_params
  ###   $sequence_1, $sequence_2           - the sequences of read 1 and read 2
  ###   $methylation_call_params           - hash reference holding the per-pair alignment/methylation details
  ###   $quality_value_1, $quality_value_2 - quality strings of read 1 and read 2
  ###
  ### Uses the file-level flags $phred64 and $solexa.
  my ($identifier,$sequence_1,$sequence_2,$methylation_call_params,$quality_value_1,$quality_value_2)= @_;

  ### Qualities are always written in Sanger encoding (Phred 33 scale); $phred64 takes precedence over $solexa
  if ($phred64 or $solexa){
    my $to_phred33 = $phred64 ? \&convert_phred64_quals_to_phred33 : \&convert_solexa_quals_to_phred33;
    ($quality_value_1,$quality_value_2) = map { scalar $to_phred33->($_) } ($quality_value_1,$quality_value_2);
  }

  ### every aligned pair goes to the SAM output file (unmapped and ambiguous reads were already printed);
  ### paired_end_SAM_output is found at the end of the script
  return paired_end_SAM_output($identifier,$sequence_1,$sequence_2,$methylation_call_params,$quality_value_1,$quality_value_2);
}
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3678
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3679
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
sub convert_phred64_quals_to_phred33{
  ### Re-encodes a Phred+64 quality string as a Phred+33 (Sanger) quality string.
  ### Takes the quality string as its only argument and returns the converted string of the same length.
  my ($phred64_string) = @_;

  ### each character is decoded to its Phred score and then re-encoded with the Phred 33 offset
  my @phred33_chars = map {
    scalar convert_phred_score_into_phred33_quality_string( scalar convert_phred64_quality_string_into_phred_score($_) )
  } split (//,$phred64_string);

  return join ("",@phred33_chars);
}
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3695
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
sub convert_solexa_quals_to_phred33{
  ### Re-encodes a Solexa (pre-1.3) quality string as a Phred+33 (Sanger) quality string.
  ### Takes the quality string as its only argument and returns the converted string of the same length.
  my ($solexa_string) = @_;

  ### each character is decoded to a Phred score and then re-encoded with the Phred 33 offset
  my @phred33_chars = map {
    scalar convert_phred_score_into_phred33_quality_string( scalar convert_solexa_pre1_3_quality_string_into_phred_score($_) )
  } split (//,$solexa_string);

  return join ("",@phred33_chars);
}
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3711
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
sub convert_phred_score_into_phred33_quality_string{
  ### Encodes a numeric Phred score as a single Phred+33 quality character, i.e. chr(score + 33)
  my ($phred_score) = @_;
  return chr($phred_score + 33);
}
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3717
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
sub convert_phred64_quality_string_into_phred_score{
  ### Decodes a single Phred+64 quality character into its numeric Phred score, i.e. ord(char) - 64
  my ($quality_char) = @_;
  return ord($quality_char) - 64;
}
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3723
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
sub convert_solexa_pre1_3_quality_string_into_phred_score{
  ### Decodes a single Solexa (pre-1.3) quality character into a numeric score, i.e. ord(char) - 59
  ###
  ### 59 is simply used as the offset here as all Phred Scores between 10 and 40 look exactly the same,
  ### there is only a minute difference for values between 0 and 10
  ###
  ### NOTE(review): with an offset of 59, ';' decodes to 0 and 'h' decodes to 45. Solexa qualities are
  ### usually described with an ASCII offset of 64 - TODO confirm that 59 is the intended offset.
  my ($quality_char) = @_;
  return ord($quality_char) - 59;
}
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3730
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3731
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3732 sub extract_corresponding_genomic_sequence_single_end {
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3733 my ($sequence_identifier,$methylation_call_params) = @_;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3734 ### A bisulfite sequence for 1 location in the genome can theoretically be any of the 4 possible converted strands. We are also giving the
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3735 ### sequence a 'memory' of the conversion we are expecting which we will need later for the methylation call
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3736
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3737 ### the alignment_strand information is needed to determine which strand of the genomic sequence we are comparing the read against,
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3738 ### the read_conversion information is needed to know whether we are looking for C->T or G->A substitutions
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3739 my $alignment_strand;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3740 my $read_conversion_info;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3741 my $genome_conversion;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3742 ### Also extracting the corresponding genomic sequence, +2 extra bases at the end so that we can also make a CpG methylation call and
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3743 ### in addition make differential calls for Cs non-CpG context, which will now be divided into CHG and CHH methylation,
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3744 ### if the C happens to be at the last position of the actually observed sequence
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3745 my $non_bisulfite_sequence;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3746 ### depending on the conversion we want to make need to capture 1 extra base at the 3' end
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3747
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3748 ### results from CT converted read vs. CT converted genome (+ orientation alignments are reported only)
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3749 if ($methylation_call_params->{$sequence_identifier}->{index} == 0){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3750 ### [Index 0, sequence originated from (converted) forward strand]
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3751 $counting{CT_CT_count}++;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3752 $alignment_strand = '+';
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3753 $read_conversion_info = 'CT';
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3754 $genome_conversion = 'CT';
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3755
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3756 ## checking if the substring will be valid or if we can't extract the sequence because we are right at the edge of a chromosome
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3757 if (length($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}}) > $methylation_call_params->{$sequence_identifier}->{position}+length($methylation_call_params->{$sequence_identifier}->{bowtie_sequence})+1){ ## CHH changed to +1
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3758 ### + 2 extra base at the 3' end
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3759 $non_bisulfite_sequence = substr ($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}},$methylation_call_params->{$sequence_identifier}->{position},length($methylation_call_params->{$sequence_identifier}->{bowtie_sequence})+2); ## CHH changed to +2
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3760 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3761 else{
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3762 $non_bisulfite_sequence = '';
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3763 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3764 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3765
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3766 ### results from CT converted reads vs. GA converted genome (- orientation alignments are reported only)
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3767 elsif ($methylation_call_params->{$sequence_identifier}->{index} == 1){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3768 ### [Index 1, sequence originated from (converted) reverse strand]
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3769 $counting{CT_GA_count}++;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3770 $alignment_strand = '-';
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3771 $read_conversion_info = 'CT';
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3772 $genome_conversion = 'GA';
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3773
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3774 ## checking if the substring will be valid or if we can't extract the sequence because we are right at the edge of a chromosome
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3775 if ($methylation_call_params->{$sequence_identifier}->{position}-2 >= 0){ ## CHH changed to -2 # 02 02 2012 Changed this to >= from >
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3776 ### Extracting 2 extra 5' bases on forward strand which will become 2 extra 3' bases after reverse complementation
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3777 $non_bisulfite_sequence = substr ($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}},$methylation_call_params->{$sequence_identifier}->{position}-2,length($methylation_call_params->{$sequence_identifier}->{bowtie_sequence})+2); ## CHH changed to -2/+2
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3778 ## reverse complement!
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3779 $non_bisulfite_sequence = reverse_complement($non_bisulfite_sequence);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3780 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3781 else{
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3782 $non_bisulfite_sequence = '';
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3783 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3784 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3785
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3786 ### results from GA converted reads vs. CT converted genome (- orientation alignments are reported only)
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3787 elsif ($methylation_call_params->{$sequence_identifier}->{index} == 2){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3788 ### [Index 2, sequence originated from complementary to (converted) forward strand]
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3789 $counting{GA_CT_count}++;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3790 $alignment_strand = '-';
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3791 $read_conversion_info = 'GA';
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3792 $genome_conversion = 'CT';
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3793
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3794 ### +2 extra bases on the forward strand 3', which will become 2 extra 5' bases after reverse complementation
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3795 ## checking if the substring will be valid or if we can't extract the sequence because we are right at the edge of a chromosome
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3796 if (length($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}}) > $methylation_call_params->{$sequence_identifier}->{position}+length($methylation_call_params->{$sequence_identifier}->{bowtie_sequence})+1){ ## changed to +1 on 02 02 2012
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3797 $non_bisulfite_sequence = substr ($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}},$methylation_call_params->{$sequence_identifier}->{position},length($methylation_call_params->{$sequence_identifier}->{bowtie_sequence})+2); ## CHH changed to +2
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3798 ## reverse complement!
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3799 $non_bisulfite_sequence = reverse_complement($non_bisulfite_sequence);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3800 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3801 else{
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3802 $non_bisulfite_sequence = '';
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3803 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3804 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3805
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3806 ### results from GA converted reads vs. GA converted genome (+ orientation alignments are reported only)
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3807 elsif ($methylation_call_params->{$sequence_identifier}->{index} == 3){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3808 ### [Index 3, sequence originated from complementary to (converted) reverse strand]
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3809 $counting{GA_GA_count}++;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3810 $alignment_strand = '+';
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3811 $read_conversion_info = 'GA';
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3812 $genome_conversion = 'GA';
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3813
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3814 ## checking if the substring will be valid or if we can't extract the sequence because we are right at the edge of a chromosome
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3815 if ($methylation_call_params->{$sequence_identifier}->{position}-2 >= 0){ ## CHH changed to +2 # 02 02 2012 Changed this to >= from >
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3816 ### +2 extra base at the 5' end as we are nominally checking the converted reverse strand
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3817 $non_bisulfite_sequence = substr ($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}},$methylation_call_params->{$sequence_identifier}->{position}-2,length($methylation_call_params->{$sequence_identifier}->{bowtie_sequence})+2); ## CHH changed to -2/+2
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3818 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3819 else{
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3820 $non_bisulfite_sequence = '';
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3821 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3822 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3823 else{
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3824 die "Too many bowtie result filehandles\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3825 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3826
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3827 $methylation_call_params->{$sequence_identifier}->{alignment_strand} = $alignment_strand;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3828 $methylation_call_params->{$sequence_identifier}->{read_conversion} = $read_conversion_info;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3829 $methylation_call_params->{$sequence_identifier}->{genome_conversion} = $genome_conversion;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3830 $methylation_call_params->{$sequence_identifier}->{unmodified_genomic_sequence} = $non_bisulfite_sequence;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3831
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3832 ### at this point we can also determine the end position of a read
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3833 $methylation_call_params->{$sequence_identifier}->{end_position} = $methylation_call_params->{$sequence_identifier}->{position}+length($methylation_call_params->{$sequence_identifier}->{bowtie_sequence});
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3834 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3835
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
sub extract_corresponding_genomic_sequence_single_end_pbat {
  my ($sequence_identifier,$methylation_call_params) = @_;

  ### Looks up the stretch of genomic sequence a single-end PBAT alignment maps to and stores it, together with the
  ### strand/conversion 'memory' needed later for the methylation call, in $methylation_call_params->{$sequence_identifier}.
  ###
  ### Reads:  {index}, {position}, {chromosome}, {bowtie_sequence} of the read, plus the file-level %chromosomes
  ### Writes: {alignment_strand}, {read_conversion}, {genome_conversion}, {unmodified_genomic_sequence}, {end_position}
  ###         and increments the matching entry of the file-level %counting
  ### Dies if the (shifted) index is not one of 0, 1, 2 or 3.
  ###
  ### The genomic sequence is extracted with 2 extra bases so that a C at the very last position of the observed read can
  ### still be classified as CpG, CHG or CHH. If the 2 extra bases would fall off the edge of the chromosome the genomic
  ### sequence is left empty ('').

  ### One row per strand:
  ###   counter           - key in %counting to increment
  ###   strand            - strand of the genomic sequence the read is compared against
  ###   read_conversion   - whether we are looking for C->T or G->A substitutions in the read
  ###   genome_conversion - conversion of the genome the read was aligned to
  ###   upstream          - true:  the 2 extra bases are taken in front of the alignment start (position-2)
  ###                       false: the 2 extra bases are taken after the end of the alignment
  ###   revcomp           - true if the extracted sequence has to be reverse complemented
  my %strand_settings = (
    ### [Index 0, sequence originated from (converted) forward strand]
    0 => { counter => 'CT_CT_count', strand => '+', read_conversion => 'CT', genome_conversion => 'CT', upstream => 0, revcomp => 0 },
    ### [Index 1, sequence originated from (converted) reverse strand]
    1 => { counter => 'CT_GA_count', strand => '-', read_conversion => 'CT', genome_conversion => 'GA', upstream => 1, revcomp => 1 },
    ### [Index 2, sequence originated from complementary to (converted) forward strand]
    2 => { counter => 'GA_CT_count', strand => '-', read_conversion => 'GA', genome_conversion => 'CT', upstream => 0, revcomp => 1 },
    ### [Index 3, sequence originated from complementary to (converted) reverse strand]
    3 => { counter => 'GA_GA_count', strand => '+', read_conversion => 'GA', genome_conversion => 'GA', upstream => 1, revcomp => 0 },
  );

  ### Reference to the read's own parameter hash inside $methylation_call_params, so every write below lands in the caller's structure
  my $read = \%{ $methylation_call_params->{$sequence_identifier} };

  ### The stored index is shifted by 2 before the strand is looked up (the original note here: indexes 0 and 1 are simply not run)
  my $pbat_index = $read->{index} + 2;

  my $settings = $strand_settings{$pbat_index};
  die "Too many bowtie result filehandles\n" unless ($settings);

  $counting{ $settings->{counter} }++;

  my $start       = $read->{position};
  my $read_length = length($read->{bowtie_sequence});

  ## checking if the substring will be valid or if we can't extract the sequence because we are right at the edge of a chromosome
  my $within_bounds;
  if ($settings->{upstream}){
    $within_bounds = ($start-2 >= 0);
  }
  else{
    $within_bounds = (length($chromosomes{$read->{chromosome}}) > $start+$read_length+1);
  }

  my $non_bisulfite_sequence = '';
  if ($within_bounds){
    ### read length + 2 bases, starting 2 bases early if the extra bases are wanted on the upstream side
    my $offset = $settings->{upstream} ? $start-2 : $start;
    $non_bisulfite_sequence = substr ($chromosomes{$read->{chromosome}},$offset,$read_length+2);

    ## reverse complement! (the extra bases swap ends along with the rest of the sequence)
    if ($settings->{revcomp}){
      $non_bisulfite_sequence = reverse_complement($non_bisulfite_sequence);
    }
  }

  $read->{alignment_strand}            = $settings->{strand};
  $read->{read_conversion}             = $settings->{read_conversion};
  $read->{genome_conversion}           = $settings->{genome_conversion};
  $read->{unmodified_genomic_sequence} = $non_bisulfite_sequence;

  ### at this point we can also determine the end position of a read
  $read->{end_position} = $start+$read_length;
}
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3941
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3942
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
sub extract_corresponding_genomic_sequence_single_end_bowtie2{
  my ($sequence_identifier,$methylation_call_params) = @_;

  ### Rebuilds the genomic sequence a single-end Bowtie 2 alignment maps to by walking its CIGAR string, and stores it,
  ### together with the strand/conversion 'memory' needed later for the methylation call, in
  ### $methylation_call_params->{$sequence_identifier}.
  ###
  ### Reads:  {CIGAR}, {index}, {position}, {chromosome} of the read, plus the file-level %chromosomes
  ### Writes: {alignment_strand}, {read_conversion}, {genome_conversion}, {unmodified_genomic_sequence}, {end_position},
  ###         {indels} and increments the matching entry of the file-level %counting
  ### Dies on a malformed CIGAR string, on CIGAR operations other than 'M', 'I' and 'D', and on an index other than 0-3.
  ###
  ### 2 extra genomic bases are added in front of the alignment (index 1 and 3) or behind it (index 0 and 2) so that a C
  ### at the last (or first) position of the observed read can still be classified as CpG, CHG or CHH.
  ### If those 2 bases are not available because the read sits at the edge of a chromosome the sub returns early and
  ### ONLY {unmodified_genomic_sequence} is set: to '' for index 1/3, and to the sequence collected so far (i.e. without
  ### the 2 extra bases) for index 0/2. None of the other fields is written and no counter is incremented in that case.

  ### Reference to the read's own parameter hash inside $methylation_call_params, so every write below lands in the caller's structure
  my $read = \%{ $methylation_call_params->{$sequence_identifier} };

  my $cigar      = $read->{CIGAR};
  my $index      = $read->{index};
  my $chromosome = $read->{chromosome};

  ### Positions in SAM format are 1 based, so we need to subtract 1 when getting substrings
  my $pos = $read->{position}-1;

  # parsing CIGAR string
  my @len = split (/\D+/,$cigar); # the length of each operation
  my @ops = split (/\d+/,$cigar); # the operations themselves
  shift @ops;                     # the CIGAR string starts with a number, so the first element is empty
  die "CIGAR string contained a non-matching number of lengths and operations\n" unless (scalar @len == scalar @ops);

  my $non_bisulfite_sequence = '';

  ### CT converted reads vs. GA converted genome (OB, index 1) or GA converted reads vs. GA converted genome (CTOB, index 3):
  ### the 2 extra bases come from in front of the alignment start
  if ( ($index == 1) or ($index == 3) ){
    ## right at the start of a chromosome: leaving with an empty genomic sequence
    unless ($pos >= 2){
      $read->{unmodified_genomic_sequence} = '';
      return;
    }
    $non_bisulfite_sequence .= substr ($chromosomes{$chromosome},$pos-2,2);
  }

  ### number of inserted plus deleted bases, kept so that the edit distance can be determined for the SAM output
  my $indels = 0;

  for my $i (0..$#len){
    my ($op,$op_length) = ($ops[$i],$len[$i]);

    if ($op eq 'M'){
      # aligned bases: copy the genomic sequence and move along the genome
      $non_bisulfite_sequence .= substr ($chromosomes{$chromosome},$pos,$op_length);
      $pos += $op_length;
    }
    elsif ($op eq 'I'){
      # insertion in the read: there is no genomic counterpart, so we pad with Ns (these are not used to infer
      # methylation calls) and stay at the same genomic position
      $non_bisulfite_sequence .= 'N' x $op_length;
      $indels += $op_length;
    }
    elsif ($op eq 'D'){
      # deletion in the read: skip the genomic bases without adding any sequence
      $pos += $op_length;
      $indels += $op_length;
    }
    else{
      # anything else is fatal; the message only differs depending on whether one of these characters occurs in the CIGAR string
      die "The CIGAR string contained illegal CIGAR operations in addition to 'M', 'I' and 'D': $cigar\n" if ($cigar =~ tr/[NSHPX=]//);
      die "The CIGAR string contained undefined CIGAR operations in addition to 'M', 'I' and 'D': $cigar\n";
    }
  }

  ### CT converted reads vs. CT converted genome (OT, index 0) or GA converted reads vs. CT converted genome (CTOT, index 2):
  ### the 2 extra bases come from behind the end of the alignment ($pos now points just past the last aligned base)
  if ( ($index == 0) or ($index == 2) ){
    ## right at the end of a chromosome: leaving with the sequence collected so far, i.e. without the 2 extra bases
    unless (length($chromosomes{$chromosome}) >= $pos+2){
      $read->{unmodified_genomic_sequence} = $non_bisulfite_sequence;
      return;
    }
    $non_bisulfite_sequence .= substr ($chromosomes{$chromosome},$pos,2);
  }

  ### the alignment_strand information is needed to determine which strand of the genomic sequence we are comparing the read against,
  ### the read_conversion information is needed to know whether we are looking for C->T or G->A substitutions
  my ($counter,$alignment_strand,$read_conversion_info,$genome_conversion,$reverse_complement_needed);

  if ($index == 0){
    ### [Index 0, sequence originated from (converted) forward strand]
    ($counter,$alignment_strand,$read_conversion_info,$genome_conversion,$reverse_complement_needed) = ('CT_CT_count','+','CT','CT',0);
  }
  elsif ($index == 1){
    ### [Index 1, sequence originated from (converted) reverse strand]
    ($counter,$alignment_strand,$read_conversion_info,$genome_conversion,$reverse_complement_needed) = ('CT_GA_count','-','CT','GA',1);
  }
  elsif ($index == 2){
    ### [Index 2, sequence originated from complementary to (converted) forward strand]
    ($counter,$alignment_strand,$read_conversion_info,$genome_conversion,$reverse_complement_needed) = ('GA_CT_count','-','GA','CT',1);
  }
  elsif ($index == 3){
    ### [Index 3, sequence originated from complementary to (converted) reverse strand]
    ($counter,$alignment_strand,$read_conversion_info,$genome_conversion,$reverse_complement_needed) = ('GA_GA_count','+','GA','GA',0);
  }
  else{
    die "Too many Bowtie 2 result filehandles\n";
  }

  $counting{$counter}++;

  ### reverse complement!
  if ($reverse_complement_needed){
    $non_bisulfite_sequence = reverse_complement($non_bisulfite_sequence);
  }

  $read->{alignment_strand}            = $alignment_strand;
  $read->{read_conversion}             = $read_conversion_info;
  $read->{genome_conversion}           = $genome_conversion;
  $read->{unmodified_genomic_sequence} = $non_bisulfite_sequence;

  ### the end position of a read is stored in $pos
  $read->{end_position} = $pos;
  $read->{indels}       = $indels;
}
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
4076
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
4077 ### METHYLATION CALL
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
4078
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
sub methylation_call{
  ### Compares an observed read with the corresponding genomic sequence base by base and returns a
  ### methylation call string that contains one character per base of the read.
  ###
  ### Arguments:
  ###   $identifier                 - read ID (not needed for the call itself, only handy for debugging output)
  ###   $sequence_actually_observed - the read sequence as it was observed
  ###   $genomic_sequence           - the genomic sequence the read is compared to; it is expected to be 2 bp longer
  ###                                 than the read (a warning is printed otherwise) so that the sequence context
  ###                                 of every position can be looked up
  ###   $read_conversion            - 'CT' or 'GA'; any other value is fatal
  ###
  ### Side effect: the per-context totals in the %counting hash are increased by the number of calls made for this read.
  my ($identifier,$sequence_actually_observed,$genomic_sequence,$read_conversion) = @_;

  ### splitting both the actually observed sequence and the genomic sequence up into single bases so we can compare them one by one
  my @seq     = split(//,$sequence_actually_observed);
  my @genomic = split(//,$genomic_sequence);
  # print join ("\n",$identifier,$sequence_actually_observed,$genomic_sequence,$read_conversion),"\n";

  ### Creating a match-string with different characters for non-cytosine bases (disregarding mismatches here), methyl-Cs or non-methyl Cs in either
  ### CpG, CHH or CHG context

  #################################################################
  ### . for bases not involving cytosines                       ###
  ### X for methylated C in CHG context (was protected)         ###
  ### x for not methylated C in CHG context (was converted)     ###
  ### H for methylated C in CHH context (was protected)         ###
  ### h for not methylated C in CHH context (was converted)     ###
  ### Z for methylated C in CpG context (was protected)         ###
  ### z for not methylated C in CpG context (was converted)     ###
  ### U for methylated C in unknown context (was protected)     ###
  ### u for not methylated C in unknown context (was converted) ###
  #################################################################

  ### both sides are element counts, so this has to be a numeric comparison
  warn "length of \@seq: ",scalar @seq,"\tlength of \@genomic: ",scalar @genomic,"\n" unless (scalar @seq == (scalar @genomic - 2)); ## CHH changed to -2

  ### The CT and GA calls are mirror images of each other:
  ###   CT: read base i lines up with genomic base i;   a protected C stays C, a converted C is read as T, and the
  ###       context is determined by the 1st and 2nd downstream bases (i+1, i+2), looking for a G
  ###   GA: read base i lines up with genomic base i+2; a protected C on the opposing strand is read as G, a converted
  ###       one as A, and the context is determined by the 1st and 2nd upstream bases (i+1, i), looking for a C
  my ($reference_offset,$second_neighbour_offset,$protected_base,$converted_base,$context_base);

  if ($read_conversion eq 'CT'){
    ($reference_offset,$second_neighbour_offset,$protected_base,$converted_base,$context_base) = (0,2,'C','T','G');
  }
  elsif ($read_conversion eq 'GA'){
    ($reference_offset,$second_neighbour_offset,$protected_base,$converted_base,$context_base) = (2,0,'G','A','C');
  }
  else{
    die "Strand conversion info is required to perform a methylation call\n";
  }

  ### number of calls made for this read, keyed by the call character
  my %calls = map { $_ => 0 } qw(Z z X x H h U u);
  my @match = ();

  for my $index (0..$#seq) {
    my $observed_base  = $seq[$index];
    my $reference_base = $genomic[$index+$reference_offset];

    ### a call is only made if the genomic position is a cytosine (a G for GA reads) and the read shows either the
    ### protected or the converted base; everything else (including all other mismatches) is of no interest
    my $methylated;
    if ($reference_base eq $protected_base){
      if ($observed_base eq $protected_base){
	$methylated = 1;   # not converted, i.e. protected by methylation
      }
      elsif ($observed_base eq $converted_base){
	$methylated = 0;   # converted, i.e. not methylated
      }
    }

    unless (defined $methylated){
      push @match,'.';
      next;
    }

    ### determining the sequence context from the two neighbouring genomic bases
    my $first_neighbour  = $genomic[$index+1];
    my $second_neighbour = $genomic[$index+$second_neighbour_offset];

    my $call;
    if ($first_neighbour eq $context_base){
      $call = 'Z';   # CpG context
    }
    elsif ($first_neighbour eq 'N'){
      $call = 'U';   # the neighbouring base was an N, so we cannot be sure about the context (it might have been a CG)
    }
    elsif ($second_neighbour eq $context_base){
      $call = 'X';   # CHG context
    }
    elsif ($second_neighbour eq 'N'){
      $call = 'U';   # the second neighbouring base was an N, so we cannot be sure about the context (it might have been CHH or CHG)
    }
    else{
      $call = 'H';   # CHH context
    }

    ### upper case = methylated (protected), lower case = not methylated (converted)
    $call = lc $call unless ($methylated);

    ++$calls{$call};
    push @match,$call;
  }

  my $methylation_call = join ("",@match);

  $counting{total_meCHH_count}                  += $calls{H};
  $counting{total_meCHG_count}                  += $calls{X};
  $counting{total_meCpG_count}                  += $calls{Z};
  $counting{total_meC_unknown_count}            += $calls{U};
  $counting{total_unmethylated_CHH_count}       += $calls{h};
  $counting{total_unmethylated_CHG_count}       += $calls{x};
  $counting{total_unmethylated_CpG_count}       += $calls{z};
  $counting{total_unmethylated_C_unknown_count} += $calls{u};

  # print "\n$sequence_actually_observed\n$genomic_sequence\n",@match,"\n$read_conversion\n\n";
  return $methylation_call;
}
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
4297
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
sub read_genome_into_memory{
  ### Reads all FastA files (*.fa, or *.fasta if there are no *.fa files) found in $genome_folder and stores the
  ### upper-cased sequence of every entry in the %chromosomes hash, using the chromosome name extracted from the
  ### FastA header as key. Multi-FastA files are supported.
  ###
  ### Argument: the working directory we change back to once the genome has been read in.
  ###
  ### Dies if the genome folder can't be entered, doesn't contain any FastA files, if a file can't be opened, if a
  ### file doesn't start with a FastA header or if a chromosome name occurs more than once.

  ## working directory
  my $cwd = shift;
  ## reading in and storing the specified genome in the %chromosomes hash
  chdir ($genome_folder) or die "Can't move to $genome_folder: $!";
  print "Now reading in and storing sequence information of the genome specified in: $genome_folder\n\n";

  my @chromosome_filenames = <*.fa>;

  ### if there aren't any genomic files with the extension .fa we will look for files with the extension .fasta
  unless (@chromosome_filenames){
    @chromosome_filenames = <*.fasta>;
  }

  unless (@chromosome_filenames){
    die "The specified genome folder $genome_folder does not contain any sequence files in FastA format (with .fa or .fasta file extensions)\n";
  }

  foreach my $chromosome_filename (@chromosome_filenames){

    ### 3-argument open and a lexical filehandle: the file name is always taken literally and can never be
    ### interpreted as an open mode or a pipe
    open (my $chr_in,'<',$chromosome_filename) or die "Failed to read from sequence file $chromosome_filename $!\n";

    ### first line needs to be a fastA header
    my $first_line = <$chr_in>;
    ### an empty file doesn't have a first line; treating it as an empty header lets extract_chromosome_name report it
    $first_line = '' unless (defined $first_line);
    chomp $first_line;
    $first_line =~ s/\r//;

    ### Extracting chromosome name from the FastA header
    my $chromosome_name = extract_chromosome_name($first_line);

    ### starting out with an empty string so that entries without any sequence report a length of 0 bp
    my $sequence = '';
    while (my $line = <$chr_in>){
      chomp $line;
      $line =~ s/\r//;
      if ($line =~ /^>/){
	### storing the previous chromosome in the %chromosomes hash, only relevant for Multi-Fasta-Files (MFA)
	if (exists $chromosomes{$chromosome_name}){
	  print "chr $chromosome_name (",length($sequence)," bp)\n";
	  die "Exiting because chromosome name already exists. Please make sure all chromosomes have a unique name!\n";
	}
	else {
	  if (length($sequence) == 0){
	    warn "Chromosome $chromosome_name in the multi-fasta file $chromosome_filename did not contain any sequence information!\n";
	  }
	  print "chr $chromosome_name (",length($sequence)," bp)\n";
	  $chromosomes{$chromosome_name} = $sequence;
	}
	### resetting the sequence variable
	$sequence = '';
	### setting new chromosome name
	$chromosome_name = extract_chromosome_name($line);
      }
      else{
	$sequence .= uc $line;
      }
    }
    close ($chr_in) or warn "Failed to close filehandle for $chromosome_filename: $!\n";

    ### storing the last (or only) chromosome of the file
    if (exists $chromosomes{$chromosome_name}){
      print "chr $chromosome_name (",length($sequence)," bp)\t";
      die "Exiting because chromosome name already exists. Please make sure all chromosomes have a unique name.\n";
    }
    else{
      if (length($sequence) == 0){
	warn "Chromosome $chromosome_name in the file $chromosome_filename did not contain any sequence information!\n";
      }
      print "chr $chromosome_name (",length($sequence)," bp)\n";
      $chromosomes{$chromosome_name} = $sequence;
    }
  }
  print "\n";
  chdir $cwd or die "Failed to move to directory $cwd\n";
}
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
4369
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
sub extract_chromosome_name {
  ### Returns the chromosome name of a FastA header line, i.e. everything between the leading '>' and the
  ### first stretch of whitespace. Dies if the line doesn't start with a '>'.
  ## Bowtie seems to extract the first string after the initial > in the FASTA file, so we are doing this as well
  my ($header_line) = @_;

  ### stripping the leading '>'; if there is none this is not a FastA header
  unless ($header_line =~ s/^>//){
    die "The specified chromosome ($header_line) file doesn't seem to be in FASTA format as required!\n";
  }

  ### only the first whitespace-separated field is the name
  my @header_fields = split (/\s+/,$header_line);
  return $header_fields[0];
}
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
4381
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
sub reverse_complement{
  ### Returns the reverse complement of a DNA sequence. Only upper case A, C, G and T are complemented,
  ### any other character (e.g. N) is kept as it is.
  my ($strand) = @_;

  ### reversing first and complementing afterwards, the order of the two steps doesn't matter
  my $reverse_complemented = scalar reverse($strand);
  $reverse_complemented =~ tr/ACGT/TGCA/;

  return $reverse_complemented;
}
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
4388
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
### Writes bisulfite-converted copies of a single-end FastA read file into $temp_dir.
###
### Argument: path of the input FastA file; a name ending in .gz is decompressed
###           on the fly via zcat. The file is read as pairs of lines
###           (header line, sequence line).
### Globals read:    $skip, $upto, $gzip, $prefix, $temp_dir, $directional
### Globals written: $seqID_contains_tabs (incremented for every processed header
###                  that contains a tab)
### Returns: ($C_to_T_infile,$G_to_A_infile), i.e. the names of the converted files
###          without $temp_dir. In directional mode only the C -> T file is written,
###          the G -> A name is returned nevertheless.
### Dies if a file cannot be opened or closed, or if a processed header does not
### start with '>'.
sub biTransformFastAFiles {
  my $file = shift;

  ## name of the input file without any leading directory
  my $filename = $file;
  if ($file =~ m/.*\/(.*)$/){
    $filename = $1;
  }

  ### The input is opened with an explicit mode (and, for gzipped files, with the
  ### list form of the pipe open) so that the file name is never interpreted as an
  ### open mode or by a shell; this keeps names with spaces or shell metacharacters working
  my $in;
  if ($file =~ /\.gz$/){
    open ($in,'-|','zcat',$file) or die "Couldn't read from file $file: $!\n";
  }
  else{
    open ($in,'<',$file) or die "Couldn't read from file $file: $!\n";
  }

  if ($skip){
    warn "Skipping the first $skip reads from $file\n";
    sleep (1);
  }
  if ($upto){
    warn "Processing reads up to sequence no. $upto from $file\n";
    sleep (1);
  }

  ## names of the converted files, compressed output gets an additional .gz
  my $extension = $gzip ? '.fa.gz' : '.fa';
  my $C_to_T_infile = "${filename}_C_to_T$extension";
  my $G_to_A_infile = "${filename}_G_to_A$extension";

  if ($prefix){
    $C_to_T_infile = "$prefix.$C_to_T_infile";
    $G_to_A_infile = "$prefix.$G_to_A_infile";
  }

  warn "Writing a C -> T converted version of the input file $filename to $temp_dir$C_to_T_infile\n";

  ## NOTE: the gzip output pipes still go through the shell because of the redirection
  my $ctot;
  if ($gzip){
    open ($ctot,"| gzip -c - > ${temp_dir}${C_to_T_infile}") or die "Can't write to file: $!\n";
  }
  else{
    open ($ctot,'>',"$temp_dir$C_to_T_infile") or die "Couldn't write to file $!\n";
  }

  ## the G -> A converted file is only needed for non-directional libraries
  my $gtoa;
  unless ($directional){
    warn "Writing a G -> A converted version of the input file $filename to $temp_dir$G_to_A_infile\n";
    if ($gzip){
      open ($gtoa,"| gzip -c - > ${temp_dir}${G_to_A_infile}") or die "Can't write to file: $!\n";
    }
    else{
      open ($gtoa,'>',"$temp_dir$G_to_A_infile") or die "Couldn't write to file $!\n";
    }
  }

  my $count = 0;

  while (1){
    my $header   = <$in>;
    my $sequence = <$in>;
    last unless ($header and $sequence);

    $header = fix_IDs($header); # this is to avoid problems with truncated read ID when they contain white spaces

    ++$count;

    if ($skip){
      next unless ($count > $skip);
    }
    if ($upto){
      last if ($count > $upto);
    }

    $sequence = uc$sequence; # make input file case insensitive

    # detecting if the input file contains tab stops, as this is likely to result in no alignments
    if (index($header,"\t") != -1){
      $seqID_contains_tabs++;
    }

    ### small check if the sequence seems to be in FastA format
    die "Input file doesn't seem to be in FastA format at sequence $count: $!\n" unless ($header =~ /^>.*/);

    my $sequence_C_to_T = $sequence;
    $sequence_C_to_T =~ tr/C/T/;
    print {$ctot} "$header$sequence_C_to_T";

    unless ($directional){
      my $sequence_G_to_A = $sequence;
      $sequence_G_to_A =~ tr/G/A/;
      print {$gtoa} "$header$sequence_G_to_A";
    }
  }

  ## The return value of this close is deliberately ignored: a zcat pipe that was
  ## abandoned early (e.g. because of $upto) may report a non-zero exit status
  close $in;

  close $ctot or die "Failed to close filehandle $!\n";

  if ($directional){
    warn "\nCreated C -> T converted versions of the FastA file $filename ($count sequences in total)\n\n";
  }
  else{
    close $gtoa or die "Failed to close filehandle $!\n";
    warn "\nCreated C -> T as well as G -> A converted versions of the FastA file $filename ($count sequences in total)\n\n";
  }
  return ($C_to_T_infile,$G_to_A_infile);
}
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
4502
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
### Writes bisulfite-converted copies of one file of a paired-end FastA data set
### into $temp_dir (always uncompressed).
###
### Arguments: $file        - path of the input FastA file; a name ending in .gz is
###                           decompressed on the fly via zcat. The file is read as
###                           pairs of lines (header line, sequence line).
###            $read_number - 1 or 2, depending on which read of the pair $file holds
### Globals read:    $skip, $upto, $gzip, $prefix, $temp_dir, $directional, $bowtie2
### Globals written: $seqID_contains_tabs (incremented for every processed header
###                  that contains a tab)
###
### Every header gets /1 or /2 appended (/1/1 or /2/2 if $bowtie2 is set).
### In directional mode read 1 is written C -> T converted only and read 2
### G -> A converted only; otherwise both conversions are written.
###
### Returns: directional mode: ($C_to_T_infile) for read 1, ($G_to_A_infile) for read 2
###          otherwise:        ($C_to_T_infile,$G_to_A_infile)
###          (file names without $temp_dir)
### Dies if $read_number is neither 1 nor 2, if a file cannot be opened or closed,
### or if a processed header does not start with '>'.
sub biTransformFastAFiles_paired_end {
  my ($file,$read_number) = @_;

  ## rejecting an invalid read number up front, before any file is created
  unless (defined $read_number and ($read_number == 1 or $read_number == 2)){
    my $given = defined $read_number ? $read_number : '';
    die "Read number needs to be 1 or 2, but was: $given\n\n";
  }
  my $is_read_1 = ($read_number == 1);

  if ($gzip){
    warn "GZIP compression of temporary files is not supported for paired-end FastA data. Continuing to write uncompressed files\n";
    sleep (2);
  }

  ## name of the input file without any leading directory
  my $filename = $file;
  if ($file =~ m/.*\/(.*)$/){
    $filename = $1;
  }

  ### The input is opened with an explicit mode (and, for gzipped files, with the
  ### list form of the pipe open) so that the file name is never interpreted as an
  ### open mode or by a shell; this keeps names with spaces or shell metacharacters working
  my $in;
  if ($file =~ /\.gz$/){
    open ($in,'-|','zcat',$file) or die "Couldn't read from file $file: $!\n";
  }
  else{
    open ($in,'<',$file) or die "Couldn't read from file $file: $!\n";
  }

  if ($skip){
    warn "Skipping the first $skip reads from $file\n";
    sleep (1);
  }
  if ($upto){
    warn "Processing reads up to sequence no. $upto from $file\n";
    sleep (1);
  }

  my $C_to_T_infile = "${filename}_C_to_T.fa";
  my $G_to_A_infile = "${filename}_G_to_A.fa";

  if ($prefix){
    $C_to_T_infile = "$prefix.$C_to_T_infile";
    $G_to_A_infile = "$prefix.$G_to_A_infile";
  }

  ## directional: read 1 -> C to T only, read 2 -> G to A only; otherwise all four strand output
  my $write_C_to_T = (!$directional ||  $is_read_1);
  my $write_G_to_A = (!$directional || !$is_read_1);

  if ($write_C_to_T){
    warn "Writing a C -> T converted version of the input file $filename to $temp_dir$C_to_T_infile\n";
  }
  if ($write_G_to_A){
    warn "Writing a G -> A converted version of the input file $filename to $temp_dir$G_to_A_infile\n";
  }

  my ($ctot,$gtoa);
  if ($write_C_to_T){
    open ($ctot,'>',"$temp_dir$C_to_T_infile") or die "Couldn't write to file $!\n";
  }
  if ($write_G_to_A){
    open ($gtoa,'>',"$temp_dir$G_to_A_infile") or die "Couldn't write to file $!\n";
  }

  ## tag appended to every read ID to mark which read of the pair it is
  my $read_tag = $is_read_1 ? '/1' : '/2';
  $read_tag .= $read_tag if ($bowtie2); # /1/1 or /2/2

  my $count = 0;

  while (1){
    my $header   = <$in>;
    my $sequence = <$in>;
    last unless ($header and $sequence);

    $header = fix_IDs($header); # this is to avoid problems with truncated read ID when they contain white spaces

    ++$count;

    if ($skip){
      next unless ($count > $skip);
    }
    if ($upto){
      last if ($count > $upto);
    }

    $sequence = uc$sequence; # make input file case insensitive

    # detecting if the input file contains tab stops, as this is likely to result in no alignments
    if (index($header,"\t") != -1){
      $seqID_contains_tabs++;
    }

    ## small check if the sequence seems to be in FastA format
    die "Input file doesn't seem to be in FastA format at sequence $count: $!\n" unless ($header =~ /^>/);

    ## $ matches in front of a trailing newline, so the tag ends up at the end of the ID
    $header =~ s/$/$read_tag/;

    if ($write_C_to_T){
      my $sequence_C_to_T = $sequence;
      $sequence_C_to_T =~ tr/C/T/;
      print {$ctot} "$header$sequence_C_to_T";
    }
    if ($write_G_to_A){
      my $sequence_G_to_A = $sequence;
      $sequence_G_to_A =~ tr/G/A/;
      print {$gtoa} "$header$sequence_G_to_A";
    }
  }

  ## The return value of this close is deliberately ignored: a zcat pipe that was
  ## abandoned early (e.g. because of $upto) may report a non-zero exit status
  close $in;

  ### The converted files have to be closed here: otherwise buffered records might
  ### not have been written to disk yet when the files are read after this
  ### subroutine returns, and write errors would go unnoticed
  if ($write_C_to_T){
    close $ctot or die "Failed to close filehandle $!\n";
  }
  if ($write_G_to_A){
    close $gtoa or die "Failed to close filehandle $!\n";
  }

  unless ($directional){
    warn "\nCreated C -> T as well as G -> A converted versions of the FastA file $filename ($count sequences in total)\n\n";
    return ($C_to_T_infile,$G_to_A_infile);
  }

  if ($is_read_1){
    warn "\nCreated C -> T converted version of the FastA file $filename ($count sequences in total)\n\n";
    return ($C_to_T_infile);
  }

  warn "\nCreated G -> A converted version of the FastA file $filename ($count sequences in total)\n\n";
  return ($G_to_A_infile);
}
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
4659
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
4660
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
4661 sub biTransformFastQFiles {
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
4662 my $file = shift;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
4663 my ($dir,$filename);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
4664 if ($file =~ /\//){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
4665 ($dir,$filename) = $file =~ m/(.*\/)(.*)$/;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
4666 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
4667 else{
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
4668 $filename = $file;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
4669 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
4670
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
4671 ### gzipped version of the infile
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
4672 if ($file =~ /\.gz$/){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
4673 open (IN,"zcat $file |") or die "Couldn't read from file $file: $!\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
4674 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
4675 else{
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
4676 open (IN,$file) or die "Couldn't read from file $file: $!\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
4677 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
4678
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
4679 if ($skip){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
4680 warn "Skipping the first $skip reads from $file\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
4681 sleep (1);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
4682 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
4683 if ($upto){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
4684 warn "Processing reads up to sequence no. $upto from $file\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
4685 sleep (1);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
4686 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
4687
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
4688 my $C_to_T_infile = my $G_to_A_infile = $filename;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
4689
3
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
4690 if ($prefix){
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
4691 # warn "Prefixing $prefix:\nold: $C_to_T_infile\nold: $G_to_A_infile\n\n";
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
4692 $C_to_T_infile = "$prefix.$C_to_T_infile";
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
4693 $G_to_A_infile = "$prefix.$G_to_A_infile";
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
4694 # warn "Prefixing $prefix:\nnew: $C_to_T_infile\nnew: $G_to_A_infile\n\n";
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
4695 }
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
4696
0
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
4697 if ($pbat){ # PBAT-Seq
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
4698 if ($gzip){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
4699 $G_to_A_infile =~ s/$/_G_to_A.fastq.gz/;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
4700 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
4701 else{
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
4702 $G_to_A_infile =~ s/$/_G_to_A.fastq/;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
4703 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
4704
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
4705 warn "Writing a G -> A converted version of the input file $filename to $temp_dir$G_to_A_infile\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
4706
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
4707 if ($gzip){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
4708 open (GTOA,"| gzip -c - > ${temp_dir}${G_to_A_infile}") or die "Can't write to file: $!\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
4709 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
4710 else{
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
4711 open (GTOA,'>',"$temp_dir$G_to_A_infile") or die "Couldn't write to file $!\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
4712 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
4713 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
4714 else{ # directional or non-directional
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
4715 if ($gzip){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
4716 $C_to_T_infile =~ s/$/_C_to_T.fastq.gz/;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
4717 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
4718 else{
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
4719 $C_to_T_infile =~ s/$/_C_to_T.fastq/;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
4720 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
4721
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
4722 warn "Writing a C -> T converted version of the input file $filename to $temp_dir$C_to_T_infile\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
4723
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
4724 if ($gzip){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
4725 open (CTOT,"| gzip -c - > ${temp_dir}${C_to_T_infile}") or die "Can't write to file: $!\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
4726 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
4727 else{
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
4728 open (CTOT,'>',"$temp_dir$C_to_T_infile") or die "Couldn't write to file $!\n"; # uncompressed option
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
4729 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
4730
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
4731 unless ($directional){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
4732 if ($gzip){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
4733 $G_to_A_infile =~ s/$/_G_to_A.fastq.gz/;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
4734 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
4735 else{
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
4736 $G_to_A_infile =~ s/$/_G_to_A.fastq/;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
4737 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
4738
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
4739 warn "Writing a G -> A converted version of the input file $filename to $temp_dir$G_to_A_infile\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
4740
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
4741 if ($gzip){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
4742 open (GTOA,"| gzip -c - > ${temp_dir}${G_to_A_infile}") or die "Can't write to file: $!\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
4743 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
4744 else{
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
4745 open (GTOA,'>',"$temp_dir$G_to_A_infile") or die "Couldn't write to file $!\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
4746 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
4747 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
4748 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
4749
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
4750 my $count = 0;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
4751 while (1){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
4752 my $identifier = <IN>;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
4753 my $sequence = <IN>;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
4754 my $identifier2 = <IN>;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
4755 my $quality_score = <IN>;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
4756 last unless ($identifier and $sequence and $identifier2 and $quality_score);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
4757
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
4758 $identifier = fix_IDs($identifier); # this is to avoid problems with truncated read ID when they contain white spaces
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
4759
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
4760 ++$count;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
4761
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
4762 if ($skip){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
4763 next unless ($count > $skip);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
4764 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
4765 if ($upto){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
4766 last if ($count > $upto);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
4767 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
4768
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
4769 $sequence = uc$sequence; # make input file case insensitive
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
4770
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
4771 # detecting if the input file contains tab stops, as this is likely to result in no alignments
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
4772 if (index($identifier,"\t") != -1){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
4773 $seqID_contains_tabs++;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
4774 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
4775
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
4776 ## small check if the sequence file appears to be a FastQ file
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
4777 if ($count == 1){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
4778 if ($identifier !~ /^\@/ or $identifier2 !~ /^\+/){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
4779 die "Input file doesn't seem to be in FastQ format at sequence $count: $!\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
4780 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
4781 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
4782
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
4783 if ($pbat){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
4784 my $sequence_G_to_A = $sequence;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
4785 $sequence_G_to_A =~ tr/G/A/;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
4786 print GTOA join ('',$identifier,$sequence_G_to_A,$identifier2,$quality_score);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
4787 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
4788 else{ # directional or non-directional
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
4789 my $sequence_C_to_T = $sequence;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
4790 $sequence_C_to_T =~ tr/C/T/;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
4791 print CTOT join ('',$identifier,$sequence_C_to_T,$identifier2,$quality_score);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
4792
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
4793 unless ($directional){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
4794 my $sequence_G_to_A = $sequence;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
4795 $sequence_G_to_A =~ tr/G/A/;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
4796 print GTOA join ('',$identifier,$sequence_G_to_A,$identifier2,$quality_score);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
4797 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
4798 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
4799 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
4800
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
4801 if ($directional){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
4802 close CTOT or die "Failed to close filehandle $!\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
4803 warn "\nCreated C -> T converted version of the FastQ file $filename ($count sequences in total)\n\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
4804 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
4805 elsif($pbat){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
4806 warn "\nCreated G -> A converted version of the FastQ file $filename ($count sequences in total)\n\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
4807 close GTOA or die "Failed to close filehandle $!\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
4808 return ($G_to_A_infile);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
4809 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
4810 else{
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
4811 close CTOT or die "Failed to close filehandle $!\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
4812 close GTOA or die "Failed to close filehandle $!\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
4813 warn "\nCreated C -> T as well as G -> A converted versions of the FastQ file $filename ($count sequences in total)\n\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
4814 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
4815
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
4816 return ($C_to_T_infile,$G_to_A_infile);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
4817 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
4818
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
### Writes bisulfite-converted copies of ONE mate file of a paired-end FastQ library into $temp_dir.
###
### Arguments:
###   $file        - path of the FastQ input; a name ending in .gz is decompressed through zcat
###   $read_number - 1 or 2; selects the tag appended to every read ID
###                  ("/1" or "/2", doubled to "/1/1" or "/2/2" when $bowtie2 is set)
###
### Uses the file-scoped settings $skip, $upto, $gzip, $prefix, $directional, $bowtie2 and $temp_dir
### as well as the helper fix_IDs(), all of which are defined elsewhere in this script.
###
### Returns the names of the files written (without the $temp_dir part):
###   directional, read 1 : ($C_to_T_infile)
###   directional, read 2 : ($G_to_A_infile)
###   non-directional     : ($C_to_T_infile,$G_to_A_infile)
###
### Dies if $read_number is not 1 or 2, if a file cannot be opened or closed, or if the first
### record that gets processed does not look like FastQ.
sub biTransformFastQFiles_paired_end {
  my ($file,$read_number) = @_;

  ### Validate the read number before anything is opened, so that a bad call fails the same way
  ### in directional and non-directional mode and never leaves half-written output files behind
  unless (defined $read_number and ($read_number == 1 or $read_number == 2)){
    my $shown = defined $read_number ? $read_number : 'undef';
    die "Read number needs to be 1 or 2, but was $shown!\n\n";
  }
  my $mate = ($read_number == 1) ? 1 : 2;

  ### the output files are named after the input file without its directory part
  my $filename = $file;
  if ($file =~ m{.*/(.*)$}){
    $filename = $1;
  }

  ### Input. The list form of the pipe open and the 3-argument open both hand the file name over
  ### verbatim, so names containing blanks or shell metacharacters are read correctly
  my $in;
  if ($file =~ /\.gz$/){
    open ($in,'-|','zcat',$file) or die "Couldn't read from file $file: $!\n";
  }
  else{
    open ($in,'<',$file) or die "Couldn't read from file $file: $!\n";
  }

  if ($skip){
    warn "Skipping the first $skip reads from $file\n";
    sleep (1);
  }
  if ($upto){
    warn "Processing reads up to sequence no. $upto from $file\n";
    sleep (1);
  }

  my $extension = $gzip ? '.fastq.gz' : '.fastq';
  my $C_to_T_infile = $filename.'_C_to_T'.$extension;
  my $G_to_A_infile = $filename.'_G_to_A'.$extension;

  if ($prefix){
    $C_to_T_infile = "$prefix.$C_to_T_infile";
    $G_to_A_infile = "$prefix.$G_to_A_infile";
  }

  ### directional libraries only need C->T for read 1 and G->A for read 2; all others need both
  my $write_C_to_T = (!$directional or $mate == 1) ? 1 : 0;
  my $write_G_to_A = (!$directional or $mate == 2) ? 1 : 0;

  if ($write_C_to_T){
    warn "Writing a C -> T converted version of the input file $filename to $temp_dir$C_to_T_infile\n";
  }
  if ($write_G_to_A){
    warn "Writing a G -> A converted version of the input file $filename to $temp_dir$G_to_A_infile\n";
  }

  ### NOTE: the gzip writers still go through the shell because of the output redirection, so
  ### $temp_dir and the output name must not contain shell metacharacters
  my ($ctot,$gtoa);
  if ($write_C_to_T){
    if ($gzip){
      open ($ctot,"| gzip -c - > ${temp_dir}${C_to_T_infile}") or die "Can't write to file: $!\n";
    }
    else{
      open ($ctot,'>',"$temp_dir$C_to_T_infile") or die "Couldn't write to file $!\n";
    }
  }
  if ($write_G_to_A){
    if ($gzip){
      open ($gtoa,"| gzip -c - > ${temp_dir}${G_to_A_infile}") or die "Can't write to file: $!\n";
    }
    else{
      open ($gtoa,'>',"$temp_dir$G_to_A_infile") or die "Couldn't write to file $!\n";
    }
  }

  ### the read ID tag is the same for every record of this file
  my $id_tag = $bowtie2 ? "/$mate/$mate" : "/$mate";

  ### $count is incremented for every complete record read, i.e. it includes skipped records and
  ### the record that trips the $upto limit
  my $count = 0;
  while (1){
    my $identifier    = <$in>;
    my $sequence      = <$in>;
    my $identifier2   = <$in>;
    my $quality_score = <$in>;

    ### stop at the end of the file or at an incomplete final record; testing for definedness
    ### (rather than truth) keeps a last line of '0' without a newline from ending the loop early
    last unless (defined $identifier and defined $sequence and defined $identifier2 and defined $quality_score);
    ++$count;

    $identifier = fix_IDs($identifier); # this is to avoid problems with truncated read ID when they contain white spaces

    if ($skip){
      next unless ($count > $skip);
    }
    if ($upto){
      last if ($count > $upto);
    }

    $sequence = uc $sequence; # make input file case insensitive

    ## small check if the sequence file appears to be a FastQ file
    if ($count == 1){
      if ($identifier !~ /^\@/ or $identifier2 !~ /^\+/){
	die "Input file doesn't seem to be in FastQ format at sequence $count: $!\n";
      }
    }

    ### '$' matches in front of the trailing newline, so the tag goes right behind the read ID
    $identifier =~ s/$/$id_tag/;

    ### only the conversions that are actually written get computed
    if ($write_C_to_T){
      (my $sequence_C_to_T = $sequence) =~ tr/C/T/;
      print {$ctot} join ('',$identifier,$sequence_C_to_T,$identifier2,$quality_score);
    }
    if ($write_G_to_A){
      (my $sequence_G_to_A = $sequence) =~ tr/G/A/;
      print {$gtoa} join ('',$identifier,$sequence_G_to_A,$identifier2,$quality_score);
    }
  }

  ### The result of this close is ignored on purpose: when $upto ended the loop early, zcat is
  ### terminated by a broken pipe and reports a non-zero exit status that is not an error here
  close $in;

  ### buffered write errors only show up on close, so close before announcing success
  if ($write_C_to_T){
    close $ctot or die "Failed to close filehandle $!\n";
  }
  if ($write_G_to_A){
    close $gtoa or die "Failed to close filehandle $!\n";
  }

  if ($directional){
    if ($mate == 1){
      warn "\nCreated C -> T converted version of the FastQ file $filename ($count sequences in total)\n\n";
      return ($C_to_T_infile);
    }
    warn "\nCreated G -> A converted version of the FastQ file $filename ($count sequences in total)\n\n";
    return ($G_to_A_infile);
  }

  warn "\nCreated C -> T as well as G -> A converted versions of the FastQ file $filename ($count sequences in total)\n\n";
  return ($C_to_T_infile,$G_to_A_infile);
}
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
4993
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
4994
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
4995 ### SPECIAL BOWTIE 1 PAIRED-END FORMAT FOR GZIPPED OUTPUT FILES
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
4996
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
### Writes the gzipped, tab-delimited paired-end input that Bowtie 1 reads, built from two FastQ
### mate files.
###
### Arguments:
###   $file_1 - FastQ file with the first mates  (a name ending in .gz is decompressed through zcat)
###   $file_2 - FastQ file with the second mates (same convention)
###
### Uses the file-scoped settings $skip, $upto, $prefix, $directional and $temp_dir as well as the
### helper fix_IDs(), all of which are defined elsewhere in this script.
###
### Output, one line per read pair, written to $temp_dir:
###   <seq-ID>/1 <TAB> <sequence mate 1> <TAB> <quality mate 1> <TAB> <sequence mate 2> <TAB> <quality mate 2>
###   *.CT_plus_GA.fastq.gz : mate 1 C->T converted, mate 2 G->A converted (always written)
###   *.GA_plus_CT.fastq.gz : mate 1 G->A converted, mate 2 C->T converted (non-directional only)
###
### Returns ($CT_plus_GA_infile) for directional libraries and
### ($CT_plus_GA_infile,$GA_plus_CT_infile) otherwise; the names do not include $temp_dir.
###
### Dies if a file cannot be opened or closed, or if the first record of either input does not
### look like FastQ.
sub biTransformFastQFiles_paired_end_bowtie1_gzip {
  my ($file_1,$file_2) = @_;

  ### the output files are named after input file 1 without its directory part
  my $filename = $file_1;
  if ($file_1 =~ m{.*/(.*)$}){
    $filename = $1;
  }

  ### Inputs. The list form of the pipe open and the 3-argument open both hand the file name over
  ### verbatim, so names containing blanks or shell metacharacters are read correctly
  my @inputs;
  foreach my $path ($file_1,$file_2){
    my $fh;
    if ($path =~ /\.gz$/){
      open ($fh,'-|','zcat',$path) or die "Couldn't read from file $path: $!\n";
    }
    else{
      open ($fh,'<',$path) or die "Couldn't read from file $path: $!\n";
    }
    push @inputs,$fh;
  }
  my ($in_1,$in_2) = @inputs;

  if ($skip){
    warn "Skipping the first $skip reads from $file_1 and $file_2\n";
    sleep (1);
  }
  if ($upto){
    warn "Processing reads up to sequence no. $upto from $file_1 and $file_2\n";
    sleep (1);
  }

  my $CT_plus_GA_infile = my $GA_plus_CT_infile = $filename;

  if ($prefix){
    $CT_plus_GA_infile = "$prefix.$CT_plus_GA_infile";
    $GA_plus_CT_infile = "$prefix.$GA_plus_CT_infile";
  }

  $CT_plus_GA_infile .= '.CT_plus_GA.fastq.gz';
  $GA_plus_CT_infile .= '.GA_plus_CT.fastq.gz';

  ### NOTE: the gzip writers go through the shell because of the output redirection, so
  ### $temp_dir and the output names must not contain shell metacharacters
  my ($ct_plus_ga,$ga_plus_ct);

  warn "Writing a C -> T converted version of $file_1 and a G -> A converted version of $file_2 to $temp_dir$CT_plus_GA_infile\n";
  open ($ct_plus_ga,"| gzip -c - > ${temp_dir}${CT_plus_GA_infile}") or die "Can't write to file: $!\n";

  unless ($directional){
    ### reported on STDERR like every other progress message of this routine
    warn "Writing a G -> A converted version of $file_1 and a C -> T converted version of $file_2 to $temp_dir$GA_plus_CT_infile\n";
    open ($ga_plus_ct,"| gzip -c - > ${temp_dir}${GA_plus_CT_infile}") or die "Can't write to file: $!\n";
  }

  ### for Bowtie 1 we need to write a single gzipped file with 1 line per pair of sequences in the following format:
  ### <seq-ID> <sequence #1 mate> <quality #1 mate> <sequence #2 mate> <quality #2 mate>

  ### $count is incremented for every complete pair read, i.e. it includes skipped pairs and the
  ### pair that trips the $upto limit
  my $count = 0;
  while (1){
    my $identifier_1    = <$in_1>;
    my $sequence_1      = <$in_1>;
    my $identifier2_1   = <$in_1>;
    my $quality_score_1 = <$in_1>;

    my $identifier_2    = <$in_2>;
    my $sequence_2      = <$in_2>;
    my $identifier2_2   = <$in_2>;
    my $quality_score_2 = <$in_2>;

    ### stop as soon as either file runs out of complete records; testing for definedness (rather
    ### than truth) keeps a last line of '0' without a newline from ending the loop early
    last unless (defined $identifier_1 and defined $sequence_1 and defined $identifier2_1 and defined $quality_score_1
		 and defined $identifier_2 and defined $sequence_2 and defined $identifier2_2 and defined $quality_score_2);

    ++$count;

    ## small check if the sequence file appears to be a FastQ file
    if ($count == 1){
      if ($identifier_1 !~ /^\@/ or $identifier2_1 !~ /^\+/){
	die "Input file 1 doesn't seem to be in FastQ format at sequence $count: $!\n";
      }
      if ($identifier_2 !~ /^\@/ or $identifier2_2 !~ /^\+/){
	die "Input file 2 doesn't seem to be in FastQ format at sequence $count: $!\n";
      }
    }

    $identifier_1 = fix_IDs($identifier_1); # this is to avoid problems with truncated read ID when they contain white spaces

    ### every field ends up on one tab-delimited line, so the line endings have to go
    chomp $identifier_1;
    chomp $sequence_1;
    chomp $sequence_2;
    chomp $quality_score_1;
    chomp $quality_score_2;

    ### only the ID of mate 1 is written out; it loses its leading '@' ...
    $identifier_1 =~ s/^\@//;
    ### ... and gets an extra /1 at the end, which is being removed by Bowtie otherwise (which leads to no sequence alignments whatsoever)
    $identifier_1 =~ s/$/\/1/;

    if ($skip){
      next unless ($count > $skip);
    }
    if ($upto){
      last if ($count > $upto);
    }

    $sequence_1 = uc $sequence_1; # make input file 1 case insensitive
    $sequence_2 = uc $sequence_2; # make input file 2 case insensitive

    (my $sequence_1_C_to_T = $sequence_1) =~ tr/C/T/;
    (my $sequence_2_G_to_A = $sequence_2) =~ tr/G/A/;
    print {$ct_plus_ga} "$identifier_1\t$sequence_1_C_to_T\t$quality_score_1\t$sequence_2_G_to_A\t$quality_score_2\n";

    unless ($directional){
      (my $sequence_1_G_to_A = $sequence_1) =~ tr/G/A/;
      (my $sequence_2_C_to_T = $sequence_2) =~ tr/C/T/;
      print {$ga_plus_ct} "$identifier_1\t$sequence_1_G_to_A\t$quality_score_1\t$sequence_2_C_to_T\t$quality_score_2\n";
    }
  }

  ### The results of these closes are ignored on purpose: when the loop ended before the end of a
  ### gzipped input, zcat is terminated by a broken pipe and reports a non-zero exit status that
  ### is not an error here
  close $in_1;
  close $in_2;

  close $ct_plus_ga or die "Couldn't close filehandle\n";
  warn "\nCreated C -> T converted version of FastQ file '$file_1' and G -> A converted version of FastQ file '$file_2' ($count sequences in total)\n";

  if ($directional){
    warn "\n";
    return ($CT_plus_GA_infile);
  }

  close $ga_plus_ct or die "Couldn't close filehandle\n";
  warn "Created G -> A converted version of FastQ file '$file_1' and C -> T converted version of FastQ file '$file_2' ($count sequences in total)\n\n";
  return ($CT_plus_GA_infile,$GA_plus_CT_infile);
}
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5134
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5135
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
# Makes a sequence identifier free of blanks.
#
# Takes one identifier string and returns a copy in which every run of
# spaces and/or tabs has been collapsed into a single underscore. All
# other characters (including a trailing newline) are left untouched.
sub fix_IDs{
  (my $sanitised_id = $_[0]) =~ s/[ \t]+/_/g; # one '_' per run of blanks
  return $sanitised_id;
}
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5141
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
# Decides whether a single-end alignment lies on the strand that is
# meaningful for the alignment instance which reported it, and updates
# that instance's counters in the file-level @fhs array.
#
# Arguments:
#   $index  - index into @fhs of the instance that produced the alignment
#   $strand - strand of the alignment, '+' or '-'
#
# The only sensible strand for each instance (selected by its {name}):
#   CTreadCTgenome  C->T read vs C->T genome : '+'
#   CTreadGAgenome  C->T read vs G->A genome : '-'
#   GAreadCTgenome  G->A read vs C->T genome : '-'
#   GAreadGAgenome  G->A read vs G->A genome : '+'
#
# Returns 1 after incrementing {seen} when the strand is the sensible one,
# and 0 after incrementing {wrong_strand} when it is the opposite strand.
# Any other strand value returns 0 and leaves both counters alone.
# Dies if the instance name is not one of the four listed above.
sub ensure_sensical_alignment_orientation_single_end{
  my ($index, $strand) = @_;

  my %sensible_strand_for = (
    'CTreadCTgenome' => '+',
    'CTreadGAgenome' => '-',
    'GAreadCTgenome' => '-',
    'GAreadGAgenome' => '+',
  );

  my $expected_strand = $sensible_strand_for{ $fhs[$index]->{name} };
  unless (defined $expected_strand){
    die "One of the above conditions must be true\n";
  }

  ### the alignment is in the orientation we want for this instance
  if ($strand eq $expected_strand){
    $fhs[$index]->{seen}++;
    return 1;
  }

  ### the alignment is on the opposite strand and therefore nonsensical
  if ($strand eq '+' or $strand eq '-'){
    $fhs[$index]->{wrong_strand}++;
    return 0;
  }

  ### Neither '+' nor '-': return an explicit 0 rather than falling off the
  ### end of the sub, which would hand back the value of the last condition
  return 0;
}
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5214
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
# Decides whether a paired-end alignment has a sensible read order for the
# alignment instance that reported it, and updates that instance's
# counters in the file-level @fhs array.
#
# Arguments:
#   $index     - index into @fhs of the instance that produced the alignment
#   $id_1      - identifier of the first reported read  (must end in 1 or 2)
#   $strand_1  - strand of the first reported read
#   $id_2      - identifier of the second reported read (must end in 2 or 1)
#   $strand_2  - strand of the second reported read
#
# Only pairs whose first reported read is '+' and whose second is '-' are
# accepted. Which read has to come first depends on the instance {name}:
#   CTread1GAread2CTgenome  (index 0) : read 1 first, then read 2
#   GAread1CTread2GAgenome  (index 1) : read 1 first, then read 2
#   GAread1CTread2CTgenome  (index 2) : read 2 first, then read 1
#   CTread1GAread2GAgenome  (index 3) : read 2 first, then read 1
#
# Returns 1 after incrementing {seen} for the sensible order, and 0 after
# incrementing {wrong_strand} for the swapped order. Dies on an unknown
# instance name, or on any id/strand combination other than those two.
sub ensure_sensical_alignment_orientation_paired_ends{
  my ($index,$id_1,$strand_1,$id_2,$strand_2) = @_;

  ### true: read 1 must be reported first; false: read 2 must be reported first
  my %wants_read_1_first = (
    'CTread1GAread2CTgenome' => 1,
    'GAread1CTread2GAgenome' => 1,
    'GAread1CTread2CTgenome' => 0,
    'CTread1GAread2GAgenome' => 0,
  );

  my $instance_name = $fhs[$index]->{name};
  unless (exists $wants_read_1_first{$instance_name}){
    die "One of the above conditions must be true\n";
  }

  my $plus_then_minus = ($strand_1 eq '+' && $strand_2 eq '-');
  my $read_1_then_2   = ($plus_then_minus && $id_1 =~ /1$/ && $id_2 =~ /2$/);
  my $read_2_then_1   = ($plus_then_minus && $id_1 =~ /2$/ && $id_2 =~ /1$/);

  ### anything other than the two arrangements above cannot be classified
  unless ($read_1_then_2 or $read_2_then_1){
    die "id1: $id_1\tid2: $id_2\tThis should be impossible\n";
  }

  my $is_sensible = $wants_read_1_first{$instance_name} ? $read_1_then_2 : $read_2_then_1;

  if ($is_sensible){
    $fhs[$index]->{seen}++;
    return 1;
  }

  ### reads were reported in the swapped order, the alignment is nonsensical
  $fhs[$index]->{wrong_strand}++;
  return 0;
}
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5311
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5312 #####################################################################################################################################################
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5313
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5314 ### Bowtie 1 (default) | PAIRED-END | FASTA
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5315
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
# Starts one Bowtie paired-end alignment per entry of the file-level @fhs
# array (FastA input) and primes each entry with its first reported pair.
#
# Arguments: the names of the four converted input files
#   ($C_to_T_infile_1, $G_to_A_infile_1, $C_to_T_infile_2, $G_to_A_infile_2).
# They are only used for the progress messages; the files that are actually
# aligned are taken from {inputfile_1} / {inputfile_2} of each @fhs entry.
#
# For every entry this sub
#   - skips it (storing undef values) in directional mode if it has no {inputfile_1},
#   - appends ' --norc' or ' --nofw' to $bowtie_options depending on {name},
#   - opens a pipe from Bowtie and stores the handle in {fh},
#   - reads the first two output lines and stores them in {last_line_1} and
#     {last_line_2}, plus the read 1 identifier without its '/1' tag in
#     {last_seq_id}; all three are set to undef if no pair could be read.
#
# Dies if the pipe cannot be opened, or if neither of the first two
# identifiers ends in '/1'.
# Reads $directional, $genome_folder, $bowtie_options, $temp_dir and
# $path_to_bowtie, which are defined outside this sub.
sub paired_end_align_fragments_to_bisulfite_genome_fastA {
  my ($C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2) = @_;

  ## Report the input files and how many Bowtie instances are about to run
  if ($directional){
    warn "Input files are $C_to_T_infile_1 and $G_to_A_infile_2 (FastA)\n";
    warn "Now running 2 instances of Bowtie against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n";
  }
  else{
    warn "Input files are $C_to_T_infile_1 and $G_to_A_infile_1 and $C_to_T_infile_2 and $G_to_A_infile_2 (FastA)\n";
    warn "Now running 4 individual instances of Bowtie against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n";
  }

  my @primed_fields = qw(last_seq_id last_line_1 last_line_2);

  INSTANCE: foreach my $fh (@fhs) {

    ## In directional mode, entries without an input file are not aligned
    if ($directional and not $fh->{inputfile_1}){
      $fh->{$_} = undef for @primed_fields;
      next INSTANCE;
    }

    ### ensuring the alignments are only reported in a sensible manner
    my $wants_forward_only = ($fh->{name} eq 'CTread1GAread2CTgenome' or $fh->{name} eq 'GAread1CTread2GAgenome');
    my $bt_options = $bowtie_options . ($wants_forward_only ? ' --norc' : ' --nofw');

    warn "Now starting a Bowtie paired-end alignment for $fh->{name} (reading in sequences from $temp_dir$fh->{inputfile_1} and $temp_dir$fh->{inputfile_2}, with the options: $bt_options)\n";
    open ($fh->{fh},"$path_to_bowtie $bt_options $fh->{bisulfiteIndex} -1 $temp_dir$fh->{inputfile_1} -2 $temp_dir$fh->{inputfile_2} |") or die "Can't open pipe to bowtie: $!";

    ## The first two lines of the Bowtie output form the first reported pair
    my $line_1 = $fh->{fh}->getline();
    my $line_2 = $fh->{fh}->getline();

    ## No complete pair could be read: initialise everything as undefined
    unless ($line_1 and $line_2) {
      warn "Found no alignment, assigning undef to last_seq_id and last_lines\n";
      $fh->{$_} = undef for @primed_fields;
      next INSTANCE;
    }

    chomp $line_1;
    chomp $line_2;

    ## the sequence identifier is the first tab-separated field of each line
    my ($id_1, $id_2) = map { (split(/\t/,$_))[0] } ($line_1, $line_2);

    ### Bowtie always reports the alignment with the smaller chromosomal position first. This can be either sequence 1 or sequence 2.
    ### We thus work out which of the two lines belongs to read 1 and keep its ID (minus the /1 tag) as last_seq_id
    if ($id_1 =~ s/\/1$//){
      $fh->{last_seq_id} = $id_1;
    }
    elsif ($id_2 =~ s/\/1$//){
      $fh->{last_seq_id} = $id_2;
    }
    else{
      die "Either the first or the second id need to be read 1! ID1 was: $id_1; ID2 was: $id_2\n";
    }

    $fh->{last_line_1} = $line_1; # may hold read 1 or read 2
    $fh->{last_line_2} = $line_2; # may hold read 1 or read 2
    warn "Found first alignment:\n$fh->{last_line_1}\n$fh->{last_line_2}\n";
  }
}
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5394
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5395 ### Bowtie 2 | PAIRED-END | FASTA
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5396
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5397 sub paired_end_align_fragments_to_bisulfite_genome_fastA_bowtie2 {
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5398 my ($C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2) = @_;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5399 if ($directional){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5400 warn "Input files are $C_to_T_infile_1 and $G_to_A_infile_2 (FastA)\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5401 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5402 else{
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5403 warn "Input files are $C_to_T_infile_1 and $G_to_A_infile_1 and $C_to_T_infile_2 and $G_to_A_infile_2 (FastA)\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5404 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5405
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5406 ## Now starting up to 4 instances of Bowtie feeding in the converted sequence files and reading in the first line of the bowtie output, and storing it in the
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5407 ## data structure above
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5408 if ($directional){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5409 warn "Now running 2 instances of Bowtie 2 against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5410 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5411 else{
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5412 warn "Now running 4 individual instances of Bowtie 2 against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5413 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5414
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5415 foreach my $fh (@fhs) {
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5416
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5417 if ($directional){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5418 unless ($fh->{inputfile_1}){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5419 $fh->{last_seq_id} = undef;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5420 $fh->{last_line_1} = undef;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5421 $fh->{last_line_2} = undef;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5422 next;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5423 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5424 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5425
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5426 my $bt2_options = $bowtie_options;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5427 if ($fh->{name} eq 'CTread1GAread2CTgenome' or $fh->{name} eq 'GAread1CTread2GAgenome'){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5428 $bt2_options .= ' --norc'; ### ensuring the alignments are only reported in a sensible manner
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5429 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5430 else {
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5431 $bt2_options .= ' --nofw';
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5432 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5433
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5434 warn "Now starting a Bowtie 2 paired-end alignment for $fh->{name} (reading in sequences from $temp_dir$fh->{inputfile_1} and $temp_dir$fh->{inputfile_2}, with the options: $bt2_options))\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5435 open ($fh->{fh},"$path_to_bowtie $bt2_options $fh->{bisulfiteIndex} -1 $temp_dir$fh->{inputfile_1} -2 $temp_dir$fh->{inputfile_2} |") or die "Can't open pipe to bowtie: $!";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5436
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5437 ### Bowtie 2 writes its output in SAM format, so we need to skip the header lines until the first sequence
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5438 while (1){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5439 $_ = $fh->{fh}->getline();
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5440 if ($_) {
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5441 last unless ($_ =~ /^\@/); # SAM headers start with @
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5442 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5443 else{
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5444 last; # no alignment output
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5445 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5446 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5447
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5448 my $line_1 = $_;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5449 my $line_2 = $fh->{fh}->getline();
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5450
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5451 # if Bowtie produces an alignment we store the first line of the output
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5452 if ($line_1 and $line_2) {
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5453 chomp $line_1;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5454 chomp $line_2;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5455 my $id_1 = (split(/\t/,$line_1))[0]; # this is the first element of the first bowtie output line (= the sequence identifier)
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5456 my $id_2 = (split(/\t/,$line_2))[0]; # this is the first element of the second bowtie output line
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5457
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5458 ### Bowtie always reports the alignment with the smaller chromosomal position first. This can be either sequence 1 or sequence 2.
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5459 ### We will thus identify which sequence was read 1 and store this ID as last_seq_id
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5460
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5461 if ($id_1 =~ s/\/1$//){ # removing the read 1 /1 tag if present (remember that Bowtie2 clips off /1 or /2 line endings itself, so we added /1/1 or /2/2 to start with
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5462 $fh->{last_seq_id} = $id_1;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5463 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5464 elsif ($id_2 =~ s/\/1$//){ # removing the read 1 /2 tag if present
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5465 $fh->{last_seq_id} = $id_2;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5466 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5467 else{
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5468 warn "Either the first or the second id need to be read 1! ID1 was: $id_1; ID2 was: $id_2\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5469 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5470
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5471 $fh->{last_line_1} = $line_1; # this contains either read 1 or read 2
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5472 $fh->{last_line_2} = $line_2; # this contains either read 1 or read 2
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5473 warn "Found first alignment:\n$fh->{last_line_1}\n$fh->{last_line_2}\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5474 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5475 # otherwise we just initialise last_seq_id and last_lines as undefined
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5476 else {
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5477 warn "Found no alignment, assigning undef to last_seq_id and last_lines\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5478 $fh->{last_seq_id} = undef;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5479 $fh->{last_line_1} = undef;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5480 $fh->{last_line_2} = undef;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5481 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5482 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5483 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5484
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5485 ### Bowtie 1 (default) | PAIRED-END | FASTQ
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5486
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
sub paired_end_align_fragments_to_bisulfite_genome_fastQ {
  ### Starts one Bowtie (version 1) paired-end alignment per wanted entry of @fhs and primes each
  ### entry with the first two lines of the aligner output.
  ###
  ### Arguments: the names of the converted read files. Only $C_to_T_infile_1 and $G_to_A_infile_1
  ### are used here, and only for the progress messages; the files that are actually aligned are
  ### taken from each entry's inputfile_1/inputfile_2 fields.
  ###
  ### For every entry of @fhs the keys fh, last_seq_id, last_line_1 and last_line_2 are (re)set.
  ### Dies if the pipe cannot be opened or if neither of the first two output lines carries a
  ### read ID ending in /1.
  my ($C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2) = @_;

  # directional and PBAT runs only use the filehandles that have an input file assigned
  my $reduced_mode = ($directional or $pbat);

  warn $directional ? "Input file is $C_to_T_infile_1 (FastQ)\n"
     : $pbat        ? "Input file is $G_to_A_infile_1 (FastQ; PBAT-Seq)\n"
     :                "Input files are $C_to_T_infile_1 and $G_to_A_infile_1 (FastQ)\n";

  my $instances = $reduced_mode ? '2 instances' : '4 individual instances';
  warn "Now running $instances of Bowtie against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n";

 ALIGNER:
  for my $fh (@fhs) {

    # entries without an input file are not wanted for this library type: blank their state and move on
    if ($reduced_mode and not $fh->{inputfile_1}) {
      @{$fh}{qw(last_seq_id last_line_1 last_line_2)} = ();
      next ALIGNER;
    }

    # these two instances get --norc appended, every other instance gets --nofw
    my $forward_only = ($fh->{name} eq 'CTread1GAread2CTgenome' or $fh->{name} eq 'GAread1CTread2GAgenome');
    my $aligner_opts = $bowtie_options;
    $aligner_opts   .= $forward_only ? ' --norc' : ' --nofw';

    my $command;
    if ($gzip) {
      # NOTE(review): presumably inputfile_1 holds both mates in Bowtie's --12 format when gzip is in use - confirm against the code writing the temp files
      warn "Now starting a Bowtie paired-end alignment for $fh->{name} (reading in sequences from ${temp_dir}$fh->{inputfile_1}, with the options: $aligner_opts)\n";
      $command = "zcat ${temp_dir}$fh->{inputfile_1} | $path_to_bowtie $aligner_opts $fh->{bisulfiteIndex} --12 - |";
    }
    else {
      warn "Now starting a Bowtie paired-end alignment for $fh->{name} (reading in sequences from ${temp_dir}$fh->{inputfile_1} and ${temp_dir}$fh->{inputfile_2}, with the options: $aligner_opts))\n";
      sleep(5); # NOTE(review): the reason for this pause is not evident from this function - kept unchanged
      $command = "$path_to_bowtie $aligner_opts $fh->{bisulfiteIndex} -1 $temp_dir$fh->{inputfile_1} -2 $temp_dir$fh->{inputfile_2} |";
    }
    open ($fh->{fh},$command) or die "Can't open pipe to bowtie: $!";

    # the first two output lines form the first alignment pair (if there is any output at all)
    my $record_a = $fh->{fh}->getline();
    my $record_b = $fh->{fh}->getline();

    unless ($record_a and $record_b) {
      warn "Found no alignment, assigning undef to last_seq_id and last_lines\n";
      @{$fh}{qw(last_seq_id last_line_1 last_line_2)} = ();
      next ALIGNER;
    }

    chomp ($record_a, $record_b);

    ### Bowtie always reports the alignment with the smaller chromosomal position first, which can be
    ### either read 1 or read 2. The ID that carries the /1 tag is stored (without the tag) as last_seq_id
    my $id_a = (split /\t/, $record_a)[0]; # sequence identifier of the first output line
    my $id_b = (split /\t/, $record_b)[0]; # sequence identifier of the second output line

    if ($id_a =~ s{/1$}{}) {
      $fh->{last_seq_id} = $id_a;
    }
    elsif ($id_b =~ s{/1$}{}) {
      $fh->{last_seq_id} = $id_b;
    }
    else {
      die "Either the first or the second id need to be read 1! ID1 was: $id_a; ID2 was: $id_b\n";
    }

    # either line may hold read 1 or read 2
    @{$fh}{qw(last_line_1 last_line_2)} = ($record_a, $record_b);
    warn "Found first alignment:\n$fh->{last_line_1}\n$fh->{last_line_2}\n";
  }
}
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5575
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5576 ### Bowtie 2 | PAIRED-END | FASTQ
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5577
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
sub paired_end_align_fragments_to_bisulfite_genome_fastQ_bowtie2 {
  ### Starts one Bowtie 2 paired-end alignment per wanted entry of @fhs, skips the SAM header of
  ### its output and primes the entry with the first two non-header lines.
  ###
  ### Arguments: the names of the four converted read files; they are only used for the progress
  ### messages. The files that are actually aligned come from each entry's inputfile_1/inputfile_2.
  ###
  ### For every entry of @fhs the keys fh, last_seq_id, last_line_1 and last_line_2 are (re)set.
  ### Dies if the pipe cannot be opened or if neither of the first two records carries a read ID
  ### ending in /1.
  my ($C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2) = @_;

  if ($directional){
    warn "Input files are $C_to_T_infile_1 and $G_to_A_infile_2 (FastQ)\n";
  }
  else{
    warn "Input files are $C_to_T_infile_1 and $G_to_A_infile_1 and $C_to_T_infile_2 and $G_to_A_infile_2 (FastQ)\n";
  }

  ## Now starting up the instances of Bowtie 2, feeding in the converted sequence files, reading in the
  ## first pair of output lines and storing it in the @fhs data structure
  if ($directional){
    warn "Now running 2 instances of Bowtie 2 against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n";
  }
  else{
    warn "Now running 4 individual instances of Bowtie 2 against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n";
  }

  foreach my $fh (@fhs) {

    # in directional mode, entries without an input file are not wanted: blank their state and skip them
    if ($directional){
      unless ($fh->{inputfile_1}){
        $fh->{last_seq_id} = undef;
        $fh->{last_line_1} = undef;
        $fh->{last_line_2} = undef;
        next;
      }
    }

    # these two instances get --norc appended, every other instance gets --nofw
    my $bt2_options = $bowtie_options;
    if ($fh->{name} eq 'CTread1GAread2CTgenome' or $fh->{name} eq 'GAread1CTread2GAgenome'){
      $bt2_options .= ' --norc'; ### ensuring the alignments are only reported in a sensible manner
    }
    else {
      $bt2_options .= ' --nofw';
    }

    warn "Now starting a Bowtie 2 paired-end alignment for $fh->{name} (reading in sequences from $temp_dir$fh->{inputfile_1} and $temp_dir$fh->{inputfile_2}, with the options: $bt2_options))\n";
    open ($fh->{fh},"$path_to_bowtie $bt2_options $fh->{bisulfiteIndex} -1 $temp_dir$fh->{inputfile_1} -2 $temp_dir$fh->{inputfile_2} |") or die "Can't open pipe to bowtie: $!";

    ### Bowtie 2 writes SAM format, so we need to skip the header lines (starting with @) until the first record.
    ### A lexical variable is used on purpose: assigning to the global $_ here would overwrite the caller's $_
    ### (and any array element it happens to be aliased to). getline() returns undef at end of output, so we
    ### test for definedness rather than truth.
    my $line_1;
    while (defined ($line_1 = $fh->{fh}->getline())){
      last unless ($line_1 =~ /^\@/); # SAM headers start with @
    }
    # $line_1 is now the first non-header line, or undef if there was no alignment output

    my $line_2 = $fh->{fh}->getline();

    # if Bowtie 2 produced output we store the first pair of lines
    if (defined $line_1 and defined $line_2) {
      chomp $line_1;
      chomp $line_2;

      ### Bowtie always reports the alignment with the smaller chromosomal position first. This can be either sequence 1 or sequence 2.
      ### We will thus identify which sequence was read 1 and store this ID as last_seq_id
      my $id_1 = (split(/\t/,$line_1))[0]; # this is the first element of the first output line (= the sequence identifier)
      my $id_2 = (split(/\t/,$line_2))[0]; # this is the first element of the second output line

      if ($id_1 =~ s/\/1$//){ # removing the read 1 tag if present (Bowtie 2 clips off a trailing /1 or /2 itself, so the reads were given /1/1 or /2/2 to start with)
        $fh->{last_seq_id} = $id_1;
      }
      elsif ($id_2 =~ s/\/1$//){ # removing the read 1 tag if present
        $fh->{last_seq_id} = $id_2;
      }
      else{
        die "Either the first or the second id need to be read 1! ID1 was: $id_1; ID2 was: $id_2\n";
      }

      $fh->{last_line_1} = $line_1; # this contains read 1 or read 2
      $fh->{last_line_2} = $line_2; # this contains read 1 or read 2
      warn "Found first alignment:\n$fh->{last_line_1}\n$fh->{last_line_2}\n";
    }

    # otherwise we just initialise last_seq_id and last_lines as undefined
    else {
      warn "Found no alignment, assigning undef to last_seq_id and last_lines\n";
      $fh->{last_seq_id} = undef;
      $fh->{last_line_1} = undef;
      $fh->{last_line_2} = undef;
    }
  }
}
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5666
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5667 #####################################################################################################################################################
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5668
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5669 ### Bowtie 1 (default) | SINGLE-END | FASTA
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
sub single_end_align_fragments_to_bisulfite_genome_fastA {
  ### Starts one Bowtie (version 1) single-end alignment per entry of @fhs and primes each entry
  ### with the first line of the aligner output.
  ###
  ### Arguments: the names of the C->T and G->A converted read files; they are only used for the
  ### progress messages. The file that is actually aligned comes from each entry's inputfile field.
  ###
  ### For every entry of @fhs the keys fh, last_seq_id and last_line are (re)set.
  ### Dies if the pipe to Bowtie cannot be opened.
  my ($C_to_T_infile,$G_to_A_infile) = @_;

  if ($directional){
    warn "Input file is $C_to_T_infile (FastA)\n";
  }
  else{
    warn "Input files are $C_to_T_infile and $G_to_A_infile (FastA)\n";
  }

  ## Now starting the instances of Bowtie, feeding in the converted sequence files, reading in the first
  ## line of the Bowtie output and storing it in the @fhs data structure
  if ($directional){
    warn "Now running 2 instances of Bowtie against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n";
  }
  else{
    warn "Now running 4 individual instances of Bowtie against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n";
  }

  foreach my $fh (@fhs) {

    # these two instances get --norc appended, every other instance gets --nofw
    my $bt_options = $bowtie_options;
    if ($fh->{name} eq 'CTreadCTgenome' or $fh->{name} eq 'GAreadGAgenome'){
      $bt_options .= ' --norc'; ### ensuring the alignments are only reported in a sensible manner
    }
    else {
      $bt_options .= ' --nofw';
    }

    warn "Now starting the Bowtie aligner for $fh->{name} (reading in sequences from $temp_dir$fh->{inputfile} with options: $bt_options)\n";
    if ($gzip){
      # compressed input is decompressed by zcat and handed to Bowtie on stdin
      open ($fh->{fh},"zcat $temp_dir$fh->{inputfile} | $path_to_bowtie $bt_options $fh->{bisulfiteIndex} - |") or die "Can't open pipe to bowtie: $!";
    }
    else{
      open ($fh->{fh},"$path_to_bowtie $bt_options $fh->{bisulfiteIndex} $temp_dir$fh->{inputfile} |") or die "Can't open pipe to bowtie: $!"; # command for uncompressed data
    }

    # A lexical variable is used on purpose: assigning to the global $_ here would overwrite the caller's $_
    # (and any array element it happens to be aliased to). getline() returns undef when there is no output,
    # so we test for definedness rather than truth.
    my $first_line = $fh->{fh}->getline();

    # if Bowtie produces an alignment we store the first line of the output
    if (defined $first_line) {
      chomp $first_line;
      my $id = (split(/\t/,$first_line))[0]; # this is the first element of the bowtie output (= the sequence identifier)
      $fh->{last_seq_id} = $id;
      $fh->{last_line} = $first_line;
      warn "Found first alignment:\t$fh->{last_line}\n";
    }
    # otherwise we just initialise last_seq_id and last_line as undefined
    else {
      warn "Found no alignment, assigning undef to last_seq_id and last_line\n";
      $fh->{last_seq_id} = undef;
      $fh->{last_line} = undef;
    }
  }
}
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5723
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5724 ### Bowtie 2 | SINGLE-END | FASTA
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5725 sub single_end_align_fragments_to_bisulfite_genome_fastA_bowtie2 {
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5726 my ($C_to_T_infile,$G_to_A_infile) = @_;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5727 if ($directional){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5728 warn "Input file is $C_to_T_infile (FastA)\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5729 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5730 else{
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5731 warn "Input files are $C_to_T_infile and $G_to_A_infile (FastA)\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5732 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5733
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5734 ## Now starting up to 4 instances of Bowtie feeding in the converted sequence files and reading in the first line of the bowtie output, and storing it in
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5735 ## data structure above
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5736 if ($directional){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5737 warn "Now running 2 instances of Bowtie 2 against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5738 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5739 else{
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5740 warn "Now running 4 individual instances of Bowtie 2 against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5741 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5742
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5743 foreach my $fh (@fhs) {
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5744
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5745 my $bt2_options = $bowtie_options;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5746 if ($fh->{name} eq 'CTreadCTgenome' or $fh->{name} eq 'GAreadGAgenome'){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5747 $bt2_options .= ' --norc'; ### ensuring the alignments are only reported in a sensible manner
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5748 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5749 else {
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5750 $bt2_options .= ' --nofw';
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5751 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5752
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5753 warn "Now starting the Bowtie 2 aligner for $fh->{name} (reading in sequences from $temp_dir$fh->{inputfile} with options: $bt2_options)\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5754 open ($fh->{fh},"$path_to_bowtie $bt2_options $fh->{bisulfiteIndex} -U $temp_dir$fh->{inputfile} |") or die "Can't open pipe to bowtie: $!";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5755
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5756 ### Bowtie 2 writes its output in SAM format, so we need to skip the header lines until the first sequence
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5757 while (1){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5758 $_ = $fh->{fh}->getline();
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5759 if ($_) {
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5760 last unless ($_ =~ /^\@/); # SAM headers start with @
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5761 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5762 else{
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5763 last; # no alignment output
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5764 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5765 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5766
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5767 # Bowtie 2 outputs a result line even for sequences without any alignments. We thus store the first line of the output
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5768 if ($_) {
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5769 chomp;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5770 my $id = (split(/\t/))[0]; # this is the first element of the Bowtie output (= the sequence identifier)
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5771 $fh->{last_seq_id} = $id;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5772 $fh->{last_line} = $_;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5773 warn "Found first alignment:\t$fh->{last_line}\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5774 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5775 # otherwise we just initialise last_seq_id and last_line as undefined. This should only happen at the end of a file for Bowtie 2 output
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5776 else {
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5777 warn "Found no alignment, assigning undef to last_seq_id and last_line\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5778 $fh->{last_seq_id} = undef;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5779 $fh->{last_line} = undef;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5780 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5781 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5782 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5783
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5784
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5785 ### Bowtie 1 (default) | SINGLE-END | FASTQ
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
### Starts one Bowtie 1 process per entry of @fhs for single-end FastQ input and primes each
### entry with the first line of alignment output.
###
### Arguments: ($C_to_T_infile, $G_to_A_infile) - names of the converted read files. They are only
###            used for the progress messages; the file each Bowtie instance actually reads is
###            taken from $fh->{inputfile}.
### Uses variables from the enclosing file scope: $directional, $pbat, $gzip, $genome_folder,
###            $bowtie_options, $path_to_bowtie, $temp_dir and @fhs.
### Effects:   for every entry of @fhs, {fh} becomes a read pipe from Bowtie, and {last_seq_id} /
###            {last_line} hold the ID and the chomped text of the first output line (both undef
###            if Bowtie returned nothing). Dies if a pipe cannot be opened.
sub single_end_align_fragments_to_bisulfite_genome_fastQ {
  my ($C_to_T_infile,$G_to_A_infile) = @_;
  if ($directional){
    warn "Input file is $C_to_T_infile (FastQ)\n";
  }
  elsif($pbat){
    warn "Input file is $G_to_A_infile (FastQ)\n";
  }
  else{
    warn "Input files are $C_to_T_infile and $G_to_A_infile (FastQ)\n";
  }

  ## Now starting up to 4 instances of Bowtie feeding in the converted sequence files and reading in the
  ## first line of the Bowtie output, and storing it in the @fhs data structure
  if ($directional or $pbat){
    warn "Now running 2 instances of Bowtie against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n";
  }
  else{
    warn "Now running 4 individual instances of Bowtie against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n";
  }

  foreach my $fh (@fhs) {
    ## Per-instance options: the common options plus a strand restriction
    my $bt_options = $bowtie_options;
    if ($fh->{name} eq 'CTreadCTgenome' or $fh->{name} eq 'GAreadGAgenome'){
      $bt_options .= ' --norc'; ### ensuring the alignments are only reported in a sensible manner
    }
    else {
      $bt_options .= ' --nofw';
    }

    warn "Now starting the Bowtie aligner for $fh->{name} (reading in sequences from $temp_dir$fh->{inputfile} with options: $bt_options)\n";
    sleep (5);

    ## The commands must use $bt_options (not the bare $bowtie_options), otherwise the --norc/--nofw
    ## restriction announced in the message above would never reach Bowtie
    if ($gzip){
      # compressed input is decompressed on the fly and handed to Bowtie via STDIN ('-')
      open ($fh->{fh},"zcat $temp_dir$fh->{inputfile} | $path_to_bowtie $bt_options $fh->{bisulfiteIndex} - |") or die "Can't open pipe to bowtie: $!";
    }
    else{
      open ($fh->{fh},"$path_to_bowtie $bt_options $fh->{bisulfiteIndex} $temp_dir$fh->{inputfile} |") or die "Can't open pipe to bowtie: $!"; # command for uncompressed data
    }

    # if Bowtie produces an alignment we store the first line of the output. A lexical variable is
    # used so that the caller's $_ is left untouched
    my $first_line = $fh->{fh}->getline();
    if ($first_line) {
      chomp $first_line;
      my $id = (split(/\t/, $first_line))[0]; # this is the first element of the Bowtie output (= the sequence identifier)
      $fh->{last_seq_id} = $id;
      $fh->{last_line} = $first_line;
      warn "Found first alignment:\t$fh->{last_line}\n";
    }
    # otherwise we just initialise last_seq_id and last_line as undefined
    else {
      warn "Found no alignment, assigning undef to last_seq_id and last_line\n";
      $fh->{last_seq_id} = undef;
      $fh->{last_line} = undef;
    }
  }
}
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5844
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5845 ### Bowtie 2 | SINGLE-END | FASTQ
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
### Launches one Bowtie 2 process per entry of @fhs for single-end FastQ input, skips the SAM
### header of each output stream and remembers the first record.
###
### Arguments: ($C_to_T_infile, $G_to_A_infile) - names of the converted read files; only used
###            for the progress messages (each instance reads $temp_dir . $fh->{inputfile}).
### Uses variables from the enclosing file scope: $directional, $genome_folder, $bowtie_options,
###            $path_to_bowtie, $temp_dir and @fhs.
### Effects:   sets {fh}, {last_seq_id} and {last_line} on every entry of @fhs (the latter two are
###            undef if no record followed the header). Dies if a pipe cannot be opened.
###            The global $_ is used as the line buffer, exactly as before.
sub single_end_align_fragments_to_bisulfite_genome_fastQ_bowtie2 {

  my ($C_to_T_infile,$G_to_A_infile) = @_;

  ## Tell the user which converted file(s) are about to be aligned
  warn $directional
    ? "Input file is $C_to_T_infile (FastQ)\n\n"
    : "Input files are $C_to_T_infile and $G_to_A_infile (FastQ)\n\n";

  ## Up to 4 instances of Bowtie 2 are started below, one per entry of @fhs
  warn $directional
    ? "Now running 2 instances of Bowtie 2 against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n"
    : "Now running 4 individual instances of Bowtie 2 against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n";

  foreach my $fh (@fhs) {

    ## Reads and genome converted the same way are aligned forward-only (--norc), all other
    ## combinations reverse-complement-only (--nofw)
    my $same_conversion = ($fh->{name} eq 'CTreadCTgenome' or $fh->{name} eq 'GAreadGAgenome');
    my $bt2_options = $bowtie_options . ($same_conversion ? ' --norc' : ' --nofw');

    warn "Now starting the Bowtie 2 aligner for $fh->{name} (reading in sequences from $temp_dir$fh->{inputfile} with options $bt2_options)\n";
    warn "Using Bowtie 2 index: $fh->{bisulfiteIndex}\n\n";

    open ($fh->{fh},"$path_to_bowtie $bt2_options $fh->{bisulfiteIndex} -U $temp_dir$fh->{inputfile} |") or die "Can't open pipe to bowtie: $!";

    ## Bowtie 2 writes SAM: keep reading while we are still looking at header lines (they start
    ## with @); stop at the first record or when the stream is exhausted
    do {
      $_ = $fh->{fh}->getline();
    } while ($_ and $_ =~ /^\@/);

    ## Nothing followed the header: initialise both fields as undefined and move on
    unless ($_) {
      warn "Found no alignment, assigning undef to last_seq_id and last_line\n";
      $fh->{last_seq_id} = undef;
      $fh->{last_line} = undef;
      next;
    }

    ## Bowtie 2 outputs a result line even for sequences without any alignments, so the first
    ## record is always stored
    chomp;
    $fh->{last_seq_id} = (split(/\t/))[0]; # first column of the Bowtie 2 output (= the sequence identifier)
    $fh->{last_line} = $_;
    warn "Found first alignment:\t$fh->{last_line}\n";
  }
}
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5906
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5907 ###########################################################################################################################################
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5908
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
### Resets the file-level %counting hash (every counter back to 0) and rebuilds the file-level
### @fhs list describing the aligner instances that are needed for the given input.
###
### Argument:  $filename - the input file name; a name containing a comma is treated as
###            paired-end, anything else as single-end. It is only inspected when $directional
###            or $pbat is set.
### Uses variables from the enclosing file scope: $directional, $pbat, $CT_index_basename,
###            $GA_index_basename.
###
### Each entry of @fhs is a hash ref with the keys name, strand_identity, bisulfiteIndex, seen and
### wrong_strand. Which entries are created:
###   directional, paired-end : all four
###   directional, single-end : CTreadCTgenome, CTreadGAgenome
###   pbat,        paired-end : all four
###   pbat,        single-end : GAreadCTgenome, GAreadGAgenome
###   otherwise               : all four
sub reset_counters_and_fhs{
  my $filename = shift;

  ## Strand counters:
  ##   CT_CT_count     (CT read/CT genome, original top strand)
  ##   CT_GA_count     (CT read/GA genome, original bottom strand)
  ##   GA_CT_count     (GA read/CT genome, complementary to original top strand)
  ##   GA_GA_count     (GA read/GA genome, complementary to original bottom strand)
  ##   CT_GA_CT_count  (CT read1/GA read2/CT genome, original top strand)
  ##   GA_CT_GA_count  (GA read1/CT read2/GA genome, complementary to original bottom strand)
  ##   GA_CT_CT_count  (GA read1/CT read2/CT genome, complementary to original top strand)
  ##   CT_GA_GA_count  (CT read1/GA read2/GA genome, original bottom strand)
  ## alignments_rejected_count is only relevant if --directional was specified
  my @counter_names = qw(
    total_meCHH_count
    total_meCHG_count
    total_meCpG_count
    total_meC_unknown_count
    total_unmethylated_CHH_count
    total_unmethylated_CHG_count
    total_unmethylated_CpG_count
    total_unmethylated_C_unknown_count
    sequences_count
    no_single_alignment_found
    unsuitable_sequence_count
    genomic_sequence_could_not_be_extracted_count
    unique_best_alignment_count
    low_complexity_alignments_overruled_count
    CT_CT_count
    CT_GA_count
    GA_CT_count
    GA_GA_count
    CT_GA_CT_count
    GA_CT_GA_count
    GA_CT_CT_count
    CT_GA_GA_count
    alignments_rejected_count
  );
  %counting = map { ($_ => 0) } @counter_names;

  ## The four possible aligner instances, in their canonical order
  my @all_instances = map {
    my ($name, $strand_identity, $index_basename) = @$_;
    +{
      name            => $name,
      strand_identity => $strand_identity,
      bisulfiteIndex  => $index_basename,
      seen            => 0,
      wrong_strand    => 0,
    };
  } (
    [ 'CTreadCTgenome', 'con ori forward',       $CT_index_basename ],
    [ 'CTreadGAgenome', 'con ori reverse',       $GA_index_basename ],
    [ 'GAreadCTgenome', 'compl ori con forward', $CT_index_basename ],
    [ 'GAreadGAgenome', 'compl ori con reverse', $GA_index_basename ],
  );

  ## By default all four instances are used; single-end directional and single-end pbat
  ## libraries only need two of them
  my @selected = (0 .. 3);
  if ($directional){
    @selected = (0, 1) unless ($filename =~ ','); # single-end files
  }
  elsif($pbat){
    @selected = (2, 3) unless ($filename =~ ','); # single-end files
  }

  @fhs = @all_instances[@selected];
}
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6058
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6059
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6060 sub process_command_line{
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6061 my @bowtie_options;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6062 my $help;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6063 my $mates1;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6064 my $mates2;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6065 my $path_to_bowtie;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6066 my $fastq;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6067 my $fasta;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6068 my $skip;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6069 my $qupto;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6070 my $phred64;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6071 my $phred33;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6072 my $solexa;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6073 my $mismatches;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6074 my $seed_length;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6075 my $best;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6076 my $sequence_format;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6077 my $version;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6078 my $quiet;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6079 my $chunk;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6080 my $non_directional;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6081 my $ceiling;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6082 my $maxins;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6083 my $minins;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6084 my $unmapped;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6085 my $multi_map;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6086 my $output_dir;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6087 my $bowtie2;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6088 my $vanilla;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6089 my $sam_no_hd;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6090 my $seed_extension_fails;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6091 my $reseed_repetitive_seeds;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6092 my $most_valid_alignments;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6093 my $score_min;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6094 my $parallel;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6095 my $temp_dir;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6096 my $rdg;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6097 my $rfg;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6098 my $non_bs_mm;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6099 my $samtools_path;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6100 my $bam;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6101 my $gzip;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6102 my $pbat;
3
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
6103 my $prefix;
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
6104 my $old_flag;
0
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6105
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6106 my $command_line = GetOptions ('help|man' => \$help,
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6107 '1=s' => \$mates1,
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6108 '2=s' => \$mates2,
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6109 'path_to_bowtie=s' => \$path_to_bowtie,
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6110 'f|fasta' => \$fasta,
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6111 'q|fastq' => \$fastq,
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6112 's|skip=i' => \$skip,
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6113 'u|upto=i' => \$qupto,
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6114 'phred33-quals' => \$phred33,
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6115 'phred64-quals|solexa1' => \$phred64,
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6116 'solexa-quals' => \$solexa,
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6117 'n|seedmms=i' => \$mismatches,
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6118 'l|seedlen=i' => \$seed_length,
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6119 'no_best' => \$best,
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6120 'version' => \$version,
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6121 'quiet' => \$quiet,
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6122 'chunkmbs=i' => \$chunk,
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6123 'non_directional' => \$non_directional,
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6124 'I|minins=i' => \$minins,
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6125 'X|maxins=i' => \$maxins,
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6126 'e|maqerr=i' => \$ceiling,
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6127 'un|unmapped' => \$unmapped,
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6128 'ambiguous' => \$multi_map,
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6129 'o|output_dir=s' => \$output_dir,
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6130 'bowtie2' => \$bowtie2,
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6131 'vanilla' => \$vanilla,
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6132 'sam-no-hd' => \$sam_no_hd,
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6133 'D=i' => \$seed_extension_fails,
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6134 'R=i' => \$reseed_repetitive_seeds,
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6135 'score_min=s' => \$score_min,
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6136 'most_valid_alignments=i' => \$most_valid_alignments,
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6137 'p=i' => \$parallel,
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6138 'temp_dir=s' => \$temp_dir,
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6139 'rdg=s' => \$rdg,
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6140 'rfg=s' => \$rfg,
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6141 'non_bs_mm' => \$non_bs_mm,
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6142 'samtools_path=s' => \$samtools_path,
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6143 'bam' => \$bam,
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6144 'gzip' => \$gzip,
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6145 'pbat' => \$pbat,
3
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
6146 'prefix=s' => \$prefix,
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
6147 'old_flag' => \$old_flag,
0
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6148 );
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6149
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6150
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6151 ### EXIT ON ERROR if there were errors with any of the supplied options
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6152 unless ($command_line){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6153 die "Please respecify command line options\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6154 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6155 ### HELPFILE
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6156 if ($help){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6157 print_helpfile();
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6158 exit;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6159 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6160 if ($version){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6161 print << "VERSION";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6162
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6163
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6164 Bismark - Bisulfite Mapper and Methylation Caller.
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6165
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6166 Bismark Version: $bismark_version
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6167 Copyright 2010-13 Felix Krueger, Babraham Bioinformatics
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6168 www.bioinformatics.babraham.ac.uk/projects/
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6169
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6170
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6171 VERSION
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6172 exit;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6173 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6174
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6175
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6176 ##########################
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6177 ### PROCESSING OPTIONS ###
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6178 ##########################
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6179
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6180 unless ($bowtie2){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6181 $bowtie2 = 0;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6182 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6183 unless ($sam_no_hd){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6184 $sam_no_hd =0;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6185 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6186
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6187 ### PATH TO BOWTIE
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6188 ### if a special path to Bowtie 1/2 was specified we will use that one, otherwise it is assumed that Bowtie 1/2 is in the PATH
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6189 if ($path_to_bowtie){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6190 unless ($path_to_bowtie =~ /\/$/){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6191 $path_to_bowtie =~ s/$/\//;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6192 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6193 if (-d $path_to_bowtie){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6194 if ($bowtie2){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6195 $path_to_bowtie = "${path_to_bowtie}bowtie2";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6196 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6197 else{
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6198 $path_to_bowtie = "${path_to_bowtie}bowtie";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6199 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6200 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6201 else{
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6202 die "The path to bowtie provided ($path_to_bowtie) is invalid (not a directory)!\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6203 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6204 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6205 else{
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6206 if ($bowtie2){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6207 $path_to_bowtie = 'bowtie2';
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6208 warn "Path to Bowtie 2 specified as: $path_to_bowtie\n"; }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6209 else{
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6210 $path_to_bowtie = 'bowtie';
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6211 warn "Path to Bowtie specified as: $path_to_bowtie\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6212 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6213 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6214
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6215 ### OUTPUT REQUESTED AS BAM FILE
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6216 if ($bam){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6217 if ($vanilla){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6218 die "Specifying BAM output is not compatible with \"--vanilla\" format. Please respecify\n\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6219 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6220
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6221 ### PATH TO SAMTOOLS
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6222 if (defined $samtools_path){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6223 # if Samtools was specified as full command
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6224 if ($samtools_path =~ /samtools$/){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6225 if (-e $samtools_path){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6226 # Samtools executable found
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6227 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6228 else{
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6229 die "Could not find an installation of Samtools at the location $samtools_path. Please respecify\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6230 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6231 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6232 else{
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6233 unless ($samtools_path =~ /\/$/){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6234 $samtools_path =~ s/$/\//;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6235 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6236 $samtools_path .= 'samtools';
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6237 if (-e $samtools_path){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6238 # Samtools executable found
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6239 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6240 else{
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6241 die "Could not find an installation of Samtools at the location $samtools_path. Please respecify\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6242 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6243 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6244
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6245 warn "Alignments will be written out in BAM format. Samtools path provided as: '$samtools_path'\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6246 $bam = 1;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6247 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6248 # Check whether Samtools is in the PATH if no path was supplied by the user
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6249 else{
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6250 if (!system "which samtools >/dev/null 2>&1"){ # STDOUT is binned, STDERR is redirected to STDOUT. Returns 0 if samtools is in the PATH
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6251 $samtools_path = `which samtools`;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6252 chomp $samtools_path;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6253 warn "Alignments will be written out in BAM format. Samtools found here: '$samtools_path'\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6254 $bam = 1;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6255 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6256 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6257
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6258 unless (defined $samtools_path){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6259 $bam = 2;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6260 warn "Did not find Samtools on the system. Alignments will be compressed with GZIP instead (.sam.gz)\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6261 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6262 sleep (1);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6263 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6264
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6265
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6266 ####################################
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6267 ### PROCESSING ARGUMENTS
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6268
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6269 ### GENOME FOLDER
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6270 my $genome_folder = shift @ARGV; # mandatory
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6271 unless ($genome_folder){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6272 warn "Genome folder was not specified!\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6273 print_helpfile();
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6274 exit;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6275 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6276
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6277 ### checking that the genome folder, all subfolders and the required bowtie index files exist
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6278 unless ($genome_folder =~/\/$/){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6279 $genome_folder =~ s/$/\//;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6280 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6281
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6282 if (chdir $genome_folder){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6283 my $absolute_genome_folder = getcwd; ## making the genome folder path absolute
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6284 unless ($absolute_genome_folder =~/\/$/){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6285 $absolute_genome_folder =~ s/$/\//;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6286 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6287 warn "Reference genome folder provided is $genome_folder\t(absolute path is '$absolute_genome_folder)'\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6288 $genome_folder = $absolute_genome_folder;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6289 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6290 else{
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6291 die "Failed to move to $genome_folder: $!\nUSAGE: bismark [options] <genome_folder> {-1 <mates1> -2 <mates2> | <singles>} [<hits>] (--help for more details)\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6292 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6293
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6294 my $CT_dir = "${genome_folder}Bisulfite_Genome/CT_conversion/";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6295 my $GA_dir = "${genome_folder}Bisulfite_Genome/GA_conversion/";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6296
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6297 if ($bowtie2){ ### Bowtie 2 (new)
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6298 ### checking the integrity of $CT_dir
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6299 chdir $CT_dir or die "Failed to move to directory $CT_dir: $!\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6300 my @CT_bowtie_index = ('BS_CT.1.bt2','BS_CT.2.bt2','BS_CT.3.bt2','BS_CT.4.bt2','BS_CT.rev.1.bt2','BS_CT.rev.2.bt2');
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6301 foreach my $file(@CT_bowtie_index){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6302 unless (-f $file){
3
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
6303 die "The Bowtie 2 index of the C->T converted genome seems to be faulty ($file doesn't exist). Please run the bismark_genome_preparation before running Bismark\n";
0
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6304 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6305 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6306 ### checking the integrity of $GA_dir
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6307 chdir $GA_dir or die "Failed to move to directory $GA_dir: $!\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6308 my @GA_bowtie_index = ('BS_GA.1.bt2','BS_GA.2.bt2','BS_GA.3.bt2','BS_GA.4.bt2','BS_GA.rev.1.bt2','BS_GA.rev.2.bt2');
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6309 foreach my $file(@GA_bowtie_index){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6310 unless (-f $file){
3
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
6311 die "The Bowtie 2 index of the G->A converted genome seems to be faulty ($file doesn't exist). Please run bismark_genome_preparation before running Bismark\n";
0
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6312 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6313 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6314 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6315
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6316 else{ ### Bowtie 1 (default)
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6317 ### checking the integrity of $CT_dir
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6318 chdir $CT_dir or die "Failed to move to directory $CT_dir: $!\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6319 my @CT_bowtie_index = ('BS_CT.1.ebwt','BS_CT.2.ebwt','BS_CT.3.ebwt','BS_CT.4.ebwt','BS_CT.rev.1.ebwt','BS_CT.rev.2.ebwt');
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6320 foreach my $file(@CT_bowtie_index){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6321 unless (-f $file){
3
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
6322 die "The Bowtie index of the C->T converted genome seems to be faulty ($file doesn't exist). Please run bismark_genome_preparation before running Bismark.\n";
0
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6323 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6324 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6325 ### checking the integrity of $GA_dir
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6326 chdir $GA_dir or die "Failed to move to directory $GA_dir: $!\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6327 my @GA_bowtie_index = ('BS_GA.1.ebwt','BS_GA.2.ebwt','BS_GA.3.ebwt','BS_GA.4.ebwt','BS_GA.rev.1.ebwt','BS_GA.rev.2.ebwt');
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6328 foreach my $file(@GA_bowtie_index){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6329 unless (-f $file){
3
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
6330 die "The Bowtie index of the G->A converted genome seems to be faulty ($file doesn't exist). Please run bismark_genome_preparation before running Bismark.\n";
0
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6331 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6332 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6333 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6334
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6335 my $CT_index_basename = "${CT_dir}BS_CT";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6336 my $GA_index_basename = "${GA_dir}BS_GA";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6337
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6338 ### INPUT OPTIONS
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6339
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6340 ### SEQUENCE FILE FORMAT
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6341 ### exits if both fastA and FastQ were specified
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6342 if ($fasta and $fastq){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6343 die "Only one sequence filetype can be specified (fastA or fastQ)\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6344 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6345
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6346 ### unless fastA is specified explicitly, fastQ sequence format is expected by default
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6347 if ($fasta){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6348 print "FastA format specified\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6349 $sequence_format = 'FASTA';
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6350 push @bowtie_options, '-f';
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6351 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6352 elsif ($fastq){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6353 print "FastQ format specified\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6354 $sequence_format = 'FASTQ';
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6355 push @bowtie_options, '-q';
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6356 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6357 else{
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6358 $fastq = 1;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6359 print "FastQ format assumed (by default)\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6360 $sequence_format = 'FASTQ';
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6361 push @bowtie_options, '-q';
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6362 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6363
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6364 ### SKIP
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6365 if ($skip){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6366 warn "Skipping the first $skip reads from the input file\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6367 # push @bowtie_options,"-s $skip";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6368 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6369
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6370 ### UPTO
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6371 if ($qupto){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6372 warn "Processing sequences up to read no. $qupto from the input file\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6373 if ($bowtie2){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6374 # push @bowtie_options,"--upto $qupto"; ## slightly changed for Bowtie 2
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6375 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6376 else{
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6377 # push @bowtie_options,"--qupto $qupto";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6378 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6379 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6380
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6381 ### QUALITY VALUES
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6382 if (($phred33 and $phred64) or ($phred33 and $solexa) or ($phred64 and $solexa)){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6383 die "You can only specify one type of quality value at a time! (--phred33-quals or --phred64-quals or --solexa-quals)";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6384 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6385 if ($phred33){ ## if nothing else is specified $phred33 will be used as default by both Bowtie 1 and 2.
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6386 # Phred quality values work only when -q is specified
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6387 unless ($fastq){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6388 die "Phred quality values works only when -q (FASTQ) is specified\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6389 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6390 if ($bowtie2){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6391 push @bowtie_options,"--phred33";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6392 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6393 else{
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6394 push @bowtie_options,"--phred33-quals";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6395 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6396 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6397 if ($phred64){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6398 # Phred quality values work only when -q is specified
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6399 unless ($fastq){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6400 die "Phred quality values work only when -q (FASTQ) is specified\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6401 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6402 if ($bowtie2){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6403 push @bowtie_options,"--phred64";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6404 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6405 else{
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6406 push @bowtie_options,"--phred64-quals";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6407 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6408 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6409 else{
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6410 $phred64 = 0;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6411 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6412
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6413 if ($solexa){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6414 if ($bowtie2){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6415 die "The option '--solexa-quals' is not compatible with Bowtie 2. Please respecify!\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6416 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6417 # Solexa to Phred value conversion works only when -q is specified
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6418 unless ($fastq){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6419 die "Conversion from Solexa to Phred quality values works only when -q (FASTQ) is specified\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6420 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6421 push @bowtie_options,"--solexa-quals";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6422 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6423 else{
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6424 $solexa = 0;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6425 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6426
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6427 ### ALIGNMENT OPTIONS
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6428
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6429 ### MISMATCHES
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6430 if (defined $mismatches){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6431 if ($bowtie2){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6432 if ($mismatches == 0 or $mismatches == 1){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6433 push @bowtie_options,"-N $mismatches";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6434 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6435 else{
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6436 die "Please set the number of multiseed mismatches for Bowtie 2 with '-N <int>' (where <int> can be 0 or 1)\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6437 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6438 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6439 else{
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6440 if ($mismatches >= 0 and $mismatches <= 3){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6441 push @bowtie_options,"-n $mismatches";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6442 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6443 else{
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6444 die "Please set the number of seed mismatches for Bowtie 1 with '-n <int>' (where <int> can be 0,1,2 or 3)\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6445 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6446 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6447 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6448 else{
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6449 unless ($bowtie2){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6450 push @bowtie_options,"-n 1"; # setting -n to 1 by default (for use with Bowtie only) because it is much quicker than the default mode of -n 2
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6451 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6452 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6453
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6454 ### SEED LENGTH
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6455 if (defined $seed_length){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6456 if ($bowtie2){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6457 push @bowtie_options,"-L $seed_length";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6458 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6459 else{
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6460 push @bowtie_options,"-l $seed_length";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6461 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6462 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6463
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6464 ### MISMATCH CEILING
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6465 if (defined $ceiling){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6466 die "The option '-e' is not compatible with Bowtie 2. Please respecify options\n" if ($bowtie2);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6467 push @bowtie_options,"-e $ceiling";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6468 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6469
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6470
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6471 ### BOWTIE 2 EFFORT OPTIONS
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6472
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6473 ### CONSECUTIVE SEED EXTENSION FAILS
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6474 if (defined $seed_extension_fails){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6475 die "The option '-D <int>' is only available when using Bowtie 2\n\n" unless ($bowtie2);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6476 push @bowtie_options,"-D $seed_extension_fails";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6477 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6478
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6479 ### RE-SEEDING REPETITIVE SEEDS
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6480 if (defined $reseed_repetitive_seeds){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6481 die "The option '-R <int>' is only available when using Bowtie 2\n\n" unless ($bowtie2);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6482 push @bowtie_options,"-R $reseed_repetitive_seeds";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6483 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6484
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6485
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6486 ### BOWTIE 2 SCORING OPTIONS
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6487 if ($score_min){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6488 die "The option '--score_min <func>' is only available when using Bowtie 2\n\n" unless ($bowtie2);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6489 unless ($score_min =~ /^L,.+,.+$/){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6490 die "The option '--score_min <func>' needs to be in the format <L,value,value> . Please consult \"setting up functions\" in the Bowtie 2 manual for further information\n\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6491 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6492 push @bowtie_options,"--score-min $score_min";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6493 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6494 else{
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6495 if ($bowtie2){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6496 push @bowtie_options,"--score-min L,0,-0.2"; # default setting, more stringent than normal Bowtie2
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6497 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6498 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6499
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6500 ### BOWTIE 2 READ GAP OPTIONS
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6501 my ($insertion_open,$insertion_extend,$deletion_open,$deletion_extend);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6502
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6503 if ($rdg){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6504 die "The option '--rdg <int1>,<int2>' is only available when using Bowtie 2\n\n" unless ($bowtie2);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6505 if ($rdg =~ /^(\d+),(\d+)$/){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6506 $deletion_open = $1;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6507 $deletion_extend = $2;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6508 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6509 else{
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6510 die "The option '--rdg <int1>,<int2>' needs to be in the format <integer,integer> . Please consult \"setting up functions\" in the Bowtie 2 manual for further information\n\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6511 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6512 push @bowtie_options,"--rdg $rdg";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6513 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6514 else{
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6515 $deletion_open = 5;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6516 $deletion_extend = 3;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6517 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6518
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6519 ### BOWTIE 2 REFERENCE GAP OPTIONS
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6520 if ($rfg){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6521 die "The option '--rfg <int1>,<int2>' is only available when using Bowtie 2\n\n" unless ($bowtie2);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6522 if ($rfg =~ /^(\d+),(\d+)$/){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6523 $insertion_open = $1;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6524 $insertion_extend = $2;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6525 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6526 else{
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6527 die "The option '--rfg <int1>,<int2>' needs to be in the format <integer,integer> . Please consult \"setting up functions\" in the Bowtie 2 manual for further information\n\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6528 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6529 push @bowtie_options,"--rfg $rfg";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6530 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6531 else{
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6532 $insertion_open = 5;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6533 $insertion_extend = 3;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6534 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6535
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6536
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6537 ### BOWTIE 2 PARALLELIZATION OPTIONS
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6538 if (defined $parallel){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6539 die "The parallelization switch '-p' only works for Bowtie 2. Please respecify!" unless ($bowtie2);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6540 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6541 if ($bowtie2){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6542 if ($parallel){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6543 die "Please select a value for -p of 2 or more!\n" unless ($parallel > 1);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6544 push @bowtie_options,"-p $parallel";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6545 push @bowtie_options,'--reorder'; ## re-orders the bowtie 2 output so that it does match the input files. This is abolutely required for parallelization to work.
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6546 print "Each Bowtie 2 instance is going to be run with $parallel threads. Please monitor performance closely and tune down if needed!\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6547 sleep (2);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6548 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6549 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6550
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6551 ### REPORTING OPTIONS
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6552
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6553 if ($bowtie2){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6554 push @bowtie_options,'--ignore-quals'; ## All mismatches will receive penalty for mismatches as if they were of high quality, which is 6 by default
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6555
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6556 ### Option -M is deprecated since Bowtie 2 version 2.0.0 beta7. I'll leave this option commented out for a while
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6557 if(defined $most_valid_alignments){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6558
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6559 warn "\nThe option -M is now deprecated (as of Bowtie 2 version 2.0.0 beta7). What used to be called -M mode is still the default mode. Use the -D and -R options to adjust the effort expended to find valid alignments.\n\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6560 # push @bowtie_options,"-M $most_valid_alignments";sleep (5);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6561 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6562 # else{
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6563 # push @bowtie_options,'-M 10'; # the default behavior for Bowtie 2 is to report (and sort) up to 500 alignments for a given sequence
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6564 # }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6565 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6566 else{ # Because of the way Bismark works we will always use the reporting option -k 2 (report up to 2 valid alignments) for Bowtie 1
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6567 push @bowtie_options,'-k 2';
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6568 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6569
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6570 ### --BEST
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6571 if ($bowtie2){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6572 if ($best){ # Bowtie 2 does away with the concept of --best, so one can also not select --no-best when Bowtie 2 is to be used
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6573 die "The option '--no-best' is not compatible with Bowtie 2. Please respecify options\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6574 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6575 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6576 else{
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6577 # --best is the default option for Bowtie 1, specifying --no-best can turn it off (e.g. to speed up alignment process)
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6578 unless ($best){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6579 push @bowtie_options,'--best';
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6580 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6581 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6582
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6583 ### VANILLA BISMARK (BOWTIE 1) OUTPUT
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6584 if ($vanilla){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6585 if ($bowtie2){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6586 die "The options --bowtie2 and the --vanilla are not compatible. Please respecify!\n\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6587 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6588 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6589 else{
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6590 $vanilla = 0;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6591 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6592
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6593 ### PAIRED-END MAPPING
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6594 if ($mates1){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6595 my @mates1 = (split (/,/,$mates1));
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6596 die "Paired-end mapping requires the format: -1 <mates1> -2 <mates2>, please respecify!\n" unless ($mates2);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6597 my @mates2 = (split(/,/,$mates2));
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6598 unless (scalar @mates1 == scalar @mates2){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6599 die "Paired-end mapping requires the same amounnt of mate1 and mate2 files, please respecify! (format: -1 <mates1> -2 <mates2>)\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6600 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6601 while (1){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6602 my $mate1 = shift @mates1;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6603 my $mate2 = shift @mates2;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6604 last unless ($mate1 and $mate2);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6605 push @filenames,"$mate1,$mate2";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6606 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6607 if ($bowtie2){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6608 push @bowtie_options,'--no-mixed'; ## By default Bowtie 2 is not looking for single-end alignments if it can't find concordant or discordant alignments
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6609 push @bowtie_options,'--no-discordant';## By default Bowtie 2 is not looking for discordant alignments if it can't find concordant ones
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6610 }
3
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
6611
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
6612 if ($old_flag){
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
6613 warn "\nUsing FLAG values for paired-end SAM output used up to Bismark v0.8.2. In addition, paired-end sequences will have /1 and /2 appended to their read IDs\n\n" unless($vanilla);
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
6614 sleep(3);
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
6615 }
0
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6616 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6617 elsif ($mates2){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6618 die "Paired-end mapping requires the format: -1 <mates1> -2 <mates2>, please respecify!\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6619 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6620
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6621 ### SINGLE-END MAPPING
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6622 # Single-end mapping will be performed if no mate pairs for paired-end mapping have been specified
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6623 my $singles;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6624 unless ($mates1 and $mates2){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6625 $singles = join (',',@ARGV);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6626 unless ($singles){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6627 die "\nNo filename supplied! Please specify one or more files for single-end Bismark mapping!\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6628 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6629 $singles =~ s/\s/,/g;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6630 @filenames = (split(/,/,$singles));
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6631 warn "\nFiles to be analysed:\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6632 warn "@filenames\n\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6633 sleep (3);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6634 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6635
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6636 ### MININUM INSERT SIZE (PAIRED-END ONLY)
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6637 if (defined $minins){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6638 die "-I/--minins can only be used for paired-end mapping!\n\n" if ($singles);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6639 push @bowtie_options,"--minins $minins";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6640 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6641
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6642 ### MAXIMUM INSERT SIZE (PAIRED-END ONLY)
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6643 if (defined $maxins){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6644 die "-X/--maxins can only be used for paired-end mapping!\n\n" if ($singles);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6645 push @bowtie_options,"--maxins $maxins";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6646 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6647 else{
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6648 unless ($singles){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6649 push @bowtie_options,'--maxins 500';
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6650 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6651 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6652
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6653 ### QUIET prints nothing besides alignments (suppresses warnings)
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6654 if ($quiet){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6655 push @bowtie_options,'--quiet';
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6656 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6657
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6658 ### CHUNKMBS needed to be increased to avoid memory exhaustion warnings for Bowtie 1, particularly for --best (and paired-end) alignments
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6659 unless ($bowtie2){ # Bowtie 2 does not have a chunkmbs option
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6660 if (defined $chunk){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6661 push @bowtie_options,"--chunkmbs $chunk";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6662 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6663 else{
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6664 push @bowtie_options,'--chunkmbs 512'; ## setting the default to 512MB (up from 64 default)
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6665 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6666 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6667
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6668
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6669 ### SUMMARY OF ALL BOWTIE OPTIONS
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6670 my $bowtie_options = join (' ',@bowtie_options);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6671
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6672
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6673 ### STRAND-SPECIFIC LIBRARIES
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6674 my $directional;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6675 if ($non_directional){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6676 die "A library can only be specified to be either non-directional or a PBAT-Seq library. Please respecify!\n\n" if ($pbat);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6677 warn "Library was specified to be not strand-specific (non-directional), therefore alignments to all four possible bisulfite strands (OT, CTOT, OB and CTOB) will be reported\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6678 sleep (3);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6679 $directional = 0;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6680 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6681 elsif($pbat){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6682 die "The option --pbat is currently not compatible with --gzip. Please run alignments with uncompressed temporary files, i.e. lose the option --gzip\n" if ($gzip);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6683 die "The option --pbat is currently not working for Bowtie 2. Please run alignments in default (i.e. Bowtie 1) mode!\n" if ($bowtie2);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6684 die "The option --pbat is currently only working with FastQ files. Please respecify (i.e. lose the option -f)!\n" if ($fasta);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6685
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6686 warn "Library was specified as PBAT-Seq (Post-Bisulfite Adapter Tagging), only performing alignments to the complementary strands (CTOT and CTOB)\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6687 sleep (3);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6688 $directional = 0;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6689 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6690 else{
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6691 warn "Library is assumed to be strand-specific (directional), alignments to strands complementary to the original top or bottom strands will be ignored (i.e. not performed!)\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6692 sleep (3);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6693 $directional = 1; # default behaviour
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6694 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6695
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6696 ### UNMAPPED SEQUENCE OUTPUT
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6697 $unmapped = 0 unless ($unmapped);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6698
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6699 ### AMBIGUOUS ALIGNMENT SEQUENCE OUTPUT
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6700 $multi_map = 0 unless ($multi_map);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6701
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6702
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6703 ### OUTPUT DIRECTORY
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6704
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6705 chdir $parent_dir or die "Failed to move back to current working directory\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6706 if ($output_dir){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6707 unless ($output_dir =~ /\/$/){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6708 $output_dir =~ s/$/\//;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6709 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6710
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6711 if (chdir $output_dir){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6712 $output_dir = getcwd; # making the path absolute
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6713 unless ($output_dir =~ /\/$/){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6714 $output_dir =~ s/$/\//;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6715 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6716 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6717 else{
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6718 mkdir $output_dir or die "Unable to create directory $output_dir $!\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6719 warn "Created output directory $output_dir!\n\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6720 chdir $output_dir or die "Failed to move to $output_dir\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6721 $output_dir = getcwd; # making the path absolute
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6722 unless ($output_dir =~ /\/$/){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6723 $output_dir =~ s/$/\//;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6724 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6725 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6726 warn "Output will be written into the directory: $output_dir\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6727 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6728 else{
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6729 $output_dir = '';
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6730 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6731
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6732 ### TEMPORARY DIRECTORY for C->T and G->A transcribed files
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6733
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6734 chdir $parent_dir or die "Failed to move back to current working directory\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6735 if ($temp_dir){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6736 warn "\nUsing temp directory: $temp_dir\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6737 unless ($temp_dir =~ /\/$/){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6738 $temp_dir =~ s/$/\//;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6739 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6740
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6741 if (chdir $temp_dir){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6742 $temp_dir = getcwd; # making the path absolute
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6743 unless ($temp_dir =~ /\/$/){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6744 $temp_dir =~ s/$/\//;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6745 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6746 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6747 else{
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6748 mkdir $temp_dir or die "Unable to create directory $temp_dir $!\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6749 warn "Created temporary directory $temp_dir!\n\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6750 chdir $temp_dir or die "Failed to move to $temp_dir\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6751 $temp_dir = getcwd; # making the path absolute
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6752 unless ($temp_dir =~ /\/$/){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6753 $temp_dir =~ s/$/\//;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6754 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6755 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6756 warn "Temporary files will be written into the directory: $temp_dir\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6757 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6758 else{
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6759 $temp_dir = '';
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6760 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6761
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6762 ### OPTIONAL NON-BS MISMATCH OUTPUT AS EXTRA COLUMN IN SAM FILE
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6763 if ($non_bs_mm){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6764 if ($vanilla){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6765 die "Option '--non_bs_mm' may only be specified for output in SAM format. Please respecify!\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6766 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6767 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6768
3
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
6769 ### PREFIX FOR OUTPUT FILES
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
6770 if ($prefix){
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
6771 # removing trailing dots
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
6772
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
6773 $prefix =~ s/\.+$//;
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
6774
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
6775 warn "Using the following prefix for output files: $prefix\n\n";
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
6776 sleep(1);
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
6777 }
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
6778
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
6779
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
6780 return ($genome_folder,$CT_index_basename,$GA_index_basename,$path_to_bowtie,$sequence_format,$bowtie_options,$directional,$unmapped,$multi_map,$phred64,$solexa,$output_dir,$bowtie2,$vanilla,$sam_no_hd,$skip,$qupto,$temp_dir,$non_bs_mm,$insertion_open,$insertion_extend,$deletion_open,$deletion_extend,$gzip,$bam,$samtools_path,$pbat,$prefix,$old_flag);
0
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6781 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6782
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6783
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6784
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
### Writes the SAM header section to the OUT filehandle:
###   - one @HD line (format version 1.0, sort order 'unsorted'),
###   - one @SQ line per entry in %chromosomes, where SN is the hash key and LN is
###     the string length of the value stored under that key,
###   - one @PG line recording the Bismark version and the command line used.
### Takes no arguments and reads %chromosomes, $bismark_version and $command_line
### from the enclosing file scope.
sub generate_SAM_header{
  print OUT "\@HD\tVN:1.0\tSO:unsorted\n";          # @HD = header, VN = version, SO = sort order

  ### Perl does not guarantee any particular order for 'keys', and the order may differ
  ### from one run to the next. Sorting the names keeps the @SQ lines (and thus the whole
  ### header) reproducible for identical input.
  foreach my $chr (sort keys %chromosomes){
    my $length = length ($chromosomes{$chr});
    print OUT "\@SQ\tSN:$chr\tLN:$length\n";        # @SQ = sequence, SN = seq name, LN = length
  }

  # @PG = program, ID = unique identifier, VN = program version, CL = command line
  print OUT "\@PG\tID:Bismark\tVN:$bismark_version\tCL:\"bismark $command_line\"\n";
}
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6793
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6794 ### I would like to thank the following individuals for their valuable contributions to the Bismark SAM output format:
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6795 ### O. Tam (Sep 2010), C. Whelan (2011), E. Vidal (2011), T. McBryan (2011), P. Hickey (2011)
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6796
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
sub single_end_SAM_output {
  ### Builds one SAM record for a single-end alignment and prints it to the OUT filehandle.
  ###
  ### Arguments:
  ###   $id                       read identifier; also the key into $methylation_call_params
  ###   $actual_seq               sequence of the read
  ###   $methylation_call_params  hash reference holding the alignment details for each read ID
  ###   $qual                     quality string of the read
  ###
  ### Uses settings declared outside of this subroutine: $bowtie2, $non_bs_mm, $insertion_open,
  ### $insertion_extend, $deletion_open and $deletion_extend.
  ###
  ### Dies if the strand / conversion combination is not one of the four expected ones, or (when
  ### non-bisulfite mismatches are requested for Bowtie 2) if the CIGAR string cannot be interpreted.
  my ($id, $actual_seq, $methylation_call_params, $qual) = @_;

  my $strand            = $methylation_call_params->{$id}{alignment_strand};
  my $chr               = $methylation_call_params->{$id}{chromosome};
  my $start             = $methylation_call_params->{$id}{position};
  my $stop              = $methylation_call_params->{$id}{end_position};   # fetched, but not used further down
  my $ref_seq           = $methylation_call_params->{$id}{unmodified_genomic_sequence};
  my $methcall          = $methylation_call_params->{$id}{methylation_call};
  my $read_conversion   = $methylation_call_params->{$id}{read_conversion};
  my $genome_conversion = $methylation_call_params->{$id}{genome_conversion};

  # In Bowtie 2 mode this starts out as the alignment score, otherwise as the stored mismatch count
  my $number_of_mismatches = $bowtie2
    ? $methylation_call_params->{$id}{alignment_score}
    : $methylation_call_params->{$id}{number_of_mismatches};

  ##### FLAG
  # Only bit 0x10 (value 16, "SEQ being reverse complemented") of the SAM FLAG field is ever set here.
  # Lookup is by alignment strand first, then by "read conversion/genome conversion":
  #   strand +, CT/CT  =>  0   (OT)
  #   strand +, GA/GA  => 16   (CTOB, yields information for the original bottom strand)
  #   strand -, CT/GA  => 16   (OB)
  #   strand -, GA/CT  =>  0   (CTOT, yields information for the original top strand)
  my %flag_for = (
    '+' => { 'CT/CT' => 0,  'GA/GA' => 16 },
    '-' => { 'CT/GA' => 16, 'GA/CT' => 0  },
  );

  die "Unexpected strand information: $strand\n\n" unless exists $flag_for{$strand};

  my $conversion = "$read_conversion/$genome_conversion";
  die "Unexpected strand and read/genome conversion: strand: $strand, read conversion: $read_conversion, genome_conversion: $genome_conversion\n\n"
    unless exists $flag_for{$strand}{$conversion};

  my $flag = $flag_for{$strand}{$conversion};

  ##### MAPQ
  my $mapq = 255;   # mapping quality is treated as unavailable

  ##### CIGAR
  # Bowtie 2 supplies its own CIGAR string; for Bowtie 1 the whole read is written as one run of 'M'
  my $cigar = $bowtie2
    ? $methylation_call_params->{$id}{CIGAR}
    : length($actual_seq) . "M";

  ##### RNEXT, PNEXT and TLEN are paired-end fields and get fixed placeholder values
  my ($rnext, $pnext, $tlen) = ("*", 0, 0);

  ##### Trimming the genomic sequence by 2 characters
  # CT-converted reads lose the characters at the end of the string, all other reads lose them at the start
  my $trim_offset = ($read_conversion eq 'CT') ? 0 : 2;
  $ref_seq = substr($ref_seq, $trim_offset, length($ref_seq) - 2);

  ##### Reads on the '-' strand are reported in forward genomic orientation
  my $on_reverse_strand = ($strand eq '-');
  if ($on_reverse_strand) {
    $actual_seq = revcomp($actual_seq);
    $ref_seq    = revcomp($ref_seq);     # needed so that read and reference can be compared
    $qual       = reverse $qual;         # qualities have to follow the new read orientation
  }

  ##### NM tag: result of hemming_dist() for read vs. reference, plus the stored indel count for Bowtie 2
  my $edit_distance = hemming_dist($actual_seq, $ref_seq);
  $edit_distance += $methylation_call_params->{$id}{indels} if $bowtie2;
  my $NM_tag = "NM:i:$edit_distance";

  ##### XX tag: whatever make_mismatch_string() returns for read vs. reference
  my $XX_tag = make_mismatch_string($actual_seq, $ref_seq);

  ##### XM tag: methylation call string, reversed for '-' strand reads to match the re-oriented read
  my $XM_tag = 'XM:Z:' . ($on_reverse_strand ? scalar reverse($methcall) : $methcall);

  ##### XR / XG tags: read conversion and genome conversion
  my $XR_tag = "XR:Z:$read_conversion";
  my $XG_tag = "XG:Z:$genome_conversion";

  ##### Optional conversion of the Bowtie 2 alignment score into a number of mismatches
  if ($non_bs_mm and $bowtie2) {

    $number_of_mismatches =~ s/-//;   # drops the minus sign of the alignment score

    # Only CIGAR strings with insertions or deletions need the gap penalties taken out again
    if ($cigar =~ /(D|I)/) {

      my @len = split(/\D+/, $cigar);   # length of each operation
      my @ops = split(/\d+/, $cigar);   # the operations themselves
      shift @ops;                       # the first element is empty as the CIGAR string starts with a number
      die "CIGAR string contained a non-matching number of lengths and operations\n" unless (scalar @len == scalar @ops);

      for my $i (0 .. $#len) {
        my $op = $ops[$i];

        next if $op eq 'M';             # matches/mismatches carry no gap penalty

        if ($op eq 'I') {               # insertion: gap open plus length x gap extend
          $number_of_mismatches -= $insertion_open;
          $number_of_mismatches -= $len[$i] * $insertion_extend;
        }
        elsif ($op eq 'D') {            # deletion: gap open plus length x gap extend
          $number_of_mismatches -= $deletion_open;
          $number_of_mismatches -= $len[$i] * $deletion_extend;
        }
        elsif ($cigar =~ tr/[NSHPX=]//) {
          die "The CIGAR string contained illegal CIGAR operations in addition to 'M', 'I' and 'D': $cigar\n";
        }
        else {
          die "The CIGAR string contained undefined CIGAR operations in addition to 'M', 'I' and 'D': $cigar\n";
        }
      }
    }

    # The score is now free of gap penalties. It is divided by 6 to obtain the number of mismatches and the
    # remainder is counted as Ns (this relies on a mismatch costing 6 and an N costing 1, and - as already
    # noted by the original author - may give wrong results for reads containing more than 5 Ns)
    my $seq_N_count = $number_of_mismatches % 6;
    $number_of_mismatches = int($number_of_mismatches / 6) + $seq_N_count;
  }

  ##### XA tag: only written to the output if $non_bs_mm is set
  my $XA_tag = "XA:Z:$number_of_mismatches";

  ##### Output
  # SAM format: QNAME, FLAG, RNAME, 1-based POS, MAPQ, CIGAR, RNEXT, PNEXT, TLEN, SEQ, QUAL, optional fields
  my @fields = ($id, $flag, $chr, $start, $mapq, $cigar, $rnext, $pnext, $tlen, $actual_seq, $qual,
                $NM_tag, $XX_tag, $XM_tag, $XR_tag, $XG_tag);
  push @fields, $XA_tag if $non_bs_mm;

  print OUT join("\t", @fields), "\n";
}
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7004
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7005 sub paired_end_SAM_output{
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7006 my ($id,$actual_seq_1,$actual_seq_2,$methylation_call_params,$qual_1,$qual_2) = @_;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7007 my $strand_1 = $methylation_call_params->{$id}->{alignment_read_1}; # Bowtie 1 only reports the read 1 alignment strand
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7008 my $strand_2 = $methylation_call_params->{$id}->{alignment_read_2};
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7009 my $chr = $methylation_call_params->{$id}->{chromosome};
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7010 my $ref_seq_1 = $methylation_call_params->{$id}->{unmodified_genomic_sequence_1};
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7011 my $ref_seq_2 = $methylation_call_params->{$id}->{unmodified_genomic_sequence_2};
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7012 my $methcall_1 = $methylation_call_params->{$id}->{methylation_call_1};
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7013 my $methcall_2 = $methylation_call_params->{$id}->{methylation_call_2};
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7014 my $read_conversion_1 = $methylation_call_params->{$id}->{read_conversion_1};
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7015 my $read_conversion_2 = $methylation_call_params->{$id}->{read_conversion_2};
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7016 my $genome_conversion = $methylation_call_params->{$id}->{genome_conversion};
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7017
3
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
7018 my $id_1;
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
7019 my $id_2;
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
7020
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
7021 if ($old_flag){
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
7022 $id_1 = $id.'/1';
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
7023 $id_2 = $id.'/2';
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
7024 }
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
7025 else{
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
7026 $id_1 = $id; # appending /1 or /2 confuses some downstream programs such as Picard
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
7027 $id_2 = $id;
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
7028 }
0
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7029
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7030 # Allows all degenerate nucleotide sequences in reference genome
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7031 die "Reference sequence ($ref_seq_1) contains invalid nucleotides!\n" if $ref_seq_1 =~ /[^ACTGNRYMKSWBDHV]/i;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7032 die "Reference sequence ($ref_seq_2) contains invalid nucleotides!\n" if $ref_seq_2 =~ /[^ACTGNRYMKSWBDHV]/i;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7033
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7034 my $index; # used to store the srand origin of the alignment in a less convoluted way
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7035
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7036 if ($read_conversion_1 eq 'CT' and $genome_conversion eq 'CT'){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7037 $index = 0; ## this is OT (original top strand)
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7038 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7039 elsif ($read_conversion_1 eq 'GA' and $genome_conversion eq 'GA'){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7040 $index = 1; ## this is CTOB (complementary to OB)
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7041 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7042 elsif ($read_conversion_1 eq 'GA' and $genome_conversion eq 'CT'){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7043 $index = 2; ## this is CTOT (complementary to OT)
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7044 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7045 elsif ($read_conversion_1 eq 'CT' and $genome_conversion eq 'GA'){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7046 $index = 3; ## this is OB (original bottom)
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7047 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7048 else {
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7049 die "Unexpected combination of read 1 and genome conversion: $read_conversion_1 / $genome_conversion\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7050 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7051
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7052 my $number_of_mismatches_1;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7053 my $number_of_mismatches_2;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7054
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7055 if ($bowtie2){ # Bowtie 2 reports always as read 1 then read 2, so this is fine
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7056 $number_of_mismatches_1 = $methylation_call_params->{$id}->{alignment_score_1}; # only needed for custom allele-specific output, not the default!
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7057 $number_of_mismatches_2 = $methylation_call_params->{$id}->{alignment_score_2};
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7058 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7059 else{ # Bowtie 1 reports always the leftmost read first. That means we have to reverse the strings if the first read aligned in reverse orientation
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7060 if ($index == 2 or $index == 3){ # CTOT or OB
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7061 $number_of_mismatches_1 = $methylation_call_params->{$id}->{number_of_mismatches_2}; # only needed for custom allele-specific output, not the default!
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7062 $number_of_mismatches_2 = $methylation_call_params->{$id}->{number_of_mismatches_1};
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7063 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7064 else{ # if the first read aligned in forward direction it is like for Bowtie 2
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7065 $number_of_mismatches_1 = $methylation_call_params->{$id}->{number_of_mismatches_1}; # only needed for custom allele-specific output, not the default!
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7066 $number_of_mismatches_2 = $methylation_call_params->{$id}->{number_of_mismatches_2};
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7067 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7068 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7069
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7070
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7071
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7072 ### we need to remove 2 bp of the genomic sequence as we were extracting read + 2bp long fragments to make a methylation call at the
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7073 ### first or last position.
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7074
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7075 if ($index == 0 or $index == 3){ # OT or OB
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7076 $ref_seq_1 = substr($ref_seq_1,0,length($ref_seq_1)-2);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7077 $ref_seq_2 = substr($ref_seq_2,2,length($ref_seq_2)-2);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7078 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7079 else{ # CTOT or CTOB
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7080 $ref_seq_1 = substr($ref_seq_1,2,length($ref_seq_1)-2);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7081 $ref_seq_2 = substr($ref_seq_2,0,length($ref_seq_2)-2);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7082 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7083
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7084 #####
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7085
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7086 my $start_read_1;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7087 my $start_read_2;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7088 # adjusting end positions
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7089
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7090 if ($bowtie2){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7091 $start_read_1 = $methylation_call_params->{$id}->{position_1};
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7092 $start_read_2 = $methylation_call_params->{$id}->{position_2};
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7093 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7094 else{ # Bowtie 1 output. $strand_1 stores the alignment of Read 1
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7095 if ($strand_1 eq '+'){ # Read 1 aligns to the + strand
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7096 $start_read_1 = $methylation_call_params->{$id}->{start_seq_1};
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7097 $start_read_2 = $methylation_call_params->{$id}->{alignment_end} - length ($actual_seq_2) + 1;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7098 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7099 else{ # read 1 is on the - strand
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7100 $start_read_1 = $methylation_call_params->{$id}->{alignment_end} - length ($actual_seq_1) + 1;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7101 $start_read_2 = $methylation_call_params->{$id}->{start_seq_1};
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7102 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7103 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7104
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7105 #####
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7106
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7107 my $end_read_1;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7108 my $end_read_2;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7109 # adjusting end positions
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7110
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7111 if ($bowtie2){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7112 $end_read_1 = $methylation_call_params->{$id}->{end_position_1};
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7113 $end_read_2 = $methylation_call_params->{$id}->{end_position_2};
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7114 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7115 else{ # Bowtie 1 output. $strand_1 stores the alignment of Read 1
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7116 if ($strand_1 eq '+'){ # Read 1 aligns to the + strand
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7117 $end_read_1 = $methylation_call_params->{$id}->{start_seq_1} + length ($actual_seq_1)-1;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7118 $end_read_2 = $methylation_call_params->{$id}->{alignment_end};
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7119 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7120 else{
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7121 $end_read_1 = $methylation_call_params->{$id}->{alignment_end};
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7122 $end_read_2 = $methylation_call_params->{$id}->{start_seq_1} + length ($actual_seq_2)-1;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7123 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7124 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7125
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7126 #####
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7127
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7128 ### This is a description of the bitwise FLAG field which needs to be set for the SAM file taken from: "The SAM Format Specification (v1.4-r985), September 7, 2011"
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7129 ## FLAG: bitwise FLAG. Each bit is explained in the following table:
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7130 ## Bit Description Comment Value
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7131 ## 0x1 template having multiple segments in sequencing 0: single-end 1: paired end value: 2^^0 ( 1)
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7132 ## 0x2 each segment properly aligned according to the aligner true only for paired-end alignments value: 2^^1 ( 2)
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7133 ## 0x4 segment unmapped --- ---
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7134 ## 0x8 next segment in the template unmapped --- ---
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7135 ## 0x10 SEQ being reverse complemented - strand alignment value: 2^^4 ( 16)
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7136 ## 0x20 SEQ of the next segment in the template being reversed + strand alignment value: 2^^5 ( 32)
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7137 ## 0x40 the first segment in the template read 1 value: 2^^6 ( 64)
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7138 ## 0x80 the last segment in the template read 2 value: 2^^7 (128)
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7139 ## 0x100 secondary alignment --- ---
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7140 ## 0x200 not passing quality controls --- ---
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7141 ## 0x400 PCR or optical duplicate --- ---
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7142
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7143 ### As the FLAG values do not consider that there might be 4 different bisulfite strands of DNA, we are trying to make FLAG tags which take the strand identity into account
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7144
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7145 # strands OT and CTOT will be treated as aligning to the top strand (both sequences are scored as aligning to the top strand)
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7146 # strands OB and CTOB will be treated as aligning to the bottom strand (both sequences are scored as reverse complemented sequences)
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7147
3
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
7148 my $flag_1; # FLAG variable used for SAM format
0
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7149 my $flag_2;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7150
3
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
7151 ### The new default FLAG values have been suggested by Peter Hickey, Australia (PH)
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
7152
0
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7153 if ($index == 0){ # OT
3
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
7154 unless ($old_flag){
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
7155 $flag_1 = 99; # PH: Read 1 is on the + strand and Read 2 is reversed (1+2+32+64)
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
7156 $flag_2 = 147; # PH: Read 2 is on - strand but informative for the OT (1+2+16+128)
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
7157 }
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
7158 else{
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
7159 $flag_1 = 67; # Read 1 is on the + strand (1+2+64) (Read 2 is technically reverse-complemented, but we do not score it)
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
7160 $flag_2 = 131; # Read 2 is on - strand but informative for the OT (1+2+128)
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
7161 }
0
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7162 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7163 elsif ($index == 1){ # CTOB
3
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
7164 unless($old_flag){
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
7165 $flag_1 = 83; # PH: Read 1 is on the - strand, mapped in proper pair and Read 1 is reversed (1+2+16+64)
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
7166 $flag_2 = 163; # PH: read 2 is on the - strand, mapped in proper pair and Read 1 is reversed (1+2+32+128)
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
7167 }
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
7168 else{
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
7169 $flag_1 = 115; # Read 1 is on the + strand, we score for OB (1+2+16+32+64)
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
7170 $flag_2 = 179; # Read 2 is on the - strand (1+2+16+32+128)
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
7171 }
0
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7172 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7173 elsif ($index == 2){ # CTOT
3
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
7174 unless ($old_flag){
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
7175 $flag_1 = 99; # PH: Read 1 is on the + strand and Read 2 is reversed (1+2+32+64)
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
7176 $flag_2 = 147; # PH: Read 2 is on - strand but informative for the OT (1+2+16+128)
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
7177 }
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
7178 else{
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
7179 $flag_1 = 67; # Read 1 is on the - strand (CTOT) strand, but we score it for OT (1+2+64)
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
7180 $flag_2 = 131; # Read 2 is on the + strand, score it for OT (1+2+128)
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
7181 }
0
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7182 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7183 elsif ($index == 3){ # OB
3
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
7184 unless ($old_flag){
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
7185 $flag_1 = 83; # PH: Read 1 is on the - strand, mapped in proper pair and Read 1 is reversed (1+2+16+64)
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
7186 $flag_2 = 163; # PH: read 2 is on the - strand, mapped in proper pair and Read 1 is reversed (1+2+32+128)
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
7187 }
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
7188 else{
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
7189 $flag_1 = 115; # Read 1 is on the - strand, we score for OB (1+2+16+32+64)
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
7190 $flag_2 = 179; # Read 2 is on the + strand (1+2+16+32+128)
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
7191 }
0
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7192 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7193
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7194 #####
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7195
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7196 my $mapq = 255; # Mapping quality is unavailable
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7197
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7198 #####
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7199
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7200 my $cigar_1;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7201 my $cigar_2;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7202
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7203 if ($bowtie2){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7204 $cigar_1 = $methylation_call_params->{$id}->{CIGAR_1}; # Actual CIGAR string reported by Bowtie 2
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7205 $cigar_2 = $methylation_call_params->{$id}->{CIGAR_2};
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7206 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7207 else{
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7208 $cigar_1 = length($actual_seq_1) . "M"; # Assume no indels for Bowtie 1 mapping (only matches and mismatches)
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7209 $cigar_2 = length($actual_seq_2) . "M";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7210 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7211
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7212 #####
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7213
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7214 my $rnext = '='; # Chromosome of mate; applies to both reads
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7215
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7216 #####
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7217
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7218 my $pnext_1 = $start_read_2; # Leftmost position of mate
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7219 my $pnext_2 = $start_read_1;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7220
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7221 #####
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7222
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7223 my $tlen_1; # signed observed Template LENgth (or inferred fragment size)
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7224 my $tlen_2;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7225
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7226 if ($bowtie2){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7227
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7228 if ($start_read_1 <= $start_read_2){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7229
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7230 # Read 1 alignment is leftmost
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7231
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7232 if ($end_read_2 >= $end_read_1){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7233
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7234 # -------------------------> read 1 reads overlapping
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7235 # <------------------------- read 2
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7236 #
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7237 # or
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7238 #
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7239 # -------------------------> read 1
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7240 # <----------------------- read 2 read 2 contained within read 1
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7241 #
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7242 # or
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7243 #
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7244 # -------------------------> read 1 reads 1 and 2 exactly overlapping
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7245 # <------------------------- read 2
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7246 #
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7247
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7248 # dovetailing of reads is not enabled for Bowtie 2 alignments
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7249
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7250 $tlen_1 = $end_read_2 - $start_read_1 + 1; # Leftmost read has a + sign,
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7251 $tlen_2 = $start_read_1 - $end_read_2 - 1; # Rightmost read has a - sign
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7252 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7253 elsif ($end_read_2 < $end_read_1){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7254
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7255 # -------------------------> read 1
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7256 # <----------- read 2 read 2 contained within read 1
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7257 #
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7258 # or
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7259 #
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7260 # -------------------------> read 1
3
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
7261 # <------------------------ read 2 read 2 contained within read 1
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
7262
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
7263 # start and end of read 2 are fully contained within read 1, using the length of read 1 for the TLEN variable
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
7264 $tlen_1 = $end_read_1 - $start_read_1 + 1; # Set to length of read 1 Leftmost read has a + sign,
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
7265 $tlen_2 = ($end_read_1 - $start_read_1 + 1) * -1; # Set to length of read 1 Rightmost read has a - sign. well this is debatable. Changed this
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
7266 ### as a request by frozenlyse on SeqAnswers on 24 July 2013
0
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7267 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7268
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7269 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7270
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7271 elsif ($start_read_2 < $start_read_1){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7272
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7273 if ($end_read_1 >= $end_read_2){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7274
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7275 # Read 2 alignment is leftmost
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7276
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7277 # -------------------------> read 2 reads overlapping
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7278 # <------------------------- read 1
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7279 #
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7280 # or
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7281 #
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7282 # -------------------------> read 2
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7283 # <----------------------- read 1 read 1 contained within read 2
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7284 #
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7285 #
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7286
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7287 $tlen_2 = $end_read_1 - $start_read_2 + 1; # Leftmost read has a + sign,
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7288 $tlen_1 = $start_read_2 - $end_read_1 - 1; # Rightmost read has a - sign
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7289 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7290 elsif ($end_read_1 < $end_read_2){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7291
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7292 # -------------------------> read 2
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7293 # <----------- read 1 read 1 contained within read 2
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7294 #
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7295 # or
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7296 #
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7297 # -------------------------> read 2
3
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
7298 # <------------------------ read 1 read 1 contained within read 2
0
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7299
3
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
7300 # start and end of read 1 are fully contained within read 2, using the length of read 2 for the TLEN variable
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
7301 $tlen_1 = ($end_read_2 - $start_read_2 + 1) * -1; # Set to length of read 2 Shorter read receives a - sign,
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
7302 $tlen_2 = $end_read_2 - $start_read_2 + 1; # Set to length of read 2 Longer read receives a +. Well this is debatable. Changed this
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
7303 ### as a request by frozenlyse on SeqAnswers on 24 July 2013
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
7304 }
0
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7305 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7306 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7307
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7308 else{ # Bowtie 1
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7309
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7310 if ($end_read_2 >= $end_read_1){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7311 # Read 1 alignment is leftmost
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7312 # -------------------------> read 1
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7313 # <------------------------- read 2
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7314 # this is the most extreme case for Bowtie 1 alignments, reads do not contain each other, also no dovetailing
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7315
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7316 $tlen_1 = $end_read_2 - $start_read_1 + 1; # Leftmost read has a + sign,
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7317 $tlen_2 = $start_read_1 - $end_read_2 - 1; # Rightmost read has a - sign
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7318 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7319 else{
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7320 # Read 2 alignment is leftmost
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7321 # -------------------------> read 2
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7322 # <------------------------- read 1
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7323 # this is the most extreme case for Bowtie 1 alignments, reads do not contain each other, also no dovetailing
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7324
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7325 $tlen_2 = $end_read_1 - $start_read_2 + 1; # Leftmost read has a + sign,
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7326 $tlen_1 = $start_read_2 - $end_read_1 - 1; # Rightmost read has a - sign
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7327 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7328 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7329
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7330 #####
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7331
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7332 # adjusting the strand of the sequence before we use them to generate mismatch strings
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7333 if ($strand_1 eq '-'){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7334 $actual_seq_1 = revcomp($actual_seq_1); # Sequence represented on the forward genomic strand
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7335 $ref_seq_1 = revcomp($ref_seq_1); # Required for comparison with actual sequence
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7336 $qual_1 = reverse $qual_1; # we need to reverse the quality string as well
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7337 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7338 if ($strand_2 eq '-'){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7339 $actual_seq_2 = revcomp($actual_seq_2); # Mate sequence represented on the forward genomic strand
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7340 $ref_seq_2 = revcomp($ref_seq_2); # Required for comparison with actual sequence
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7341 $qual_2 = reverse $qual_2; # If the sequence gets reverse complemented we reverse the quality string as well
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7342 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7343
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7344 # print "$actual_seq_1\n$ref_seq_1\n\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7345 # print "$actual_seq_2\n$ref_seq_2\n\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7346
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7347 #####
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7348
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7349 my $hemming_dist_1 = hemming_dist($actual_seq_1,$ref_seq_1); # Minimal number of one-nucleotide edits needed to transform the read string into the reference sequence
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7350 my $hemming_dist_2 = hemming_dist($actual_seq_2,$ref_seq_2);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7351 if ($bowtie2){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7352 $hemming_dist_1 += $methylation_call_params->{$id}->{indels_1}; # Adding the number of inserted/deleted bases which we parsed while getting the genomic sequence
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7353 $hemming_dist_2 += $methylation_call_params->{$id}->{indels_2}; # Adding the number of inserted/deleted bases which we parsed while getting the genomic sequence
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7354 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7355 my $NM_tag_1 = "NM:i:$hemming_dist_1"; # Optional tag NM: edit distance based on nucleotide differences
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7356 my $NM_tag_2 = "NM:i:$hemming_dist_2"; # Optional tag NM: edit distance based on nucleotide differences
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7357
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7358 #####
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7359
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7360 my $XX_tag_1 = make_mismatch_string($actual_seq_1,$ref_seq_1); # Optional tag XX: String providing mismatched reference bases in the alignment (NO indel information!)
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7361 my $XX_tag_2 = make_mismatch_string($actual_seq_2,$ref_seq_2);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7362
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7363 #####
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7364
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7365 my $XM_tag_1; # Optional tag XM: Methylation call string
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7366 my $XM_tag_2;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7367
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7368 if ($strand_1 eq '-'){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7369 $XM_tag_1 = 'XM:Z:'.reverse $methcall_1; # Needs to be reversed if the sequence was reverse complemented
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7370 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7371 else{
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7372 $XM_tag_1 = "XM:Z:$methcall_1";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7373 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7374
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7375 if ($strand_2 eq '-'){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7376 $XM_tag_2 = 'XM:Z:'.reverse $methcall_2; # Needs to be reversed if the sequence was reverse complemented
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7377 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7378 else{
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7379 $XM_tag_2 = "XM:Z:$methcall_2";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7380 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7381
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7382 #####
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7383
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7384 my $XR_tag_1 = "XR:Z:$read_conversion_1"; # Optional tag XR: Read 1 conversion state
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7385 my $XR_tag_2 = "XR:Z:$read_conversion_2"; # Optional tag XR: Read 2 conversion state
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7386
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7387 #####
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7388
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7389 my $XG_tag = "XG:Z:$genome_conversion"; # Optional tag XG: Genome Conversion state; valid for both reads
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7390
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7391 #####
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7392
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7393 # Optionally calculating number of mismatches for Bowtie 2 alignments
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7394
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7395 if ($non_bs_mm) {
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7396 if ($bowtie2) {
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7397
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7398 $number_of_mismatches_1 =~ s/-//; # removing the minus sign
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7399 $number_of_mismatches_2 =~ s/-//;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7400
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7401 ### if Bowtie 2 was used we need to check the CIGAR strings for indels in the reads in order to determine the number of mismatches
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7402
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7403 ### CIGAR 1
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7404 if ($cigar_1 =~ /(D|I)/) {
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7405 # warn "$cigar_1\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7406
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7407 # parsing CIGAR string
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7408 my @len = split (/\D+/,$cigar_1); # storing the length per operation
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7409 my @ops = split (/\d+/,$cigar_1); # storing the operation
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7410 shift @ops; # remove the empty first element
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7411 die "CIGAR string '$cigar_1' contained a non-matching number of lengths and operations\n" unless (scalar @len == scalar @ops);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7412
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7413 foreach (0..$#len) {
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7414 if ($ops[$_] eq 'M') {
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7415 # warn "skipping\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7416 next; # irrelevant
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7417 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7418 elsif ($ops[$_] eq 'I') { # insertion in the read sequence
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7419 $number_of_mismatches_1 -= $insertion_open;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7420 $number_of_mismatches_1 -= $len[$_] * $insertion_extend;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7421 # warn "Insertion: Subtracting $ops[$_], length $len[$_], open: $insertion_open, extend: $insertion_extend\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7422 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7423 elsif ($ops[$_] eq 'D') { # deletion in the read sequence
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7424 $number_of_mismatches_1 -= $deletion_open;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7425 $number_of_mismatches_1 -= $len[$_] * $deletion_extend;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7426 # warn "Deletion: Subtracting $ops[$_], length $len[$_], open: $deletion_open, extend: $deletion_extend\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7427 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7428 elsif ($cigar_1 =~ tr/[NSHPX=]//) { # if these (for standard mapping) illegal characters exist we die
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7429 die "The CIGAR string contained illegal CIGAR operations in addition to 'M', 'I' and 'D': $cigar_1\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7430 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7431 else {
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7432 die "The CIGAR string contained undefined CIGAR operations in addition to 'M', 'I' and 'D': $cigar_1\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7433 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7434 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7435
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7436 # warn "Alignment score $number_of_mismatches_1\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7437 # print "Mismatches $number_of_mismatches_1\n\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7438 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7439
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7440 ### CIGAR 2
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7441 if ($cigar_2 =~ /(D|I)/) {
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7442 # warn "$cigar_2\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7443
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7444 # parsing CIGAR string
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7445 my @len = split (/\D+/,$cigar_2); # storing the length per operation
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7446 my @ops = split (/\d+/,$cigar_2); # storing the operation
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7447 shift @ops; # remove the empty first element
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7448 die "CIGAR string '$cigar_2' contained a non-matching number of lengths and operations\n" unless (scalar @len == scalar @ops);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7449
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7450 foreach (0..$#len) {
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7451 if ($ops[$_] eq 'M') {
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7452 # warn "skipping\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7453 next; #irrelevant
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7454 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7455 elsif ($ops[$_] eq 'I') { # insertion in the read sequence
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7456 $number_of_mismatches_2 -= $insertion_open;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7457 $number_of_mismatches_2 -= $len[$_] * $insertion_extend;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7458 # warn "Insertion: Subtracting $ops[$_], length $len[$_], open: $insertion_open, extend: $insertion_extend\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7459 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7460 elsif ($ops[$_] eq 'D') { # deletion in the read sequence
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7461 $number_of_mismatches_2 -= $deletion_open;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7462 $number_of_mismatches_2 -= $len[$_] * $deletion_extend;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7463 # warn "Deletion: Subtracting $ops[$_], length $len[$_], open: $deletion_open, extend: $deletion_extend\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7464 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7465 elsif ($cigar_2 =~ tr/[NSHPX=]//) { # if these (for standard mapping) illegal characters exist we die
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7466 die "The CIGAR string contained illegal CIGAR operations in addition to 'M', 'I' and 'D': $cigar_2\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7467 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7468 else {
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7469 die "The CIGAR string contained undefined CIGAR operations in addition to 'M', 'I' and 'D': $cigar_2\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7470 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7471 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7472 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7473
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7474 ### Now we have InDel corrected Alignment scores
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7475
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7476 ### if the actual sequence contained Ns we need to adjust the number of mismatches. Ns receive a penalty of -1, but normal mismatches receive -6. This might still break if the
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7477 ### sequence contained more than 5 Ns, but this should occur close to never
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7478
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7479 my $seq_1_N_count = $number_of_mismatches_1 % 6; # modulo 6 will return the integer rest after the division
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7480 my $seq_2_N_count = $number_of_mismatches_2 % 6;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7481 # warn "N count 1: $seq_1_N_count\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7482 # warn "N count 2: $seq_2_N_count\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7483
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7484 $number_of_mismatches_1 = int ($number_of_mismatches_1 / 6) + $seq_1_N_count;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7485 $number_of_mismatches_2 = int ($number_of_mismatches_2 / 6) + $seq_2_N_count;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7486
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7487 # warn "MM1 $number_of_mismatches_1 \n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7488 # warn "MM2 $number_of_mismatches_2 \n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7489 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7490 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7491
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7492 ####
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7493
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7494 my $XA_tag = "XA:Z:$number_of_mismatches_1";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7495 my $XB_tag = "XB:Z:$number_of_mismatches_2";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7496
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7497
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7498 # SAM format: QNAME, FLAG, RNAME, 1-based POS, MAPQ, CIGAR, RNEXT, PNEXT, TLEN, SEQ, QUAL, optional fields
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7499 ### optionally print number of non-bisulfite mismatches
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7500 if ($non_bs_mm){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7501 print OUT join("\t", ($id_1, $flag_1, $chr, $start_read_1, $mapq, $cigar_1, $rnext, $pnext_1, $tlen_1, $actual_seq_1, $qual_1, $NM_tag_1, $XX_tag_1, $XM_tag_1,$XR_tag_1,$XG_tag,$XA_tag)), "\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7502 print OUT join("\t", ($id_2, $flag_2, $chr, $start_read_2, $mapq, $cigar_2, $rnext, $pnext_2, $tlen_2, $actual_seq_2, $qual_2, $NM_tag_2, $XX_tag_2, $XM_tag_2,$XR_tag_2,$XG_tag,$XB_tag)), "\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7503 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7504 else{ # default
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7505 print OUT join("\t", ($id_1, $flag_1, $chr, $start_read_1, $mapq, $cigar_1, $rnext, $pnext_1, $tlen_1, $actual_seq_1, $qual_1, $NM_tag_1, $XX_tag_1, $XM_tag_1,$XR_tag_1,$XG_tag)), "\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7506 print OUT join("\t", ($id_2, $flag_2, $chr, $start_read_2, $mapq, $cigar_2, $rnext, $pnext_2, $tlen_2, $actual_seq_2, $qual_2, $NM_tag_2, $XX_tag_2, $XM_tag_2,$XR_tag_2,$XG_tag)), "\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7507 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7508 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7509
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
### Returns the reverse complement of a nucleotide sequence.
###
### Argument: the sequence to convert (dies if it is missing, empty or otherwise false).
### Returns:  the reversed sequence with A<->T and C<->G exchanged. Lower-case a/c/t/g are
###           complemented AND upper-cased; every other character (e.g. N) is left untouched.
sub revcomp{
  my ($seq) = @_;
  die "Missing seq to reverse complement\n" unless $seq;

  # 'scalar' forces string reversal (in list context reverse would reverse the argument list instead)
  my $reverse_complement = scalar reverse $seq;
  $reverse_complement =~ tr/ACTGactg/TGACTGAC/;

  return $reverse_complement;
}
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7516
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
### Computes the Hamming distance between a read and its reference sequence.
###
### Arguments: (1) the actual read sequence, (2) the reference sequence.
### Returns:   the length of the read minus the number of positions at which read and
###            reference carry the same character.
###
### Only positions present in BOTH sequences are compared. If the reference is shorter than
### the read, the surplus read positions still count as mismatches (as they always did), but
### they are no longer compared against undef, which used to raise 'uninitialized value'
### warnings. Surplus reference positions are ignored.
sub hemming_dist{
  my @actual_seq = split //,(shift @_);
  my @ref_seq    = split //,(shift @_);

  # last index that exists in both arrays (-1 if either sequence is empty)
  my $last_shared = $#actual_seq < $#ref_seq ? $#actual_seq : $#ref_seq;

  my $matches = 0;
  foreach my $pos (0..$last_shared){
    ++$matches if ($actual_seq[$pos] eq $ref_seq[$pos]);
  }

  return scalar(@actual_seq) - $matches;
}
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7526
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
### Builds the XX:Z: mismatch tag for a read/reference pair.
###
### Arguments: (1) the actual read sequence, (2) the reference sequence (dies if either is
###            missing, empty or otherwise false).
### Returns:   the string 'XX:Z:' followed by alternating run lengths of matching positions
###            and the REFERENCE base at every mismatching position, e.g. 'XX:Z:2T1'.
###            Run lengths of 0 (adjacent mismatches, or a mismatch at either end) are omitted.
sub make_mismatch_string{
  my $actual_seq = shift or die "Missing actual sequence";
  my $ref_seq    = shift or die "Missing reference sequence";
  my $XX_tag     = "XX:Z:";
  my $ref_length = length($ref_seq);

  # String XOR: identical characters yield \0, differing characters yield a non-\0 byte. If the
  # two strings differ in length the shorter one is padded with \0, so surplus positions show up
  # as differences as well.
  my $diff = ($actual_seq ^ $ref_seq);

  my $prev_mm_pos = 0;                              # 1-based position of the previous mismatch (0 = none yet)
  while ($diff =~ /[^\0]/g){                        # Where bitwise comparison showed a difference
    my $mm_pos    = pos($diff);                     # 1-based position of the current mismatch
    my $nuc_match = $mm_pos - $prev_mm_pos - 1;     # number of matching nucleotides since the last mismatch

    $XX_tag .= $nuc_match if $nuc_match > 0;        # Ignore if mismatches are adjacent to each other

    # Append the reference nucleotide that differed from the read. Positions beyond the end of the
    # reference have no nucleotide to report. (This replaces a conditional 'my $nuc_mm = ... if ...'
    # declaration, whose behaviour Perl documents as undefined.)
    $XX_tag .= substr($ref_seq, $mm_pos - 1, 1) if $mm_pos <= $ref_length;

    $prev_mm_pos = $mm_pos;                         # Position of last mismatch
  }

  my $end_matches = $ref_length - $prev_mm_pos;     # number of matches from the last mismatch till the end of the reference
  $XX_tag .= $end_matches if $end_matches > 0;      # Ignore if mismatch is at the end of sequence
  return $XX_tag;
}
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7544
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7545
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7546
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7547 sub print_helpfile{
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7548 print << "HOW_TO";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7549
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7550
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7551 This program is free software: you can redistribute it and/or modify
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7552 it under the terms of the GNU General Public License as published by
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7553 the Free Software Foundation, either version 3 of the License, or
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7554 (at your option) any later version.
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7555
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7556 This program is distributed in the hope that it will be useful,
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7557 but WITHOUT ANY WARRANTY; without even the implied warranty of
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7558 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7559 GNU General Public License for more details.
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7560 You should have received a copy of the GNU General Public License
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7561 along with this program. If not, see <http://www.gnu.org/licenses/>.
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7562
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7563
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7564
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7565 DESCRIPTION
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7566
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7567
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7568 The following is a brief description of command line options and arguments to control the Bismark
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7569 bisulfite mapper and methylation caller. Bismark takes in FastA or FastQ files and aligns the
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7570 reads to a specified bisulfite genome. Sequence reads are transformed into a bisulfite converted forward strand
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7571 version (C->T conversion) or into a bisulfite treated reverse strand (G->A conversion of the forward strand).
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7572 Each of these reads are then aligned to bisulfite treated forward strand index of a reference genome
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7573 (C->T converted) and a bisulfite treated reverse strand index of the genome (G->A conversion of the
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7574 forward strand, by doing this alignments will produce the same positions). These 4 instances of Bowtie (1 or 2)
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7575 are run in parallel. The sequence file(s) are then read in again sequence by sequence to pull out the original
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7576 sequence from the genome and determine if there were any protected C's present or not.
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7577
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7578 As of version 0.7.0 Bismark will only run 2 alignment threads for OT and OB in parallel, the 4 strand mode can be
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7579 re-enabled by using --non_directional.
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7580
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7581 The final output of Bismark is in SAM format by default. For Bowtie 1 one can alos choose to report the old
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7582 'vanilla' output format, which is a single tab delimited file with all sequences that have a unique best
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7583 alignment to any of the 4 possible strands of a bisulfite PCR product. Both formats are described in more detail below.
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7584
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7585
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7586 USAGE: bismark [options] <genome_folder> {-1 <mates1> -2 <mates2> | <singles>}
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7587
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7588
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7589 ARGUMENTS:
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7590
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7591 <genome_folder> The path to the folder containing the unmodified reference genome
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7592 as well as the subfolders created by the Bismark_Genome_Preparation
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7593 script (/Bisulfite_Genome/CT_conversion/ and /Bisulfite_Genome/GA_conversion/).
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7594 Bismark expects one or more fastA files in this folder (file extension: .fa
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7595 or .fasta). The path can be relative or absolute.
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7596
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7597 -1 <mates1> Comma-separated list of files containing the #1 mates (filename usually includes
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7598 "_1"), e.g. flyA_1.fq,flyB_1.fq). Sequences specified with this option must
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7599 correspond file-for-file and read-for-read with those specified in <mates2>.
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7600 Reads may be a mix of different lengths. Bismark will produce one mapping result
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7601 and one report file per paired-end input file pair.
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7602
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7603 -2 <mates2> Comma-separated list of files containing the #2 mates (filename usually includes
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7604 "_2"), e.g. flyA_1.fq,flyB_1.fq). Sequences specified with this option must
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7605 correspond file-for-file and read-for-read with those specified in <mates1>.
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7606 Reads may be a mix of different lengths.
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7607
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7608 <singles> A comma- or space-separated list of files containing the reads to be aligned (e.g.
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7609 lane1.fq,lane2.fq lane3.fq). Reads may be a mix of different lengths. Bismark will
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7610 produce one mapping result and one report file per input file.
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7611
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7612
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7613 OPTIONS:
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7614
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7615
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7616 Input:
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7617
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7618 -q/--fastq The query input files (specified as <mate1>,<mate2> or <singles> are FASTQ
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7619 files (usually having extension .fg or .fastq). This is the default. See also
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7620 --solexa-quals.
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7621
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7622 -f/--fasta The query input files (specified as <mate1>,<mate2> or <singles> are FASTA
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7623 files (usually havin extension .fa, .mfa, .fna or similar). All quality values
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7624 are assumed to be 40 on the Phred scale. FASTA files are expected to contain both
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7625 the read name and the sequence on a single line (and not spread over several lines).
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7626
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7627 -s/--skip <int> Skip (i.e. do not align) the first <int> reads or read pairs from the input.
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7628
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7629 -u/--upto <int> Only aligns the first <int> reads or read pairs from the input. Default: no limit.
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7630
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7631 --phred33-quals FASTQ qualities are ASCII chars equal to the Phred quality plus 33. Default: on.
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7632
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7633 --phred64-quals FASTQ qualities are ASCII chars equal to the Phred quality plus 64. Default: off.
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7634
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7635 --solexa-quals Convert FASTQ qualities from solexa-scaled (which can be negative) to phred-scaled
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7636 (which can't). The formula for conversion is:
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7637 phred-qual = 10 * log(1 + 10 ** (solexa-qual/10.0)) / log(10). Used with -q. This
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7638 is usually the right option for use with (unconverted) reads emitted by the GA
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7639 Pipeline versions prior to 1.3. Works only for Bowtie 1. Default: off.
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7640
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7641 --solexa1.3-quals Same as --phred64-quals. This is usually the right option for use with (unconverted)
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7642 reads emitted by GA Pipeline version 1.3 or later. Default: off.
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7643
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7644 --path_to_bowtie The full path </../../> to the Bowtie (1 or 2) installation on your system. If not
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7645 specified it is assumed that Bowtie (1 or 2) is in the PATH.
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7646
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7647
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7648 Alignment:
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7649
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7650 -n/--seedmms <int> The maximum number of mismatches permitted in the "seed", i.e. the first L base pairs
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7651 of the read (where L is set with -l/--seedlen). This may be 0, 1, 2 or 3 and the
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7652 default is 1. This option is only available for Bowtie 1 (for Bowtie 2 see -N).
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7653
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7654 -l/--seedlen The "seed length"; i.e., the number of bases of the high quality end of the read to
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7655 which the -n ceiling applies. The default is 28. Bowtie (and thus Bismark) is faster for
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7656 larger values of -l. This option is only available for Bowtie 1 (for Bowtie 2 see -L).
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7657
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7658 -e/--maqerr <int> Maximum permitted total of quality values at all mismatched read positions throughout
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7659 the entire alignment, not just in the "seed". The default is 70. Like Maq, bowtie rounds
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7660 quality values to the nearest 10 and saturates at 30. This value is not relevant for
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7661 Bowtie 2.
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7662
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7663 --chunkmbs <int> The number of megabytes of memory a given thread is given to store path descriptors in
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7664 --best mode. Best-first search must keep track of many paths at once to ensure it is
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7665 always extending the path with the lowest cumulative cost. Bowtie tries to minimize the
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7666 memory impact of the descriptors, but they can still grow very large in some cases. If
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7667 you receive an error message saying that chunk memory has been exhausted in --best mode,
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7668 try adjusting this parameter up to dedicate more memory to the descriptors. This value
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7669 is not relevant for Bowtie 2. Default: 512.
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7670
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7671 -I/--minins <int> The minimum insert size for valid paired-end alignments. E.g. if -I 60 is specified and
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7672 a paired-end alignment consists of two 20-bp alignments in the appropriate orientation
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7673 with a 20-bp gap between them, that alignment is considered valid (as long as -X is also
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7674 satisfied). A 19-bp gap would not be valid in that case. Default: 0.
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7675
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7676 -X/--maxins <int> The maximum insert size for valid paired-end alignments. E.g. if -X 100 is specified and
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7677 a paired-end alignment consists of two 20-bp alignments in the proper orientation with a
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7678 60-bp gap between them, that alignment is considered valid (as long as -I is also satisfied).
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7679 A 61-bp gap would not be valid in that case. Default: 500.
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7680
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7681
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7682 Bowtie 1 Reporting:
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7683
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7684 -k <2> Due to the way Bismark works Bowtie will report up to 2 valid alignments. This option
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7685 will be used by default.
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7686
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7687 --best Make Bowtie guarantee that reported singleton alignments are "best" in terms of stratum
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7688 (i.e. number of mismatches, or mismatches in the seed in the case if -n mode) and in
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7689 terms of the quality; e.g. a 1-mismatch alignment where the mismatch position has Phred
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7690 quality 40 is preferred over a 2-mismatch alignment where the mismatched positions both
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7691 have Phred quality 10. When --best is not specified, Bowtie may report alignments that
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7692 are sub-optimal in terms of stratum and/or quality (though an effort is made to report
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7693 the best alignment). --best mode also removes all strand bias. Note that --best does not
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7694 affect which alignments are considered "valid" by Bowtie, only which valid alignments
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7695 are reported by Bowtie. Bowtie is about 1-2.5 times slower when --best is specified.
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7696 Default: on.
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7697
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7698 --no_best Disables the --best option which is on by default. This can speed up the alignment process,
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7699 e.g. for testing purposes, but for credible results it is not recommended to disable --best.
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7700
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7701
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7702 Output:
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7703
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7704 --non_directional The sequencing library was constructed in a non strand-specific manner, alignments to all four
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7705 bisulfite strands will be reported. Default: OFF.
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7706
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7707 (The current Illumina protocol for BS-Seq is directional, in which case the strands complementary
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7708 to the original strands are merely theoretical and should not exist in reality. Specifying directional
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7709 alignments (which is the default) will only run 2 alignment threads to the original top (OT)
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7710 or bottom (OB) strands in parallel and report these alignments. This is the recommended option
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7711 for sprand-specific libraries).
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7712
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7713 --pbat This options may be used for PBAT-Seq libraries (Post-Bisulfite Adapter Tagging; Kobayashi et al.,
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7714 PLoS Genetics, 2012). This is essentially the exact opposite of alignments in 'directional' mode,
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7715 as it will only launch two alignment threads to the CTOT and CTOB strands instead of the normal OT
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7716 and OB ones. Use this option only if you are certain that your libraries were constructed following
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7717 a PBAT protocol (if you don't know what PBAT-Seq is you should not specify this option). The option
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7718 --pbat works only for single-end and paired-end FastQ files for use with Bowtie1 (uncompressed
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7719 temporary files only).
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7720
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7721 --sam-no-hd Suppress SAM header lines (starting with @). This might be useful when very large input files are
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7722 split up into several smaller files to run concurrently and the output files are to be merged.
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7723
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7724 --quiet Print nothing besides alignments.
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7725
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7726 --vanilla Performs bisulfite mapping with Bowtie 1 and prints the 'old' output (as in Bismark 0.5.X) instead
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7727 of SAM format output.
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7728
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7729 -un/--unmapped Write all reads that could not be aligned to a file in the output directory. Written reads will
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7730 appear as they did in the input, without any translation of quality values that may have
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7731 taken place within Bowtie or Bismark. Paired-end reads will be written to two parallel files with _1
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7732 and _2 inserted in their filenames, i.e. _unmapped_reads_1.txt and unmapped_reads_2.txt. Reads
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7733 with more than one valid alignment with the same number of lowest mismatches (ambiguous mapping)
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7734 are also written to _unmapped_reads.txt unless the option --ambiguous is specified as well.
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7735
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7736 --ambiguous Write all reads which produce more than one valid alignment with the same number of lowest
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7737 mismatches or other reads that fail to align uniquely to a file in the output directory.
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7738 Written reads will appear as they did in the input, without any of the translation of quality
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7739 values that may have taken place within Bowtie or Bismark. Paired-end reads will be written to two
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7740 parallel files with _1 and _2 inserted in theit filenames, i.e. _ambiguous_reads_1.txt and
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7741 _ambiguous_reads_2.txt. These reads are not written to the file specified with --un.
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7742
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7743 -o/--output_dir <dir> Write all output files into this directory. By default the output files will be written into
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7744 the same folder as the input file(s). If the specified folder does not exist, Bismark will attempt
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7745 to create it first. The path to the output folder can be either relative or absolute.
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7746
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7747 --temp_dir <dir> Write temporary files to this directory instead of into the same directory as the input files. If
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7748 the specified folder does not exist, Bismark will attempt to create it first. The path to the
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7749 temporary folder can be either relative or absolute.
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7750
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7751 --non_bs_mm Optionally outputs an extra column specifying the number of non-bisulfite mismatches a read has during the
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7752 alignment step. This option is only available for SAM format. In Bowtie 2 context, this value is
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7753 just the number of actual non-bisulfite mismatches and ignores potential insertions or deletions.
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7754 The format for single-end reads and read 1 of paired-end reads is 'XA:Z:number of mismatches'
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7755 and 'XB:Z:number of mismatches' for read 2 of paired-end reads.
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7756
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7757 --gzip Temporary bisulfite conversion files will be written out in a GZIP compressed form to save disk
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7758 space. This option is available for most alignment modes but is not available for paired-end FastA
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7759 files. This option might be somewhat slower than writing out uncompressed files, but this awaits
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7760 further testing.
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7761
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7762 --bam The output will be written out in BAM format instead of the default SAM format. Bismark will
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7763 attempt to use the path to Samtools that was specified with '--samtools_path', or, if it hasn't
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7764 been specified, attempt to find Samtools in the PATH. If no installation of Samtools can be found,
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7765 the SAM output will be compressed with GZIP instead (yielding a .sam.gz output file).
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7766
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7767 --samtools_path The path to your Samtools installation, e.g. /home/user/samtools/. Does not need to be specified
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7768 explicitly if Samtools is in the PATH already.
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7769
3
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
7770 --prefix <prefix> Prefixes <prefix> to the output filenames. Trailing dots will be replaced by a single one. For
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
7771 example, '--prefix test' with 'file.fq' would result in the output file 'test.file.fq_bismark.sam' etc.
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
7772
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
7773 --old_flag Only in paired-end SAM mode, uses the FLAG values used by Bismark v0.8.2 and before. In addition,
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
7774 this option appends /1 and /2 to the read IDs for reads 1 and 2 relative to the input file. Since
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
7775 both the appended read IDs and custom FLAG values may cause problems with some downstream tools
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
7776 such as Picard, new defaults were implemented as of version 0.8.3.
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
7777
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
7778
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
7779 default old_flag
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
7780 =================== ===================
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
7781 Read 1 Read 2 Read 1 Read 2
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
7782
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
7783 OT: 99 147 67 131
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
7784
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
7785 OB: 83 163 115 179
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
7786
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
7787 CTOT: 99 147 67 131
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
7788
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
7789 CTOB: 83 163 115 179
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
7790
0
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7791
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7792
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7793 Other:
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7794
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7795 -h/--help Displays this help file.
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7796
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7797 -v/--version Displays version information.
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7798
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7799
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7800 BOWTIE 2 SPECIFIC OPTIONS
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7801
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7802 --bowtie2 Uses Bowtie 2 instead of Bowtie 1. Bismark limits Bowtie 2 to only perform end-to-end
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7803 alignments, i.e. searches for alignments involving all read characters (also called
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7804 untrimmed or unclipped alignments). Bismark assumes that raw sequence data is adapter
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7805 and/or quality trimmed where appropriate. Default: off.
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7806
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7807 Bowtie 2 alignment options:
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7808
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7809 -N <int> Sets the number of mismatches allowed in a seed alignment during multiseed alignment.
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7810 Can be set to 0 or 1. Setting this higher makes alignment slower (often much slower)
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7811 but increases sensitivity. Default: 0. This option is only available for Bowtie 2 (for
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7812 Bowtie 1 see -n).
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7813
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7814 -L <int> Sets the length of the seed substrings to align during multiseed alignment. Smaller values
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7815 make alignment slower but more sensitive. Default: the --sensitive preset of Bowtie 2 is
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7816 used by default, which sets -L to 20. This option is only available for Bowtie 2 (for
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7817 Bowtie 1 see -l).
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7818
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7819 --ignore-quals When calculating a mismatch penalty, always consider the quality value at the mismatched
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7820 position to be the highest possible, regardless of the actual value. I.e. input is treated
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7821 as though all quality values are high. This is also the default behavior when the input
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7822 doesn't specify quality values (e.g. in -f mode). This option is invariable and on by default.
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7823
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7824
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7825 Bowtie 2 paired-end options:
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7826
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7827 --no-mixed This option disables Bowtie 2's behavior to try to find alignments for the individual mates if
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7828 it cannot find a concordant or discordant alignment for a pair. This option is invariable and
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7829 on by default.
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7830
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7831 --no-discordant Normally, Bowtie 2 looks for discordant alignments if it cannot find any concordant alignments.
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7832 A discordant alignment is an alignment where both mates align uniquely, but that does not
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7833 satisfy the paired-end constraints (--fr/--rf/--ff, -I, -X). This option disables that behavior
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7834 and it is on by default.
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7835
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7836
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7837 Bowtie 2 effort options:
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7838
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7839 -D <int> Up to <int> consecutive seed extension attempts can "fail" before Bowtie 2 moves on, using
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7840 the alignments found so far. A seed extension "fails" if it does not yield a new best or a
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7841 new second-best alignment. Default: 15.
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7842
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7843 -R <int> <int> is the maximum number of times Bowtie 2 will "re-seed" reads with repetitive seeds.
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7844 When "re-seeding," Bowtie 2 simply chooses a new set of seeds (same length, same number of
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7845 mismatches allowed) at different offsets and searches for more alignments. A read is considered
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7846 to have repetitive seeds if the total number of seed hits divided by the number of seeds
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7847 that aligned at least once is greater than 300. Default: 2.
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7848
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7849 Bowtie 2 parallelization options:
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7850
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7851
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7852 -p NTHREADS Launch NTHREADS parallel search threads (default: 1). Threads will run on separate processors/cores
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7853 and synchronize when parsing reads and outputting alignments. Searching for alignments is highly
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7854 parallel, and speedup is close to linear. Increasing -p increases Bowtie 2's memory footprint.
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7855 E.g. when aligning to a human genome index, increasing -p from 1 to 8 increases the memory footprint
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7856 by a few hundred megabytes. This option is only available if bowtie is linked with the pthreads
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7857 library (i.e. if BOWTIE_PTHREADS=0 is not specified at build time). In addition, this option will
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7858 automatically use the option '--reorder', which guarantees that output SAM records are printed in
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7859 an order corresponding to the order of the reads in the original input file, even when -p is set
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7860 greater than 1 (Bismark requires the Bowtie 2 output to be this way). Specifying --reorder and
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7861 setting -p greater than 1 causes Bowtie 2 to run somewhat slower and use somewhat more memory than
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7862 if --reorder were not specified. Has no effect if -p is set to 1, since output order will naturally
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7863 correspond to input order in that case.
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7864
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7865 Bowtie 2 Scoring options:
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7866
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7867 --score_min <func> Sets a function governing the minimum alignment score needed for an alignment to be considered
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7868 "valid" (i.e. good enough to report). This is a function of read length. For instance, specifying
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7869 L,0,-0.2 sets the minimum-score function f to f(x) = 0 + -0.2 * x, where x is the read length.
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7870 See also: setting function options at http://bowtie-bio.sourceforge.net/bowtie2. The default is
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7871 L,0,-0.2.
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7872
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7873 --rdg <int1>,<int2> Sets the read gap open (<int1>) and extend (<int2>) penalties. A read gap of length N gets a penalty
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7874 of <int1> + N * <int2>. Default: 5, 3.
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7875
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7876 --rfg <int1>,<int2> Sets the reference gap open (<int1>) and extend (<int2>) penalties. A reference gap of length N gets
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7877 a penalty of <int1> + N * <int2>. Default: 5, 3.
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7878
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7879
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7880 Bowtie 2 Reporting options:
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7881
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7882 -most_valid_alignments <int> This used to be the Bowtie 2 parameter -M. As of Bowtie 2 version 2.0.0 beta7 the option -M is
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7883 deprecated. It will be removed in subsequent versions. What used to be called -M mode is still the
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7884 default mode, but adjusting the -M setting is deprecated. Use the -D and -R options to adjust the
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7885 effort expended to find valid alignments.
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7886
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7887 For reference, this used to be the old (now deprecated) description of -M:
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7888 Bowtie 2 searches for at most <int>+1 distinct, valid alignments for each read. The search terminates when it
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7889 can't find more distinct valid alignments, or when it finds <int>+1 distinct alignments, whichever
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7890 happens first. Only the best alignment is reported. Information from the other alignments is used to
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7891 estimate mapping quality and to set SAM optional fields, such as AS:i and XS:i. Increasing -M makes
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7892 Bowtie 2 slower, but increases the likelihood that it will pick the correct alignment for a read that
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7893 aligns many places. For reads that have more than <int>+1 distinct, valid alignments, Bowtie 2 does not
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7894 guarantee that the alignment reported is the best possible in terms of alignment score. -M is
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7895 always used and its default value is set to 10.
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7896
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7897
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7898 'VANILLA' Bismark OUTPUT:
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7899
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7900 Single-end output format (tab-separated):
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7901
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7902 (1) <seq-ID>
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7903 (2) <read alignment strand>
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7904 (3) <chromosome>
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7905 (4) <start position>
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7906 (5) <end position>
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7907 (6) <observed bisulfite sequence>
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7908 (7) <equivalent genomic sequence>
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7909 (8) <methylation call>
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7910 (9) <read conversion>
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7911 (10) <genome conversion>
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7912 (11) <read quality score (Phred33)>
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7913
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7914
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7915 Paired-end output format (tab-separated):
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7916 (1) <seq-ID>
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7917 (2) <read 1 alignment strand>
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7918 (3) <chromosome>
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7919 (4) <start position>
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7920 (5) <end position>
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7921 (6) <observed bisulfite sequence 1>
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7922 (7) <equivalent genomic sequence 1>
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7923 (8) <methylation call 1>
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7924 (9) <observed bisulfite sequence 2>
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7925 (10) <equivalent genomic sequence 2>
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7926 (11) <methylation call 2>
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7927 (12) <read 1 conversion>
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7928 (13) <genome conversion>
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7929 (14) <read 1 quality score (Phred33)>
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7930 (15) <read 2 quality score (Phred33)>
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7931
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7932
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7933 Bismark SAM OUTPUT (default):
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7934
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7935 (1) QNAME (seq-ID)
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7936 (2) FLAG (this flag tries to take the strand a bisulfite read originated from into account (this is different from ordinary DNA alignment flags!))
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7937 (3) RNAME (chromosome)
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7938 (4) POS (start position)
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7939 (5) MAPQ (always 255)
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7940 (6) CIGAR
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7941 (7) RNEXT
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7942 (8) PNEXT
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7943 (9) TLEN
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7944 (10) SEQ
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7945 (11) QUAL (Phred33 scale)
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7946 (12) NM-tag (edit distance to the reference)
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7947 (13) XX-tag (base-by-base mismatches to the reference. This does not include indels)
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7948 (14) XM-tag (methylation call string)
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7949 (15) XR-tag (read conversion state for the alignment)
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7950 (16) XG-tag (genome conversion state for the alignment)
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7951 (17) XA/XB-tag (non-bisulfite mismatches) (optional!)
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7952
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7953 Each read of paired-end alignments is written out in a separate line in the above format.
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7954
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7955
3
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
7956 Last edited on 07 October 2013.
0
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7957
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7958 HOW_TO
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7959 }