Mercurial > repos > hammock > hammock
comparison external_tools/linux/lib/hh/scripts/Align.pm @ 6:2277dd59b9f9 draft
Uploaded
| author | hammock |
|---|---|
| date | Wed, 01 Nov 2017 05:54:28 -0400 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| 5:b7652b7c97bd | 6:2277dd59b9f9 |
|---|---|
| 1 # Package Align.pm | |
| 2 # (c) Johannes Soeding, 2006 | |
| 3 # Perl functions for Smith-Waterman and Needleman-Wunsch sequence alignment | |
| 4 | |
| 5 # HHsuite version 2.0 | |
| 6 # | |
| 7 # Reference: | |
| 8 # Remmert M., Biegert A., Hauser A., and Soding J. | |
| 9 # HHblits: Lightning-fast iterative protein sequence searching by HMM-HMM alignment. | |
| 10 # Nat. Methods, epub Dec 25, doi: 10.1038/NMETH.1818 (2011). | |
| 11 | |
| 12 # (C) Johannes Soeding and Michael Remmert, 2012 | |
| 13 | |
| 14 # This program is free software: you can redistribute it and/or modify | |
| 15 # it under the terms of the GNU General Public License as published by | |
| 16 # the Free Software Foundation, either version 3 of the License, or | |
| 17 # (at your option) any later version. | |
| 18 | |
| 19 # This program is distributed in the hope that it will be useful, | |
| 20 # but WITHOUT ANY WARRANTY; without even the implied warranty of | |
| 21 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
| 22 # GNU General Public License for more details. | |
| 23 | |
| 24 # You should have received a copy of the GNU General Public License | |
| 25 # along with this program. If not, see <http://www.gnu.org/licenses/>. | |
| 26 | |
| 27 # We are very grateful for bug reports! Please contact us at soeding@genzentrum.lmu.de | |
| 28 | |
| 29 ############################################################################# | |
| 30 # Subroutine AlignSW | |
| 31 # Smith-Waterman local alignment | |
| 32 # usage: | |
| 33 # 1. Use global variables of package Align.pm: | |
| 34 # $score = &AlignSW(); | |
| 35 # printf(" XSEQ: $Align::xseq\n"); | |
| 36 # printf(" MATCH: $Align::Sstr\n"); | |
| 37 # printf(" YSEQ: $Align::yseq\n"); | |
| 38 # etc. | |
| 39 # | |
| 40 # 2. Use references and/or global variables | |
| 41 # $score = &AlignSW(\$xseq,\$yseq); | |
| 42 # $score = &AlignSW(\$xseq,\$yseq,\@i,\@j,\$imin,\$imax,\$jmin,\$jmax,\$Sstr,\@S); | |
| 43 # printf(" XSEQ: $xseq\n"); | |
| 44 # printf(" MATCH: $Sstr\n"); | |
| 45 # printf(" YSEQ: $yseq\n"); | |
| 46 # | |
| 47 # Input: $xseq, $yseq : sequences x and y as strings | |
| 48 # Param: $main::d : gap opening penalty | |
| 49 # $main::e : gap extension penalty | |
| 50 # Output: return value : bit score | |
| 51 # $xseq, $yseq : aligned residues of x and y (with - as gap) | |
| 52 # @i : $i[$col],$j[$col] are aligned residues in column $col | |
| 53 # @j : (first is 1 (NOT 0!), 0 means gap) | |
| 54 # $imin : first aligned residue of sequence x | |
| 55 # $imax : last aligned residue of sequence x | |
| 56 # $jmin : first aligned residue of sequence y | |
| 57 # $jmax : last aligned residue of sequence y | |
| 58 # $Sstr : string belonging to $xseq and $yseq showing quality of alignment | |
| 59 # $S[$col] : match score for aligning positions $i[$col] and $j[$col] | |
| 60 ############################################################################# | |
| 61 | |
| 62 ############################################################################# | |
| 63 # Subroutine AlignNW | |
| 64 # Needleman-Wunsch global alignment | |
| 65 # usage: $score = &AlignNW(); | |
| 66 # $score = &AlignNW(\$xseq,\$yseq); | |
| 67 # $score = &AlignNW(\$xseq,\$yseq,\@i,\@j); | |
| 68 # $score = &AlignNW(\$xseq,\$yseq,\@i,\@j,\$imin,\$imax,\$jmin,\$jmax,\$Sstr,\@S); | |
| 69 # | |
| 70 # Input: $xseq, $yseq : sequences x and y as strings | |
| 71 # Param: $main::d : gap opening penalty | |
| 72 # $main::e : gap extension penalty | |
| 73 # $main::g : end gap penalty | |
| 74 # Output: return value : bit score | |
| 75 # $xseq, $yseq : aligned residues of x and y (with - as gap) | |
| 76 # @i : $i[$col],$j[$col] are aligned residues in column $col | |
| 77 # @j : (first is 1 (NOT 0!), 0 means gap) | |
| 78 # $imin : first aligned residue of sequence x | |
| 79 # $imax : last aligned residue of sequence x | |
| 80 # $jmin : first aligned residue of sequence y | |
| 81 # $jmax : last aligned residue of sequence y | |
| 82 # $Sstr : string belonging to $xseq and $yseq showing quality of alignment | |
| 83 # $S[$col] : match score for aligning positions $i[$col] and $j[$col] | |
| 84 ############################################################################# | |
| 85 | |
| 86 package Align; | |
| 87 | |
| 88 use strict; | |
| 89 use vars qw(@ISA @EXPORT @EXPORT_OK %EXPORT_TAGS $VERSION); | |
| 90 use Exporter; | |
| 91 our @ISA = qw(Exporter); | |
| 92 our @EXPORT = qw(&AlignSW &AlignNW $matrix); | |
| 93 | |
| 94 our $xseq; # reference to first sequence (the alignment routines read and overwrite $$xseq) | |
| 95 our $yseq; # reference to second sequence (the alignment routines read and overwrite $$yseq) | |
| 96 our $ri; # reference to output array: $i[$col] -> $ri->[$col] (residue of x in column $col, 0 = gap) | |
| 97 our $rj; # reference to output array: $j[$col] -> $rj->[$col] (residue of y in column $col, 0 = gap) | |
| 98 our $imin; # reference to index of first aligned residue of sequence x | |
| 99 our $imax; # reference to index of last aligned residue of sequence x | |
| 100 our $jmax; # reference to index of last aligned residue of sequence y | |
| 101 our $jmin; # reference to index of first aligned residue of sequence y | |
| 102 our $Sstr; # reference to string; $$Sstr annotates the match quality column by column | |
| 103 our $rS; # reference $rS->[$col] -> $S[$col] = match score for aligning positions $i[$col] and $j[$col] | |
| 104 our $matrix; | |
| 105 | |
| 106 my $firstcall=1; | |
| 107 my @Sab; # Substitution matrix in bits, filled by SetSubstitutionMatrix on the first alignment call | |
| 108 # Residue letter -> matrix index, listed in the order A B C D E F G H I J K L M N O P Q R S T U V W X Y Z (J, O and X map to 20 = X; non-letters are given index 21 by the alignment routines) | |
| 109 my @ch2i=( 0, 3, 4, 3, 6,13, 7, 8, 9,20,11,10,12, 2,20,14, 5, 1,15,16, 4,19,17,20,18, 6); | |
| 110 my @Gonnet = ( | |
| 111 # Column order (same as the row order given at the end of each row): A R N D C Q E G H I L K M F P S T W Y V X ~ | |
| 112 # The Gonnet matrix is in units of 10*log10(); SetSubstitutionMatrix multiplies by 0.3322 to convert to bits | |
| 113 [ 2.4,-0.6,-0.3,-0.3, 0.5,-0.2, 0.0, 0.5,-0.8,-0.8,-1.2,-0.4,-0.7,-2.3, 0.3, 1.1, 0.6,-3.6,-2.2, 0.1,-1.0,-9.9], # A | |
| 114 [-0.6, 4.7, 0.3,-0.3,-2.2, 1.5, 0.4,-1.0, 0.6,-2.4,-2.2, 2.7,-1.7,-3.2,-0.9,-0.2,-0.2,-1.6,-1.8,-2.0,-1.0,-9.9], # R | |
| 115 [-0.3, 0.3, 3.8, 2.2,-1.8, 0.7, 0.9, 0.4, 1.2,-2.8,-3.0, 0.8,-2.2,-3.1,-0.9, 0.9, 0.5,-3.6,-1.4,-2.2,-1.0,-9.9], # N | |
| 116 [-0.3,-0.3, 2.2, 4.7,-3.2, 0.9, 2.7, 0.1, 0.4,-3.8,-4.0, 0.5,-3.0,-4.5,-0.7, 0.5, 0.0,-5.2,-2.8,-2.9,-1.0,-9.9], # D | |
| 117 [ 0.5,-2.2,-1.8,-3.2,11.5,-2.4,-3.0,-2.0,-1.3,-1.1,-1.5,-2.8,-0.9,-0.8,-3.1, 0.1,-0.5,-1.0,-0.5, 0.0,-1.0,-9.9], # C | |
| 118 [-0.2, 1.5, 0.7, 0.9,-2.4, 2.7, 1.7,-1.0, 1.2,-1.9,-1.6, 1.5,-1.0,-2.6,-0.2, 0.2, 0.0,-2.7,-1.7,-1.5,-1.0,-9.9], # Q | |
| 119 [ 0.0, 0.4, 0.9, 2.7,-3.0, 1.7, 3.6,-0.8, 0.4,-2.7,-2.8, 1.2,-2.0,-3.9,-0.5, 0.2,-0.1,-4.3,-2.7,-1.9,-1.0,-9.9], # E | |
| 120 [ 0.5,-1.0, 0.4, 0.1,-2.0,-1.0,-0.8, 6.6,-1.4,-4.5,-4.4,-1.1,-3.5,-5.2,-1.6, 0.4,-1.1,-4.0,-4.0,-3.3,-1.0,-9.9], # G | |
| 121 [-0.8, 0.6, 1.2, 0.4,-1.3, 1.2, 0.4,-1.4, 6.0,-2.2,-1.9, 0.6,-1.3,-0.1,-1.1,-0.2,-0.3,-0.8,-2.2,-2.0,-1.0,-9.9], # H | |
| 122 [-0.8,-2.4,-2.8,-3.8,-1.1,-1.9,-2.7,-4.5,-2.2, 4.0, 2.8,-2.1, 2.5, 1.0,-2.6,-1.8,-0.6,-1.8,-0.7, 3.1,-1.0,-9.9], # I | |
| 123 [-1.2,-2.2,-3.0,-4.0,-1.5,-1.6,-2.8,-4.4,-1.9, 2.8, 4.0,-2.1, 2.8, 2.0,-2.3,-2.1,-1.3,-0.7, 0.0, 1.8,-1.0,-9.9], # L | |
| 124 [-0.4, 2.7, 0.8, 0.5,-2.8, 1.5, 1.2,-1.1, 0.6,-2.1,-2.1, 3.2,-1.4,-3.3,-0.6, 0.1, 0.1,-3.5,-2.1,-1.7,-1.0,-9.9], # K | |
| 125 [-0.7,-1.7,-2.2,-3.0,-0.9,-1.0,-2.0,-3.5,-1.3, 2.5, 2.8,-1.4, 4.3, 1.6,-2.4,-1.4,-0.6,-1.0,-0.2, 1.6,-1.0,-9.9], # M | |
| 126 [-2.3,-3.2,-3.1,-4.5,-0.8,-2.6,-3.9,-5.2,-0.1, 1.0, 2.0,-3.3, 1.6, 7.0,-3.8,-2.8,-2.2, 3.6, 5.1, 0.1,-1.0,-9.9], # F | |
| 127 [ 0.3,-0.9,-0.9,-0.7,-3.1,-0.2,-0.5,-1.6,-1.1,-2.6,-2.3,-0.6,-2.4,-3.8, 7.6, 0.4, 0.1,-5.0,-3.1,-1.8,-1.0,-9.9], # P | |
| 128 [ 1.1,-0.2, 0.9, 0.5, 0.1, 0.2, 0.2, 0.4,-0.2,-1.8,-2.1, 0.1,-1.4,-2.8, 0.4, 2.2, 1.5,-3.3,-1.9,-1.0,-1.0,-9.9], # S | |
| 129 [ 0.6,-0.2, 0.5, 0.0,-0.5, 0.0,-0.1,-1.1,-0.3,-0.6,-1.3, 0.1,-0.6,-2.2, 0.1, 1.5, 2.5,-3.5,-1.9, 0.0,-1.0,-9.9], # T | |
| 130 [-3.6,-1.6,-3.6,-5.2,-1.0,-2.7,-4.3,-4.0,-0.8,-1.8,-0.7,-3.5,-1.0, 3.6,-5.0,-3.3,-3.5,14.2, 4.1,-2.6,-1.0,-9.9], # W | |
| 131 [-2.2,-1.8,-1.4,-2.8,-0.5,-1.7,-2.7,-4.0,-2.2,-0.7, 0.0,-2.1,-0.2, 5.1,-3.1,-1.9,-1.9, 4.1, 7.8,-1.1,-1.0,-9.9], # Y | |
| 132 [ 0.1,-2.0,-2.2,-2.9, 0.0,-1.5,-1.9,-3.3,-2.0, 3.1, 1.8,-1.7, 1.6, 0.1,-1.8,-1.0, 0.0,-2.6,-1.1, 3.4,-1.0,-9.9], # V | |
| 133 [-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,+1.0,-9.9], # X | |
| 134 [-9.9,-9.9,-9.9,-9.9,-9.9,-9.9,-9.9,-9.9,-9.9,-9.9,-9.9,-9.9,-9.9,-9.9,-9.9,-9.9,-9.9,-9.9,-9.9,-9.9,-9.9,-9.9] # ~ | |
| 135 ); | |
| 136 | |
| 137 # BLOSUM62 scores; rows and columns are both in the order A R N D C Q E G H I L K M F P S T W Y V X ~ (the 22nd row/column holds -9 throughout, like the ~ row of @Gonnet) | |
| 138 my @BLOSUM62 = ( | |
| 139 [ 4,-1,-2,-2, 0,-1,-1, 0,-2,-1,-1,-1,-1,-2,-1, 1, 0,-3,-2, 0, 0,-9], | |
| 140 [-1, 5, 0,-2,-3, 1, 0,-2, 0,-3,-2, 2,-1,-3,-2,-1,-1,-3,-2,-3,-1,-9], | |
| 141 [-2, 0, 6, 1,-3, 0, 0, 0, 1,-3,-3, 0,-2,-3,-2, 1, 0,-4,-2,-3,-1,-9], | |
| 142 [-2,-2, 1, 6,-3, 0, 2,-1,-1,-3,-4,-1,-3,-3,-1, 0,-1,-4,-3,-3,-1,-9], | |
| 143 [ 0,-3,-3,-3, 9,-3,-4,-3,-3,-1,-1,-3,-1,-2,-3,-1,-1,-2,-2,-1,-2,-9], | |
| 144 [-1, 1, 0, 0,-3, 5, 2,-2, 0,-3,-2, 1, 0,-3,-1, 0,-1,-2,-1,-2,-1,-9], | |
| 145 [-1, 0, 0, 2,-4, 2, 5,-2, 0,-3,-3, 1,-2,-3,-1, 0,-1,-3,-2,-2,-1,-9], | |
| 146 [ 0,-2, 0,-1,-3,-2,-2, 6,-2,-4,-4,-2,-3,-3,-2, 0,-2,-2,-3,-3,-1,-9], | |
| 147 [-2, 0, 1,-1,-3, 0, 0,-2, 8,-3,-3,-1,-2,-1,-2,-1,-2,-2, 2,-3,-1,-9], | |
| 148 [-1,-3,-3,-3,-1,-3,-3,-4,-3, 4, 2,-3, 1, 0,-3,-2,-1,-3,-1, 3,-1,-9], | |
| 149 [-1,-2,-3,-4,-1,-2,-3,-4,-3, 2, 4,-2, 2, 0,-3,-2,-1,-2,-1, 1,-1,-9], | |
| 150 [-1, 2, 0,-1,-3, 1, 1,-2,-1,-3,-2, 5,-1,-3,-1, 0,-1,-3,-2,-2,-1,-9], | |
| 151 [-1,-1,-2,-3,-1, 0,-2,-3,-2, 1, 2,-1, 5, 0,-2,-1,-1,-1,-1, 1,-1,-9], | |
| 152 [-2,-3,-3,-3,-2,-3,-3,-3,-1, 0, 0,-3, 0, 6,-4,-2,-2, 1, 3,-1,-1,-9], | |
| 153 [-1,-2,-2,-1,-3,-1,-1,-2,-2,-3,-3,-1,-2,-4, 7,-1,-1,-4,-3,-2,-2,-9], | |
| 154 [ 1,-1, 1, 0,-1, 0, 0, 0,-1,-2,-2, 0,-1,-2,-1, 4, 1,-3,-2,-2, 0,-9], | |
| 155 [ 0,-1, 0,-1,-1,-1,-1,-2,-2,-1,-1,-1,-1,-2,-1, 1, 5,-2,-2, 0, 0,-9], | |
| 156 [-3,-3,-4,-4,-2,-2,-3,-2,-2,-3,-2,-3,-1, 1,-4,-3,-2,11, 2,-3,-2,-9], | |
| 157 [-2,-2,-2,-3,-2,-1,-2,-3, 2,-1,-1,-2,-1, 3,-3,-2,-2, 2, 7,-1,-1,-9], | |
| 158 [ 0,-3,-3,-3,-1,-2,-2,-3,-3, 3, 1,-2, 1,-1,-2,-2, 0,-3,-1, 4,-1,-9], | |
| 159 [ 0,-1,-1,-1,-2,-1,-1,-1,-1,-1,-1,-1,-1,-1,-2, 0, 0,-2,-1,-1,+1,-9], | |
| 160 [-9,-9,-9,-9,-9,-9,-9,-9,-9,-9,-9,-9,-9,-9,-9,-9,-9,-9,-9,-9,-9,-9] | |
| 161 ); | |
| 162 | |
| 163 # print("Substitution matrix:\n"); | |
| 164 # for ($a=0; $a<=20; $a++) { | |
| 165 # for ($b=0; $b<=20; $b++) { | |
| 166 # printf("%6.1f ",$Sab[$a][$b]); | |
| 167 # } | |
| 168 # printf("\n"); | |
| 169 # } | |
| 170 | |
| 171 | |
# Fill the substitution matrix @Sab (in bits). The work is done only on the
# first call of one of the alignment routines; later calls return at once.
#
# The matrix is selected by $matrix:
#   "Gonnet"   : @Gonnet, converted from units of 10*log10() to bits
#   "Blosum62" : @BLOSUM62, copied unchanged
#   otherwise  : identity-like matrix (match 2, mismatch -1, anything vs X 0,
#                X vs X +1, anything vs gap/invalid symbol -10)
#
# Indices 0..19 are the amino acids, 20 is X and 21 is the code that the
# alignment routines assign to every non-letter symbol. All 22 rows and
# columns are filled so that index 21 carries its tabulated penalty instead
# of silently evaluating to 0.
sub SetSubstitutionMatrix {
    return unless $firstcall;

    if (defined($matrix) && $matrix eq "Gonnet") {
        for my $p (0 .. 21) {
            for my $q (0 .. 21) {
                $Sab[$p][$q] = $Gonnet[$p][$q]*0.3322; # 1*log(10)/log(2)
            }
        }
    } elsif (defined($matrix) && $matrix eq "Blosum62") {
        printf("Using Blosum62 matrix...\n");
        for my $p (0 .. 21) {
            for my $q (0 .. 21) {
                $Sab[$p][$q] = $BLOSUM62[$p][$q];
            }
        }
    } else {
        for my $p (0 .. 19) {
            for my $q (0 .. 19) {
                $Sab[$p][$q] = -1;
            }
            $Sab[$p][$p] = 2;
        }
        for my $q (0 .. 20) {
            $Sab[20][$q] = $Sab[$q][20] = 0;
            $Sab[21][$q] = $Sab[$q][21] = -10;
        }
        $Sab[20][20] = +1;   # if in doubt, match X with X
        $Sab[21][21] = -10;  # gap/invalid symbol against gap/invalid symbol
    }

    $firstcall = 0;
    return;
}
| 206 | |
# maxbt(val1,...,valx,\$bt)
# Returns the largest of val1..valx and stores the zero-based position of that
# value in $bt. On ties the earliest position is kept, because a later value
# only replaces the current best when it is strictly greater.
sub maxbt {
    my @vals    = @_;
    my $idx_ref = pop @vals;   # last argument is the reference to $bt
    my $best    = $vals[0];
    my $winner  = 0;
    for my $k (1 .. $#vals) {
        next unless $vals[$k] > $best;
        $best   = $vals[$k];
        $winner = $k;
    }
    $$idx_ref = $winner;
    return $best;
}
| 219 | |
# max3bt(val1,val2,val3,\$bt)
# Returns the largest of the three values and stores its zero-based position
# in $bt. On ties the later position wins (val2 over val1, val3 over both).
# Works directly on @_ to avoid copying the arguments.
sub max3bt {
    my $k = ($_[1] < $_[0]) ? 0 : 1;    # better of the first two
    $k = 2 unless $_[2] < $_[$k];       # third value wins unless strictly smaller
    ${$_[3]} = $k;
    return $_[$k];
}
| 240 | |
# max2bt(val1,val2,\$bt)
# Returns the larger of the two values and stores its zero-based position in
# $bt. On a tie the second value is chosen. Works directly on @_ to avoid
# copying the arguments.
sub max2bt {
    my $k = ($_[1] < $_[0]) ? 0 : 1;
    ${$_[2]} = $k;
    return $_[$k];
}
| 251 | |
| 252 | |
#############################################################################
# Subroutine AlignSW
# Smith-Waterman local alignment with affine gap penalties
#
# Arguments are positional and all optional; each one is a reference:
#   \$xseq, \$yseq, \@i, \@j, \$imin, \$imax, \$jmin, \$jmax, \$Sstr, \@S
# An argument that is not passed leaves the package variable of the same
# role ($xseq, $yseq, $ri, $rj, ...) as it was before the call.
#
# Parameters read from package main:
#   $main::d            gap opening penalty
#   $main::e            gap extension penalty
#   $main::dx, $main::dy  if defined, opening penalty for a gap in x / in y
#                         (used instead of $main::d)
#
# On return $$xseq and $$yseq hold the aligned residues (with - as gap),
# @$ri / @$rj the residue index per column (1-based, 0 = gap), @$rS the score
# per column, $$Sstr the annotation string and $$imin..$$jmax the range of
# aligned residues.
#
# Returns the score of the best local alignment, or 0 (after a warning) when
# one of the sequences is empty.
#############################################################################
sub AlignSW {
    if (@_>=1)  {$xseq=$_[0];}
    if (@_>=2)  {$yseq=$_[1];}
    if (@_>=3)  {$ri=$_[2];}
    if (@_>=4)  {$rj=$_[3];}
    if (@_>=5)  {$imin=$_[4];}
    if (@_>=6)  {$imax=$_[5];}
    if (@_>=7)  {$jmin=$_[6];}
    if (@_>=8)  {$jmax=$_[7];}
    if (@_>=9)  {$Sstr=$_[8];}
    if (@_>=10) {$rS=$_[9];}

    # Strip whitespace before testing for emptiness, so that a sequence that
    # consists of whitespace only is rejected as well.
    $$xseq =~ s/\s//g;
    $$yseq =~ s/\s//g;
    if (length($$xseq)<1) {warn ("ERROR in Align.pm: sequence x is empty\n"); return 0;}
    if (length($$yseq)<1) {warn ("ERROR in Align.pm: sequence y is empty\n"); return 0;}

    my @xchr = split(//,$$xseq);  # ASCII characters of $xseq
    my @ychr = split(//,$$yseq);  # ASCII characters of $yseq

    my $Lx=@xchr;      # length of sequence x
    my $Ly=@ychr;      # length of sequence y
    my @M;             # $M[a][b] = score of best alignment of x[1..a] and y[1..b] ending in match state
    my @A;             # $A[a][b] = score of best alignment of x[1..a] and y[1..b] ending in gap in x
    my @B;             # $B[a][b] = score of best alignment of x[1..a] and y[1..b] ending in gap in y
    my @Mbt;           # $Mbt[a][b] = 0:STOP 1:M 2:A 3:B
    my @Abt;           # $Abt[a][b] = 0:A 1:M
    my @Bbt;           # $Bbt[a][b] = 0:B 1:M
    my $score;         # bit score of alignment
    my $bt;            # backtracing variable
    my $state;         # STOP:0 M:1 A:2 B:3
    my ($i, $j);       # indices for sequence x and y, respectively

    my $dx = defined($main::dx) ? $main::dx : $main::d;
    my $dy = defined($main::dy) ? $main::dy : $main::d;

    # Translate residue characters into substitution matrix indices.
    # Every non-letter gets index 21; a warning is printed unless it is one
    # of the gap symbols '.', '-' or '~'.
    my $encode = sub {
        my ($chars, $which) = @_;
        my @res;
        for my $pos (0 .. $#{$chars}) {
            my $code = ord(uc($chars->[$pos]));
            if ($code<65 || $code>90) {
                if ($code!=ord(".") && $code!=ord("-") && $code!=ord("~")) {
                    printf(STDERR "\nWARNING: invalid symbol '%s' in pos $pos of $which sequence to be aligned\n",$chars->[$pos]);
                }
                push(@res, 21);
            } else {
                push(@res, $ch2i[$code-65]);
            }
        }
        return @res;
    };
    my @xres = $encode->(\@xchr, "first");   # internal integer representation of residues of x
    my @yres = $encode->(\@ychr, "second");  # internal integer representation of residues of y

    unshift (@xres,21); unshift (@xchr," "); # insert dummy 0'th element
    unshift (@yres,21); unshift (@ychr," "); # insert dummy 0'th element

    SetSubstitutionMatrix();

    # Initialization (this also creates every row, so that the row references
    # taken in the iteration below point into @M, @A and @B)
    for ($i=0; $i<=$Lx; $i++) {
        $M[$i][0]=-999; $A[$i][0]=-999; $B[$i][0]=-999;
    }
    for ($j=1; $j<=$Ly; $j++) {
        $M[0][$j]=-999; $A[0][$j]=-999; $B[0][$j]=-999;
    }

    # Iteration
    for ($i=1; $i<=$Lx; ++$i) {
        my $Mi  =$M[$i];
        my $Mi1 =$M[$i-1];
        my $Ai  =$A[$i];
        my $Ai1 =$A[$i-1];
        my $Bi  =$B[$i];
        my $Bi1 =$B[$i-1];
        my $Sabx=$Sab[$xres[$i]];
        my $j1=0;
        for ($j=1; $j<=$Ly; ++$j, ++$j1) {
            # The leading 0 is the STOP option of a local alignment: it keeps
            # the score from dropping below the match score of this cell and
            # makes maxbt store 0:STOP 1:M 2:A 3:B, the encoding that the
            # backtracing below relies on.
            $Mi->[$j] = maxbt(0, $Mi1->[$j1], $Ai1->[$j1], $Bi1->[$j1], \$Mbt[$i][$j]) + $Sabx->[$yres[$j]];
            $Ai->[$j] = max2bt($Ai->[$j1]-$main::e, $Mi->[$j1]-$dx, \$Abt[$i][$j]);
            $Bi->[$j] = max2bt($Bi1->[$j]-$main::e, $Mi1->[$j]-$dy, \$Bbt[$i][$j]);
        }
    }

    # Finding maximum
    $score = -1000;
    for ($i=1; $i<=$Lx; $i++) {
        my $Mi =$M[$i];
        for ($j=1; $j<=$Ly; $j++) {
            if ($Mi->[$j]>$score) {$score=$Mi->[$j]; $$imax=$i; $$jmax=$j;}
        }
    }

    # Backtracing
    @$ri=();
    @$rj=();
    @$rS=();
    $state=1; # last state is M
    $i=$$imax; $j=$$jmax;
    $$xseq=""; $$yseq="";
    while ($state) {
        if ($state==1) {
            # current state is M (match-match)
            unshift(@$ri,$i);
            unshift(@$rj,$j);
            $state = $Mbt[$i][$j];  # previous state (0 ends the alignment)
            $$xseq=$xchr[$i].$$xseq;
            $$yseq=$ychr[$j].$$yseq;
            unshift(@$rS, $Sab[$xres[$i]][$yres[$j]]);
            $$imin=$i; $$jmin=$j;
            $i--; $j--;
        } elsif ($state==2) {
            # current state is A (gap in x)
            unshift(@$ri,0);
            unshift(@$rj,$j);
            $$xseq="-".$$xseq;
            $$yseq=$ychr[$j].$$yseq;
            $bt = $Abt[$i][$j--];
            if ($bt) {
                # previous state was M
                unshift(@$rS,-$dx);
                $state = 1;
            } else {
                # previous state was A
                unshift(@$rS,-$main::e);
            }
        } else {
            # current state is B (gap in y)
            unshift(@$ri,$i);
            unshift(@$rj,0);
            $$xseq=$xchr[$i].$$xseq;
            $$yseq="-".$$yseq;
            $bt = $Bbt[$i--][$j];
            if ($bt) {
                # previous state was M
                unshift(@$rS,-$dy);
                $state = 1;
            } else {
                # previous state was B
                unshift(@$rS,-$main::e);
            }
        }
    }

    # Set annotation string representing match quality
    $$Sstr="";
    for (my $col=0; $col<@$ri; $col++) {
        if ($xres[$ri->[$col]] == $yres[$rj->[$col]]) {
            $$Sstr.=uc($xchr[$ri->[$col]]);
        } elsif ($rS->[$col] > 0 ) {
            $$Sstr.="+";
        } else {
            $$Sstr.=".";
        }
    }
    return $score;
}
| 426 | |
| 427 | |
#############################################################################
# Subroutine AlignNW
# Needleman-Wunsch global alignment with affine gap penalties and a separate
# per-residue penalty for end gaps
#
# Arguments are positional and all optional; each one is a reference:
#   \$xseq, \$yseq, \@i, \@j, \$imin, \$imax, \$jmin, \$jmax, \$Sstr, \@S
# An argument that is not passed leaves the package variable of the same
# role ($xseq, $yseq, $ri, $rj, ...) as it was before the call.
#
# Parameters read from package main:
#   $main::d            gap opening penalty
#   $main::e            gap extension penalty
#   $main::g            end gap penalty (per residue)
#   $main::dx, $main::dy  if defined, opening penalty for a gap in x / in y
#                         (used instead of $main::d)
#
# On return $$xseq and $$yseq hold the aligned residues (with - as gap),
# @$ri / @$rj the residue index per column (1-based, 0 = gap), @$rS the score
# per column, $$Sstr the annotation string, $$imax / $$jmax the cell in which
# the best alignment ends before trailing end gaps, and $$imin / $$jmin the
# first matched residue pair.
#
# Returns the score of the best alignment, or 0 (after a warning) when one of
# the sequences is empty.
#############################################################################
sub AlignNW {
    if (@_>=1)  {$xseq=$_[0];}
    if (@_>=2)  {$yseq=$_[1];}
    if (@_>=3)  {$ri=$_[2];}
    if (@_>=4)  {$rj=$_[3];}
    if (@_>=5)  {$imin=$_[4];}
    if (@_>=6)  {$imax=$_[5];}
    if (@_>=7)  {$jmin=$_[6];}
    if (@_>=8)  {$jmax=$_[7];}
    if (@_>=9)  {$Sstr=$_[8];}
    if (@_>=10) {$rS=$_[9];}

    # Strip whitespace before testing for emptiness, so that a sequence that
    # consists of whitespace only is rejected as well.
    $$xseq =~ s/\s//g;
    $$yseq =~ s/\s//g;
    if (length($$xseq)<1) {warn ("ERROR in Align.pm: sequence x is empty\n"); return 0;}
    if (length($$yseq)<1) {warn ("ERROR in Align.pm: sequence y is empty\n"); return 0;}

    my @xchr = split(//,$$xseq);  # ASCII characters of $xseq
    my @ychr = split(//,$$yseq);  # ASCII characters of $yseq

    my $Lx=@xchr;      # length of sequence x
    my $Ly=@ychr;      # length of sequence y
    my @M;             # $M[a][b] = score of best alignment of x[1..a] and y[1..b] ending in match state
    my @A;             # $A[a][b] = score of best alignment of x[1..a] and y[1..b] ending in gap in x
    my @B;             # $B[a][b] = score of best alignment of x[1..a] and y[1..b] ending in gap in y
    my @Mbt;           # $Mbt[a][b] = 0:M 1:A 2:B (as stored by max3bt; state = value+1)
    my @Abt;           # $Abt[a][b] = 0:A 1:M
    my @Bbt;           # $Bbt[a][b] = 0:B 1:M
    my $score;         # bit score of alignment
    my $bt;            # backtracing variable
    my $state;         # M:1 A:2 B:3
    my ($i, $j);       # indices for sequence x and y, respectively

    my $dx = defined($main::dx) ? $main::dx : $main::d;
    my $dy = defined($main::dy) ? $main::dy : $main::d;

    # Translate residue characters into substitution matrix indices.
    # Every non-letter gets index 21; a warning is printed unless it is one
    # of the gap symbols '.', '-' or '~'.
    my $encode = sub {
        my ($chars, $which) = @_;
        my @res;
        for my $pos (0 .. $#{$chars}) {
            my $code = ord(uc($chars->[$pos]));
            if ($code<65 || $code>90) {
                if ($code!=ord(".") && $code!=ord("-") && $code!=ord("~")) {
                    printf(STDERR "\nWARNING: invalid symbol '%s' in pos $pos of $which sequence to be aligned\n",$chars->[$pos]);
                }
                push(@res, 21);
            } else {
                push(@res, $ch2i[$code-65]);
            }
        }
        return @res;
    };
    my @xres = $encode->(\@xchr, "first");   # internal integer representation of residues of x
    my @yres = $encode->(\@ychr, "second");  # internal integer representation of residues of y

    unshift (@xres,21); unshift (@xchr," "); # insert dummy 0'th element
    unshift (@yres,21); unshift (@ychr," "); # insert dummy 0'th element

    SetSubstitutionMatrix();

    # Initialization: leading gaps are end gaps and cost $main::g per residue
    $M[0][0]=$A[0][0]=$B[0][0]=0;
    for ($i=1; $i<=$Lx; $i++) {
        $M[$i][0] = -999;
        $A[$i][0] = -999;
        $B[$i][0] = -$i*$main::g;
        $Bbt[$i][0] = 0; # previous state was B as well (gap in y)
    }
    for ($j=1; $j<=$Ly; $j++) {
        $M[0][$j] = -999;
        $A[0][$j] = -$j*$main::g;
        $B[0][$j] = -999;
        $Abt[0][$j] = 0; # previous state was A as well (gap in x)
    }

    # Iteration
    for ($i=1; $i<=$Lx; ++$i) {
        my $Mi  =$M[$i];
        my $Mi1 =$M[$i-1];
        my $Ai  =$A[$i];
        my $Ai1 =$A[$i-1];
        my $Bi  =$B[$i];
        my $Bi1 =$B[$i-1];
        my $Sabx=$Sab[$xres[$i]];
        my $j1=0;
        for ($j=1; $j<=$Ly; ++$j, ++$j1) {
            $Mi->[$j] = max3bt($Mi1->[$j1], $Ai1->[$j1], $Bi1->[$j1], \$Mbt[$i][$j]) + $Sabx->[$yres[$j]];
            $Ai->[$j] = max2bt($Ai->[$j1]-$main::e, $Mi->[$j1]-$dx, \$Abt[$i][$j]);
            $Bi->[$j] = max2bt($Bi1->[$j]-$main::e, $Mi1->[$j]-$dy, \$Bbt[$i][$j]);
        }
    }

    # Finding maximum: the alignment may end in any cell of the last column
    # or last row; the residues that remain are charged as end gaps.
    # Candidates are [score, i, j, state], in the order last column (i=1..Lx),
    # then last row (j=1..Ly-1), each with states M, A, B.
    my @ends;
    for ($i=1; $i<=$Lx; $i++) {
        my $endgappenalty = ($Lx-$i)*$main::g;
        push(@ends, [$M[$i][$Ly]-$endgappenalty, $i, $Ly, 1],
                    [$A[$i][$Ly]-$endgappenalty, $i, $Ly, 2],
                    [$B[$i][$Ly]-$endgappenalty, $i, $Ly, 3]);
    }
    for ($j=1; $j<$Ly; $j++) {
        my $endgappenalty = ($Ly-$j)*$main::g;
        push(@ends, [$M[$Lx][$j]-$endgappenalty, $Lx, $j, 1],
                    [$A[$Lx][$j]-$endgappenalty, $Lx, $j, 2],
                    [$B[$Lx][$j]-$endgappenalty, $Lx, $j, 3]);
    }
    # The first candidate always seeds the maximum, so $$imax, $$jmax and
    # $state are defined even when every score is very negative. A later
    # candidate replaces it only when strictly better.
    foreach my $end (@ends) {
        if (!defined($score) || $end->[0] > $score) {
            $score  = $end->[0];
            $$imax  = $end->[1];
            $$jmax  = $end->[2];
            $state  = $end->[3];
        }
    }

    # Backtracing
    @$ri=();
    @$rj=();
    @$rS=();
    $i=$Lx; $j=$Ly;
    $$xseq=""; $$yseq="";

    # Trailing end gaps are emitted explicitly. The regular backtracing then
    # starts in cell ($$imax,$$jmax) in the state that produced the reported
    # score, so score and alignment always belong together.
    while ($j > $$jmax) {
        unshift(@$ri,0);
        unshift(@$rj,$j);
        $$xseq="-".$$xseq;
        $$yseq=$ychr[$j].$$yseq;
        unshift(@$rS,-$main::g); # end gap
        $j--;
    }
    while ($i > $$imax) {
        unshift(@$ri,$i);
        unshift(@$rj,0);
        $$xseq=$xchr[$i].$$xseq;
        $$yseq="-".$$yseq;
        unshift(@$rS,-$main::g); # end gap
        $i--;
    }

    while ($i || $j) {
        # Once one sequence is used up only a leading end gap is possible.
        # Forcing the state here keeps both indices from becoming negative.
        if    ($i==0) {$state=2;}
        elsif ($j==0) {$state=3;}

        if ($state==1) {
            # current state is M (match-match)
            unshift(@$ri,$i);
            unshift(@$rj,$j);
            $state = $Mbt[$i][$j]+1; # previous state
            $$xseq=$xchr[$i].$$xseq;
            $$yseq=$ychr[$j].$$yseq;
            unshift(@$rS, $Sab[$xres[$i]][$yres[$j]]);
            $$imin=$i; $$jmin=$j;
            $i--; $j--;
        } elsif ($state==2) {
            # current state is A (gap in x)
            unshift(@$ri,0); # $ri->[$col]=0 for gap in $x
            unshift(@$rj,$j);
            $$xseq="-".$$xseq;
            $$yseq=$ychr[$j].$$yseq;
            $bt = $Abt[$i][$j--];
            if ($bt) {
                # previous state was M
                if ($i==$Lx || $i==0) {
                    unshift(@$rS,-$main::g); # end gap
                } else {
                    unshift(@$rS,-$dx);      # gap opening
                }
                $state = 1;
            } else {
                # previous state was A
                if ($i==$Lx || $i==0) {
                    unshift(@$rS,-$main::g); # end gap
                } else {
                    unshift(@$rS,-$main::e); # gap extension
                }
            }
        } else {
            # current state is B (gap in y)
            unshift(@$ri,$i);
            unshift(@$rj,0); # $rj->[$col]=0 for gap in $y
            $$xseq=$xchr[$i].$$xseq;
            $$yseq="-".$$yseq;
            $bt = $Bbt[$i--][$j];
            if ($bt) {
                # previous state was M
                if ($j==$Ly || $j==0) {
                    unshift(@$rS,-$main::g); # end gap
                } else {
                    unshift(@$rS,-$dy);      # gap opening
                }
                $state = 1;
            } else {
                # previous state was B
                if ($j==$Ly || $j==0) {
                    unshift(@$rS,-$main::g); # end gap
                } else {
                    unshift(@$rS,-$main::e); # gap extension
                }
            }
        }
    }

    # Set annotation string representing match quality
    $$Sstr="";
    for (my $col=0; $col<@$ri; $col++) {
        if ($xres[$ri->[$col]] == $yres[$rj->[$col]]) {
            $$Sstr.=uc($xchr[$ri->[$col]]);
        } elsif ($rS->[$col] > 0 ) {
            $$Sstr.="+";
        } else {
            $$Sstr.=".";
        }
    }
    return $score;
}
| 658 | |
| 659 1; |
