Mercurial > repos > hammock > hammock
comparison external_tools/linux/lib/hh/scripts/addss.pl @ 6:2277dd59b9f9 draft
Uploaded
| author | hammock |
|---|---|
| date | Wed, 01 Nov 2017 05:54:28 -0400 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| 5:b7652b7c97bd | 6:2277dd59b9f9 |
|---|---|
| 1 #!/usr/bin/env perl | |
| 2 # | |
| 3 # addss.pl | |
| 4 # Add PSIPRED secondary structure prediction (and DSSP annotation) to an MSA or HMMER file. | |
| 5 # Output format is A3M (for input alignments) or HMMER (see User Guide). | |
| 6 | |
| 7 | |
| 8 # HHsuite version 2.0.16 (January 2013) | |
| 9 # | |
| 10 # Reference: | |
| 11 # Remmert M., Biegert A., Hauser A., and Soding J. | |
| 12 # HHblits: Lightning-fast iterative protein sequence searching by HMM-HMM alignment. | |
| 13 # Nat. Methods, epub Dec 25, doi: 10.1038/NMETH.1818 (2011). | |
| 14 | |
| 15 # (C) Johannes Soeding and Michael Remmert, 2012 | |
| 16 | |
| 17 # This program is free software: you can redistribute it and/or modify | |
| 18 # it under the terms of the GNU General Public License as published by | |
| 19 # the Free Software Foundation, either version 3 of the License, or | |
| 20 # (at your option) any later version. | |
| 21 | |
| 22 # This program is distributed in the hope that it will be useful, | |
| 23 # but WITHOUT ANY WARRANTY; without even the implied warranty of | |
| 24 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
| 25 # GNU General Public License for more details. | |
| 26 | |
| 27 # You should have received a copy of the GNU General Public License | |
| 28 # along with this program. If not, see <http://www.gnu.org/licenses/>. | |
| 29 | |
| 30 # We are very grateful for bug reports! Please contact us at soeding@genzentrum.lmu.de | |
| 31 | |
| 32 use lib $ENV{"HHLIB"}."/scripts"; | |
| 33 use HHPaths; # config file with path variables for nr, blast, psipred, pdb, dssp etc. | |
| 34 use Align; # Needleman-Wunsch and Smith-Waterman alignment functions | |
| 35 use File::Temp qw/ tempfile tempdir /; | |
| 36 use strict; | |
| 37 | |
| 38 my $ss_cit="PSIPRED: Jones DT. (1999) Protein secondary structure prediction based on position-specific scoring matrices. JMB 292:195-202."; | |
| 39 | |
| 40 | |
| 41 # Module needed for aligning DSSP-sequence | |
| 42 | |
| 43 $|= 1; # Activate autoflushing on STDOUT | |
| 44 | |
| 45 # Default values: | |
| 46 our $v=2; # verbose mode | |
| 47 | |
| 48 my $numres=0; # number of residues per line for secondary structure | |
| 49 my $informat="a3m"; # input format | |
| 50 my $neff = 7; # use alignment with this diversity for PSIPRED prediction | |
| 51 my $program=$0; # name of perl script | |
| 52 my $pdbfile; | |
| 53 | |
| 54 my $help=" | |
| 55 addss.pl from HHsuite $VERSION | |
| 56 Add PSIPRED secondary structure prediction (and DSSP annotation) to a multiple sequence alignment (MSA) | |
| 57 or HMMER (multi-)model file. | |
| 58 | |
| 59 If the input file is an MSA, the predicted secondary structure and confidence values are added as | |
| 60 special annotation sequences with names >ss_pred, >ss_conf, and >ss_dssp to the top of the output | |
| 61 A3M alignment. If no output file is given, the output file will have the same name as the input file, | |
| 62 except for the extension being replaced by '.a3m'. Allowed input formats are A3M (default), | |
| 63 A2M/FASTA (-fas, -a2m), CLUSTAL (-clu), STOCKHOLM (-sto), HMMER (-hmm). | |
| 64 | |
| 65 If the input file contains HMMER models, records SSPRD and SSCON containing predicted secondary | |
| 66 structure and confidence values are added to each model. In this case the output file name is | |
| 67 obligatory and must be different from the input file name. | |
| 68 | |
| 69 Usage: perl addss.pl <ali_file> [<outfile>] [-fas|-a3m|-clu|-sto] | |
| 70 or perl addss.pl <hhm_file> <outfile> -hmm | |
| 71 \n"; | |
| 72 | |
| 73 # Variable declarations | |
| 74 my $line; | |
| 75 my @seqs; # sequences from infile (except >aa_ and >ss_pred sequences) | |
| 76 my $query_length; | |
| 77 my $header; # header of MSA: everything before first '>' | |
| 78 my $name; # query in fasta format: '>$name [^\n]*\n$qseq\n' | |
| 79 my $qseq; # residues of query sequence | |
| 80 my $infile; | |
| 81 my $outfile; | |
| 82 my $ss_pred=""; # psipred ss states | |
| 83 my $ss_conf=""; # psipred confidence values | |
| 84 my $ss_dssp; # dssp states as string | |
| 85 my $sa_dssp; # relative solvent accessibility from dssp as string {A,B,C,D,E} A:absolutely buried, B:buried, E:exposed | |
| 86 my $aa_dssp; # residues from dssp file as string | |
| 87 my $aa_astr; # residues from infile as string | |
| 88 my $q_match; # number of match states in query sequence | |
| 89 my $xseq; # sequence x returned from Align.pm | |
| 90 my $yseq; # sequence y returned from Align.pm | |
| 91 my $Sstr; # match sequence returned from Align.pm | |
| 92 | |
| 93 ############################################################################################### | |
| 94 # Processing command line input | |
| 95 ############################################################################################### | |
| 96 | |
| 97 if (@ARGV<1) {die ($help);} | |
| 98 | |
| 99 my $options=""; | |
| 100 for (my $i=0; $i<@ARGV; $i++) {$options.=" $ARGV[$i] ";} | |
| 101 | |
| 102 #Input format fasta? | |
| 103 if ($options=~s/ -fas\s/ /g) {$informat="fas";} | |
| 104 elsif ($options=~s/ -a2m\s/ /g) {$informat="a2m";} | |
| 105 elsif ($options=~s/ -a3m\s/ /g) {$informat="a3m";} | |
| 106 elsif ($options=~s/ -clu\s/ /g) {$informat="clu";} | |
| 107 elsif ($options=~s/ -sto\s/ /g) {$informat="sto";} | |
| 108 elsif ($options=~s/ -hmm\s/ /g) {$informat="hmm";} | |
| 109 | |
| 110 if ($options=~s/ -v\s+(\d+) / /g) {$v=$1;} | |
| 111 | |
| 112 # Set input and output file | |
| 113 if ($options=~s/ -i\s+(\S+) //) {$infile=$1;} | |
| 114 if ($options=~s/ -o\s+(\S+) //) {$outfile=$1;} | |
| 115 if ($options=~s/^\s*([^-]\S*) //) {$infile=$1;} | |
| 116 if ($options=~s/^\s*([^-]\S*) //) {$outfile=$1;} | |
| 117 | |
| 118 # Warn if unknown options found or no infile/outfile | |
| 119 if ($options!~/^\s*$/) {$options=~s/^\s*(.*?)\s*$/$1/g; die("Error: unknown options '$options'\n");} | |
| 120 if (!$infile) {print($help); exit(1);} | |
| 121 | |
| 122 my $v2 = $v-1; | |
| 123 if ($v2>2) {$v2--;} | |
| 124 if ($v2<0) {$v2=0;} | |
| 125 | |
| 126 if ($informat eq "hmm" && !$outfile) { | |
| 127 print("Error: no output file given. With the -hmm option an output file is obligatory\n"); exit(1); | |
| 128 } | |
| 129 | |
| 130 ############################################################################################### | |
| 131 # Reformat input alignment to a3m and psiblast-readable format and generate file with query sequence | |
| 132 ############################################################################################### | |
| 133 | |
| 134 my $inbase; # $inbasename of infile: remove extension | |
| 135 my $inroot; # $inbasename of infile: remove path and extension | |
| 136 if ($infile=~/(.*)\..*/) {$inbase=$1;} else {$inbase=$infile;} # remove extension | |
| 137 if ($inbase=~/.*\/(.*)/) {$inroot=$1;} else {$inroot=$inbase;} # remove path | |
| 138 | |
| 139 # Create tmpfile | |
| 140 my $tmpdir; | |
| 141 if ($v<=3) {$tmpdir = tempdir( CLEANUP => 1);} else {$tmpdir = tempdir( CLEANUP => 0);} | |
| 142 my ($tmpf, $tmpfile) = tempfile( DIR => $tmpdir ); | |
| 143 my $tmpfile_no_dir; | |
| 144 if ($tmpfile=~/.*\/(.*)/) {$tmpfile_no_dir=$1;} else {$tmpfile_no_dir=$tmpfile;} # remove path | |
| 145 | |
| 146 | |
| 147 | |
| 148 ############################################################################################ | |
| 149 | |
| 150 if ($informat ne "hmm") { | |
| 151 if (!$outfile) {$outfile="$inbase.a3m";} | |
| 152 | |
| 153 # Use first sequence to define match states and reformat input file to a3m and psi | |
| 154 if ($informat ne "a3m") { | |
| 155 &HHPaths::System("$hhscripts/reformat.pl -v $v2 -M first $informat a3m $infile $tmpfile.in.a3m"); | |
| 156 } else { | |
| 157 &HHPaths::System("cp $infile $tmpfile.in.a3m"); | |
| 158 } | |
| 159 | |
| 160 # Read query sequence | |
| 161 open (INFILE, "<$tmpfile.in.a3m") or die ("ERROR: cannot open $tmpfile.in.a3m!\n"); | |
| 162 $/=">"; # set input field separator | |
| 163 my $i=0; | |
| 164 $qseq=""; | |
| 165 $header = <INFILE>; | |
| 166 $header =~s />$//; | |
| 167 while ($line=<INFILE>) { | |
| 168 $line=~s/>$//; | |
| 169 if ($line=~/^ss_/ || $line=~/^aa_/) {next;} | |
| 170 $seqs[$i++]=">$line"; | |
| 171 if(!$qseq) { | |
| 172 $line=~s/^(.*)[^\n]*//; | |
| 173 $name=$1; | |
| 174 $qseq=$line; | |
| 175 $qseq=~s/\n//g; | |
| 176 } | |
| 177 } | |
| 178 close(INFILE); | |
| 179 $/="\n"; # set input field separator | |
| 180 | |
| 181 if ($qseq =~ /\-/) { | |
| 182 | |
| 183 # First sequence contains gaps => calculate consensus sequence | |
| 184 &HHPaths::System("hhconsensus -i $tmpfile.in.a3m -s $tmpfile.sq -o $tmpfile.in.a3m > /dev/null"); | |
| 185 | |
| 186 } else { | |
| 187 | |
| 188 $query_length = ($qseq=~tr/A-Z/A-Z/); | |
| 189 $qseq=~tr/A-Z//cd; # remove everything except capital letters | |
| 190 | |
| 191 # Write query sequence file in FASTA format | |
| 192 open (QFILE, ">$tmpfile.sq") or die("ERROR: can't open $tmpfile.sq: $!\n"); | |
| 193 printf(QFILE ">%s\n%s\n",$name,$qseq); | |
| 194 close (QFILE); | |
| 195 } | |
| 196 | |
| 197 # Filter alignment to diversity $neff | |
| 198 if ($v>=1) {printf ("Filtering alignment to diversity $neff ...\n");} | |
| 199 &HHPaths::System("hhfilter -v $v2 -neff $neff -i $tmpfile.in.a3m -o $tmpfile.in.a3m"); | |
| 200 | |
| 201 # Reformat into PSI-BLAST readable file for jumpstarting | |
| 202 &HHPaths::System("$hhscripts/reformat.pl -v $v2 -r -noss a3m psi $tmpfile.in.a3m $tmpfile.in.psi"); | |
| 203 | |
| 204 open (ALIFILE, ">$outfile") || die("ERROR: cannot open $outfile: $!\n"); | |
| 205 printf (ALIFILE "%s",$header); | |
| 206 | |
| 207 # Add DSSP sequence (if available) | |
| 208 if ($dssp ne "") { | |
| 209 if (!&AppendDsspSequences("$tmpfile.sq")) { | |
| 210 if ($numres) { | |
| 211 $ss_dssp=~s/(\S{$numres})/$1\n/g; # insert a line break every $numres residues | |
| 212 } | |
| 213 printf (ALIFILE ">ss_dssp\n%s\n",$ss_dssp); | |
| 214 if ($v>=1) {print("\nAdding DSSP state sequence ...\n");} | |
| 215 } | |
| 216 } | |
| 217 | |
| 218 # Secondary structure prediction with psipred | |
| 219 if ($v>=2) {print("Predicting secondary structure with PSIPRED ... ");} | |
| 220 &RunPsipred("$tmpfile.sq"); | |
| 221 | |
| 222 if (open (PSIPREDFILE, "<$tmpfile.horiz")) { | |
| 223 $ss_conf=""; | |
| 224 $ss_pred=""; | |
| 225 # Read Psipred file | |
| 226 while ($line=<PSIPREDFILE>) { | |
| 227 if ($line=~/^Conf:\s+(\S+)/) {$ss_conf.=$1;} | |
| 228 elsif ($line=~/^Pred:\s+(\S+)/) {$ss_pred.=$1;} | |
| 229 } | |
| 230 close(PSIPREDFILE); | |
| 231 $ss_conf=~tr/0-9/0/c; # replace all non-numerical symbols with a 0 | |
| 232 if ($numres) { | |
| 233 $ss_pred=~s/(\S{$numres})/$1\n/g; # insert a line break every $numres residues | |
| 234 $ss_conf=~s/(\S{$numres})/$1\n/g; # insert a line break every $numres residues | |
| 235 } | |
| 236 printf(ALIFILE ">ss_pred PSIPRED predicted secondary structure\n%s\n",$ss_pred); | |
| 237 printf(ALIFILE ">ss_conf PSIPRED confidence values\n%s\n",$ss_conf); | |
| 238 } | |
| 239 | |
| 240 # Append alignment sequences to psipred sequences | |
| 241 for ($i=0; $i<@seqs; $i++) { | |
| 242 printf(ALIFILE "%s",$seqs[$i]); | |
| 243 } | |
| 244 close(ALIFILE); | |
| 245 if ($v>=2) {print("done \n");} | |
| 246 } | |
| 247 ############################################################## | |
| 248 # HMMER format | |
| 249 else | |
| 250 { | |
| 251 if (!$outfile) {$outfile="$inbase.hmm";} | |
| 252 | |
| 253 my $log2 = log(2); | |
| 254 my @logoddsmat; | |
| 255 my @lines; | |
| 256 my $length; | |
| 257 my $query; | |
| 258 my $scale=0.13; # empirically determined scale factor between HMMER bit score and PSI-BLAST score, 0.3 for HMMER3 | |
| 259 my $acc; | |
| 260 my $name; | |
| 261 my $desc; | |
| 262 my $nmodels=0; | |
| 263 | |
| 264 open (INFILE, "<$infile") || die("ERROR: cannot open $infile: $!\n"); | |
| 265 open (OUTFILE, ">$outfile") || die("ERROR: cannot open $outfile: $!\n"); | |
| 266 | |
| 267 # Read HMMER file model by model | |
| 268 while ($line=<INFILE>) { | |
| 269 # Search for start of next model | |
| 270 while ($line && $line!~/^HMMER/ && $line!~/^NAME /) { | |
| 271 $line=<INFILE>; | |
| 272 } | |
| 273 if ($line=~/^HMMER3/) { | |
| 274 | |
| 275 $scale = 0.3; | |
| 276 @logoddsmat=(); | |
| 277 @lines=($line); | |
| 278 | |
| 279 while ($line=<INFILE>) {push(@lines,$line); if ($line=~/^LENG/) {last;}} | |
| 280 $line=~/^LENG\s+(\d+)/; | |
| 281 $length=$1; # number of match states in HMM | |
| 282 $query=""; # query residues from NULL emission lines | |
| 283 while ($line=<INFILE>) {push(@lines,$line); if ($line=~/^\s*m->m/) {last;}} | |
| 284 push(@lines,$line=<INFILE>); | |
| 285 if ($line !~ /^\s*COMPO/) { | |
| 286 die("Error: need null-model probablities (Parameter COMPO)!\n"); | |
| 287 } | |
| 288 $line=~s/^\s*COMPO\s+(\S.*\S)\s*$/$1/; | |
| 289 my @nullmodel = split(/\s+/,$line); | |
| 290 @nullmodel = map {$_ = exp(-1 * $_)} @nullmodel; # Transform to probabilities | |
| 291 push(@lines,$line=<INFILE>); # state 0 insert emission | |
| 292 push(@lines,$line=<INFILE>); # transisitions from begin state | |
| 293 | |
| 294 while ($line=<INFILE>) { | |
| 295 push(@lines,$line); | |
| 296 if ($line=~/^\/\//) {last;} | |
| 297 $line=~s/^\s*\d+\s+(\S.*\S)\s+\d+\s+(\S)\s+\S\s*$/$1/; | |
| 298 $query .= $2; | |
| 299 my @probs = split(/\s+/,$line); | |
| 300 @probs = map {$_ = exp(-1 * $_)} @probs; # Transform to probabilities | |
| 301 # calculate log-odds | |
| 302 my @logodds = (); | |
| 303 for (my $a = 0; $a < scalar(@probs); $a++) { | |
| 304 my $logodd = (log($probs[$a] / $nullmodel[$a]) / $log2) * 1000; | |
| 305 push(@logodds, $logodd); | |
| 306 } | |
| 307 | |
| 308 push(@logoddsmat,\@logodds); | |
| 309 push(@lines,$line=<INFILE>); | |
| 310 push(@lines,$line=<INFILE>); | |
| 311 } | |
| 312 | |
| 313 } else { | |
| 314 | |
| 315 $scale=0.13; | |
| 316 if ($line!~/^HMMER/ && $line!~/^NAME /) {last;} # first line in each model must begin with 'HMMER...' | |
| 317 @logoddsmat=(); | |
| 318 @lines=($line); | |
| 319 | |
| 320 while ($line=<INFILE>) {push(@lines,$line); if ($line=~/^LENG/) {last;}} | |
| 321 $line=~/^LENG\s+(\d+)/; | |
| 322 $length=$1; # number of match states in HMM | |
| 323 $query=""; # query residues from NULL emission lines | |
| 324 while ($line=<INFILE>) {push(@lines,$line); if ($line=~/^\s*m->m/) {last;}} | |
| 325 push(@lines,$line=<INFILE>); | |
| 326 while ($line=<INFILE>) { | |
| 327 push(@lines,$line); | |
| 328 if ($line=~/^\/\//) {last;} | |
| 329 $line=~s/^\s*\d+\s+(\S.*\S)\s*$/$1/; | |
| 330 my @logodds = split(/\s+/,$line); | |
| 331 push(@logoddsmat,\@logodds); | |
| 332 push(@lines,$line=<INFILE>); | |
| 333 $line=~/^\s*(\S)/; | |
| 334 $query .= $1; | |
| 335 push(@lines,$line=<INFILE>); | |
| 336 } | |
| 337 } | |
| 338 | |
| 339 # Write mtx matrix | |
| 340 open (MTXFILE, ">$tmpfile.mtx") || die("ERROR: cannot open $tmpfile.mtx: $!\n"); | |
| 341 printf(MTXFILE "%i\n",$length); | |
| 342 printf(MTXFILE "%s\n",$query); | |
| 343 printf(MTXFILE "2.670000e-03\n4.100000e-02\n-3.194183e+00\n1.400000e-01\n2.670000e-03\n4.420198e-02\n-3.118986e+00\n1.400000e-01\n3.176060e-03\n1.339561e-01\n-2.010243e+00\n4.012145e-01\n"); | |
| 344 while (@logoddsmat) { | |
| 345 my @logodds = @{shift(@logoddsmat)}; | |
| 346 print(MTXFILE "-32768 "); | |
| 347 splice(@logodds, 1,0,-32768/$scale); # insert logodds value for B | |
| 348 splice(@logodds,20,0, -100/$scale); # insert logodds value for X | |
| 349 splice(@logodds,22,0,-32768/$scale); # insert logodds value for Z | |
| 350 for (my $i=0; $i<23; $i++) { | |
| 351 printf(MTXFILE "%4.0f ",$scale*$logodds[$i]); | |
| 352 } | |
| 353 print(MTXFILE "-32768 -400\n"); | |
| 354 } | |
| 355 close(MTXFILE); | |
| 356 | |
| 357 # Start Psiblast from checkpoint file tmp.chk that was generated to build the profile | |
| 358 if (-e "$datadir/weights.dat4") { # Psipred version < 3.0 | |
| 359 &HHPaths::System("$execdir/psipred $tmpfile.mtx $datadir/weights.dat $datadir/weights.dat2 $datadir/weights.dat3 $datadir/weights.dat4 > $tmpfile.ss"); | |
| 360 } else { | |
| 361 &HHPaths::System("$execdir/psipred $tmpfile.mtx $datadir/weights.dat $datadir/weights.dat2 $datadir/weights.dat3 > $tmpfile.ss"); | |
| 362 } | |
| 363 | |
| 364 # READ PSIPRED file | |
| 365 if (open (PSIPRED, "$execdir/psipass2 $datadir/weights_p2.dat 1 0.98 1.09 $tmpfile.ss2 $tmpfile.ss |")) { | |
| 366 $ss_conf=""; | |
| 367 $ss_pred=""; | |
| 368 # Read Psipred file | |
| 369 while ($line=<PSIPRED>) { | |
| 370 if ($line=~/^Conf:\s+(\d+)/) {$ss_conf.=$1;} | |
| 371 elsif ($line=~/^Pred:\s+(\S+)/) {$ss_pred.=$1;} | |
| 372 } | |
| 373 close(PSIPREDFILE); | |
| 374 } | |
| 375 | |
| 376 # Add secondary structure to HMMER output file and print | |
| 377 foreach $line (@lines) { | |
| 378 if ($line=~/^SSPRD/ || $line=~/^SSCON/|| $line=~/^SSCIT/) {next;} | |
| 379 if ($line=~/^HMM /) { | |
| 380 $ss_pred=~s/(\S{80})/$1\nSSPRD /g; # insert a line break every 80 residues | |
| 381 $ss_conf=~s/(\S{80})/$1\nSSCON /g; # insert a line break every 80 residues | |
| 382 printf(OUTFILE "SSCIT HHsearch-readable PSIPRED secondary structure prediction:\n"); | |
| 383 printf(OUTFILE "SSPRD %s\n",$ss_pred); | |
| 384 printf(OUTFILE "SSCON %s\n",$ss_conf); | |
| 385 printf(OUTFILE "SSCIT %s\n",$ss_cit); | |
| 386 } | |
| 387 printf(OUTFILE $line); | |
| 388 } | |
| 389 $nmodels++; | |
| 390 } | |
| 391 | |
| 392 close(OUTFILE); | |
| 393 close(INFILE); | |
| 394 &HHPaths::System("rm $tmpfile.mtx $tmpfile.ss $tmpfile.ss2"); | |
| 395 if ($v>=2) {printf("Added PSIPRED secondary structure to %i models\n",$nmodels);} | |
| 396 } | |
| 397 | |
| 398 if ($v<=3) { | |
| 399 unlink("$tmpfile.in.a3m"); | |
| 400 unlink("$tmpfile.in.psi"); | |
| 401 unlink("$tmpfile.horiz"); | |
| 402 unlink("$tmpfile.dssp"); | |
| 403 } | |
| 404 | |
| 405 exit; | |
| 406 | |
| 407 ############################################################################################## | |
| 408 # Run SS prediction starting from alignment in $tmpfile.in.psi (called by BuildAlignment) | |
| 409 ############################################################################################## | |
| 410 sub RunPsipred() { | |
| 411 # This is a simple script which will carry out all of the basic steps | |
| 412 # required to make a PSIPRED V2 prediction. Note that it assumes that the | |
| 413 # following programs are in the appropriate directories: | |
| 414 # blastpgp - PSIBLAST executable (from NCBI toolkit) | |
| 415 # makemat - IMPALA utility (from NCBI toolkit) | |
| 416 # psipred - PSIPRED V2 program | |
| 417 # psipass2 - PSIPRED V2 program | |
| 418 | |
| 419 my $infile=$_[0]; | |
| 420 my $basename; #file name without extension | |
| 421 my $rootname; #basename without directory path | |
| 422 if ($infile =~/^(.*)\..*?$/) {$basename=$1;} else {$basename=$infile;} | |
| 423 if ($basename=~/^.*\/(.*?)$/) {$rootname=$1;} else {$rootname=$basename;} | |
| 424 | |
| 425 # Does dummy database exist? | |
| 426 if (!-e "$dummydb.phr") { | |
| 427 if (!-e "$dummydb") {die "Error in addss.pl: Could not find $dummydb\n";} | |
| 428 | |
| 429 &HHPaths::System("cp $infile $dummydb"); | |
| 430 &HHPaths::System("$ncbidir/formatdb -i $dummydb"); | |
| 431 if (!-e "$dummydb.phr") {die "Error in addss.pl: Could not find nor create index files for $dummydb\n";} | |
| 432 } | |
| 433 | |
| 434 # Start Psiblast from checkpoint file tmp.chk that was generated to build the profile | |
| 435 &HHPaths::System("$ncbidir/blastpgp -b 1 -j 1 -h 0.001 -d $dummydb -i $infile -B $tmpfile.in.psi -C $tmpfile.chk 1> $tmpfile.blalog 2> $tmpfile.blalog"); | |
| 436 | |
| 437 #print("Predicting secondary structure...\n"); | |
| 438 | |
| 439 &HHPaths::System("echo "."$tmpfile_no_dir".".chk > $tmpfile.pn\n"); | |
| 440 &HHPaths::System("echo "."$tmpfile_no_dir".".sq > $tmpfile.sn\n"); | |
| 441 &HHPaths::System("$ncbidir/makemat -P $tmpfile"); | |
| 442 | |
| 443 # Start Psiblast from checkpoint file tmp.chk that was generated to build the profile | |
| 444 if (-e "$datadir/weights.dat4") { # Psipred version < 3.0 | |
| 445 &HHPaths::System("$execdir/psipred $tmpfile.mtx $datadir/weights.dat $datadir/weights.dat2 $datadir/weights.dat3 $datadir/weights.dat4 > $tmpfile.ss"); | |
| 446 } else { | |
| 447 &HHPaths::System("$execdir/psipred $tmpfile.mtx $datadir/weights.dat $datadir/weights.dat2 $datadir/weights.dat3 > $tmpfile.ss"); | |
| 448 } | |
| 449 | |
| 450 &HHPaths::System("$execdir/psipass2 $datadir/weights_p2.dat 1 0.98 1.09 $tmpfile.ss2 $tmpfile.ss > $tmpfile.horiz"); | |
| 451 | |
| 452 # Remove temporary files | |
| 453 if ($v<=3) { unlink(split ' ', "$tmpfile.pn $tmpfile.sn $tmpfile.mn $tmpfile.chk $tmpfile.blalog $tmpfile.mtx $tmpfile.aux $tmpfile.ss $tmpfile.ss2 $tmpfile.sq");} | |
| 454 return; | |
| 455 } | |
| 456 | |
| 457 ############################################################################################## | |
| 458 # Read query sequence and extract dssp sequence | |
| 459 ############################################################################################## | |
| 460 sub AppendDsspSequences() { | |
| 461 my $qfile=$_[0]; | |
| 462 | |
| 463 my $line; #input line | |
| 464 my $name; #name of sequence in in file, e.g. d1g8ma1 | |
| 465 my $qrange; #chain and residue range of query sequence | |
| 466 my $aas=""; #amino acids from in file for each $name | |
| 467 | |
| 468 my $dsspfile; | |
| 469 my $pdbfile; | |
| 470 my $pdbcode; #pdb code for accessing dssp file; shortened from in code, e.g. 1g8m | |
| 471 my @ss_dssp=(); #dssp states for residues (H,E,L) | |
| 472 my @sa_dssp=(); #dssp states for residues (H,E,L) | |
| 473 my @aa_dssp=(); #residues in dssp file | |
| 474 my @aa_astr=(); #residues from infile | |
| 475 my $length; #length of sequence | |
| 476 | |
| 477 # Default parameters for Align.pm | |
| 478 our $d=3; # gap opening penatlty for Align.pm | |
| 479 our $e=0.1; # gap extension penatlty for Align.pm | |
| 480 our $g=0.09; # endgap penatlty for Align.pm | |
| 481 our $matrix="identity"; | |
| 482 | |
| 483 # Read query sequence -> $name, $nameline, $range, $aas | |
| 484 open (QFILE, "<$qfile") || die ("cannot open $qfile: $!"); | |
| 485 while ($line=<QFILE>) { | |
| 486 if ($line=~/>(\S+)/) { | |
| 487 $name=$1; | |
| 488 | |
| 489 # SCOP ID? (d3lkfa_,d3grs_3,d3pmga1,g1m26.1) | |
| 490 if ($line=~/^>[defgh](\d[a-z0-9]{3})[a-z0-9_.][a-z0-9_]\s+[a-z]\.\d+\.\d+\.\d+\s+\((\S+)\)/) { | |
| 491 $pdbcode=$1; | |
| 492 $qrange=$2; | |
| 493 } | |
| 494 | |
| 495 # PDB ID? (8fab_A, 1a0i) | |
| 496 elsif ($line=~/^>(\d[a-z0-9]{3})_?(\S?)\s/) { | |
| 497 $pdbcode=$1; | |
| 498 if ($2 ne "") {$qrange="$2:";} else {$qrange="-";} | |
| 499 } | |
| 500 | |
| 501 # DALI ID? (8fabA_0,1a0i_2) | |
| 502 elsif ($line=~/^>(\d[a-z0-9]{3})[A-Za-z0-9]?_\d+\s+\d+\.\d+.\d+.\d+.\d+.\d+\s+\((\S+)\)/) { | |
| 503 $pdbcode=$1; | |
| 504 $qrange=$2; | |
| 505 } | |
| 506 | |
| 507 else { | |
| 508 if ($v>=3) {print("Warning: no pdb code found in sequence name '$name'\n");} | |
| 509 close(QFILE); | |
| 510 return 1; # no astral/DALI/pdb sequence => no dssp states available | |
| 511 } | |
| 512 $aas=""; | |
| 513 | |
| 514 } | |
| 515 else | |
| 516 { | |
| 517 chomp($line); | |
| 518 $line=~tr/a-z \t/A-Z/d; | |
| 519 $aas.=$line; | |
| 520 } | |
| 521 } | |
| 522 close(QFILE); | |
| 523 if ($v>=3) {printf("Searching DSSP state assignments: name=%s range=%s\n",$name,$qrange);} | |
| 524 | |
| 525 # Try to open dssp file | |
| 526 $dsspfile="$dsspdir/$pdbcode.dssp"; | |
| 527 if (! open (DSSPFILE, "<$dsspfile")) { | |
| 528 if ($v>=3) {printf(STDERR "Warning in $program: Cannot open $dsspfile!\n");} | |
| 529 $pdbfile = &OpenPDBfile($pdbcode); | |
| 530 if ($pdbfile eq "") {return;} | |
| 531 | |
| 532 system("$dssp $pdbfile $tmpfile.dssp 2> /dev/null"); | |
| 533 system("cp $tmpfile.dssp $dsspfile 2> /dev/null"); | |
| 534 $dsspfile="$tmpfile.dssp"; | |
| 535 if (! open (DSSPFILE, "<$dsspfile")) { | |
| 536 if ($v>=3) {printf(STDERR "Warning in $program: dssp couldn't generate file from $pdbfile. Skipping $name\n");} | |
| 537 return 1; | |
| 538 } | |
| 539 } | |
| 540 | |
| 541 #....+....1....+....2....+....3....+....4 | |
| 542 # # RESIDUE AA STRUCTURE BP1 BP2 ACC etc. | |
| 543 # 623 630 A R < 0 0 280 etc. | |
| 544 # 624 !* 0 0 0 etc. | |
| 545 # 625 8 B A 0 0 105 etc. | |
| 546 # 626 9 B P >> - 0 0 71 etc. | |
| 547 # 292 28SA K H 4 S+ 0 0 71 etc. (1qdm.dssp) | |
| 548 # 293 29SA K H > S+ 0 0 28 etc. | |
| 549 | |
| 550 # Read in whole DSSP file | |
| 551 for (my $try = 1; $try<=2; $try++) { | |
| 552 $aa_dssp=""; | |
| 553 $sa_dssp=""; | |
| 554 $ss_dssp=""; | |
| 555 while ($line=<DSSPFILE>) {if ($line=~/^\s*\#\s*RESIDUE\s+AA/) {last;}} | |
| 556 while ($line=<DSSPFILE>) | |
| 557 { | |
| 558 if ($line=~/^.{5}(.{5})(.)(.)\s(.).\s(.).{18}(...)/) | |
| 559 { | |
| 560 my $thisres=$1; | |
| 561 my $icode=$2; | |
| 562 my $chain=$3; | |
| 563 my $aa=$4; | |
| 564 my $ss=$5; | |
| 565 my $sa=$6; | |
| 566 my $contained=0; | |
| 567 my $range=$qrange; | |
| 568 if ($aa eq "!") {next;} # missing residues! | |
| 569 $thisres=~tr/ //d; | |
| 570 $chain=~tr/ //d; | |
| 571 $icode=~tr/ //d; | |
| 572 $sa=~tr/ //d; | |
| 573 if ($try==1) { | |
| 574 do{ | |
| 575 if ($range=~s/^(\S):(-?\d+)[A-Z]-(\d+)([A-Z])// && $chain eq $1 && $icode eq $4 && $2<=$thisres && $thisres<=$3) { | |
| 576 $contained=1; #syntax (A:56S-135S) | |
| 577 } | |
| 578 elsif ($range=~s/^(\S):(-?\d+)[A-Z]?-(\d+)[A-Z]?// && $chain eq $1 && $2<=$thisres && $thisres<=$3) { | |
| 579 $contained=1; #syntax (R:56-135) | |
| 580 } | |
| 581 elsif ($range=~s/^(-?\d+)[A-Z]-(\d+)([A-Z])// && $chain eq "" && $icode eq $3 && $1<=$thisres && $thisres<=$2) { | |
| 582 $contained=1; #syntax (56-135) | |
| 583 } | |
| 584 elsif ($range=~s/^(-?\d+)[A-Z]?-(\d+)[A-Z]?// && $chain eq "" && $1<=$thisres && $thisres<=$2) { | |
| 585 $contained=1; #syntax (56-135) | |
| 586 } | |
| 587 elsif ($range=~s/^(\S):// && $chain eq $1) { | |
| 588 $contained=1; #syntax (A:) or (A:,2:) | |
| 589 } | |
| 590 elsif ($range=~s/^-$// && $chain eq "") { | |
| 591 $contained=1; #syntax (-) | |
| 592 } | |
| 593 $range=~s/^,//; | |
| 594 # print("qrange=$qrange range='$range' ires=$thisres chain=$chain contained=$contained\n"); | |
| 595 } while($contained==0 && $range ne ""); | |
| 596 if ($contained==0) {next;} | |
| 597 } # end if try==1 | |
| 598 $aa_dssp.=$aa; | |
| 599 $ss_dssp.=$ss; | |
| 600 $sa_dssp.=&sa2c($sa,$aa); | |
| 601 } | |
| 602 } | |
| 603 # if not enough residues were found: chain id is wrong => repeat extraction without checking chain id | |
| 604 if (length($aa_dssp)>=10) {last;} | |
| 605 close(DSSPFILE); | |
| 606 open (DSSPFILE, "<$dsspfile"); | |
| 607 } | |
| 608 close(DSSPFILE); | |
| 609 | |
| 610 if (length($aa_dssp)==0) {print("WARNING: no residues found in $dsspdir/$pdbcode.dssp\n"); return 1;} | |
| 611 if (length($aa_dssp)<=20) {printf("WARNING: only %i residues found in $dsspdir/$pdbcode.dssp\n",length($aa_dssp)); return 1;} | |
| 612 | |
| 613 # Postprocess $aa_dssp etc | |
| 614 $aa_dssp =~ tr/a-z/CCCCCCCCCCCCCCCCCCCCCCCCCC/; | |
| 615 $ss_dssp =~ tr/ I/CC/; | |
| 616 $ss_dssp =~ s/ \S / /g; | |
| 617 $ss_dssp =~ s/ \S\S / /g; | |
| 618 | |
| 619 # Align query with dssp sequence | |
| 620 $aa_astr = $aas; | |
| 621 $xseq=$aas; | |
| 622 $yseq=$aa_dssp; | |
| 623 my ($imax,$imin,$jmax,$jmin); | |
| 624 my (@i,@j); | |
| 625 my $score=&AlignNW(\$xseq,\$yseq,\@i,\@j,\$imin,\$imax,\$jmin,\$jmax,\$Sstr); | |
| 626 | |
| 627 # Initialize strings (=arrays) for dssp states with "----...-" | |
| 628 my @ss_dssp_ali=(); # $ss_dssp_ali[$i] is dssp state aligned to $aa_astr[$i] | |
| 629 my @sa_dssp_ali=(); # $sa_dssp_ali[$i] is solvent accessibility string | |
| 630 my @aa_dssp_ali=(); # $aa_dssp_ali[$i] is dssp residue aligned to $aa_astr[$i] | |
| 631 for (my $i=0; $i<=length($aa_astr); $i++) { # sum up to len+1 | |
| 632 # because 0'th element in @ss_dssp and @aa_dssp is dummy "-" | |
| 633 $ss_dssp_ali[$i]="-"; | |
| 634 $sa_dssp_ali[$i]="-"; | |
| 635 $aa_dssp_ali[$i]="-"; | |
| 636 } | |
| 637 | |
| 638 # To each residue (from i=0 to len-1) of input sequence $aa_astr assign aligned dssp state | |
| 639 @ss_dssp = split(//,$ss_dssp); | |
| 640 @sa_dssp = split(//,$sa_dssp); | |
| 641 @aa_dssp = split(//,$aa_dssp); | |
| 642 @aa_astr = split(//,$aa_astr); | |
| 643 my $len = 0; | |
| 644 unshift(@aa_dssp,"-"); #add a gap symbol at beginning -> first residue is at 1! | |
| 645 unshift(@ss_dssp,"-"); #add a gap symbol at beginning -> first residue is at 1! | |
| 646 unshift(@sa_dssp,"-"); #add a gap symbol at beginning -> first residue is at 1! | |
| 647 unshift(@aa_astr,"-"); #add a gap symbol at beginning -> first residue is at 1! | |
| 648 for (my $col=0; $col<@i; $col++) { | |
| 649 if ($i[$col]>0) { | |
| 650 if ($j[$col]>0) {$len++;} # count match states (for score/len calculation) | |
| 651 $ss_dssp_ali[$i[$col]]=$ss_dssp[$j[$col]]; | |
| 652 $sa_dssp_ali[$i[$col]]=$sa_dssp[$j[$col]]; | |
| 653 $aa_dssp_ali[$i[$col]]=$aa_dssp[$j[$col]]; | |
| 654 } | |
| 655 if ($v>=4) { | |
| 656 printf ("%s %3i %s %3i\n",$aa_astr[$i[$col]],$i[$col],$aa_dssp[$j[$col]],$j[$col]); | |
| 657 } | |
| 658 } | |
| 659 shift (@ss_dssp_ali); # throw out first "-" | |
| 660 shift (@sa_dssp_ali); # throw out first "-" | |
| 661 shift (@aa_dssp_ali); # throw out first "-" | |
| 662 $aa_dssp=join("",@aa_dssp_ali); | |
| 663 $ss_dssp=join("",@ss_dssp_ali); | |
| 664 $sa_dssp=join("",@sa_dssp_ali); | |
| 665 | |
| 666 # Debugging output | |
| 667 if ($v>=4) {printf(STDOUT "DSSP: %s: length=%-3i score/len:%-5.3f\n",$name,$len,$score/$len);} | |
| 668 if ($v>=4) { | |
| 669 printf("IN: %s\n",$xseq); | |
| 670 printf("MATCH: %s\n",$Sstr); | |
| 671 printf("DSSP: %s\n",$yseq); | |
| 672 printf("\n"); | |
| 673 printf(">ss_dssp $name\n$ss_dssp\n"); | |
| 674 printf(">sa_dssp $name\n$sa_dssp\n"); | |
| 675 printf(">aa_dssp $name\n$aa_dssp\n"); | |
| 676 printf(">aa_astra $name\n$aa_astr\n\n"); | |
| 677 } | |
| 678 if ($score/$len<0.5) { | |
| 679 printf (STDOUT "\nWARNING: in $name: alignment score with dssp residues too low: Score/len=%f.\n\n",$score/$len); | |
| 680 printf("IN: %s\n",$xseq); | |
| 681 printf("MATCH: %s\n",$Sstr); | |
| 682 printf("DSSP: %s\n",$yseq); | |
| 683 return 1; | |
| 684 } | |
| 685 | |
| 686 return 0; | |
| 687 } | |
| 688 | |
| 689 ################################################################################################ | |
| 690 ### Return solvent accessibility code | |
| 691 ################################################################################################ | |
| 692 sub sa2c () | |
| 693 { | |
| 694 my %maxsa = (A=>106, B=>160, C=>135, D=>163, E=>194, F=>197, G=>84, H=>184, I=>169, K=>205, L=>164, M=>188, | |
| 695 N=>157, P=>136, Q=>198, R=>248, S=>130, T=>142, V=>142, W=>227, X=>180, Y=>222, Z=>196); # maximum solvent accessiblity | |
| 696 if ($_[1]=~/[a-z]/) {return "F";} # disulphide bridge | |
| 697 if (!defined $maxsa{$_[1]}) {return "-";} # no amino acid | |
| 698 my $rsa=$_[0]/$maxsa{$_[1]}; | |
| 699 # printf("aa=%s sa=%5.1f max_sa=%5.1f rsa=%5.3f\n",$_[1],$_[0],$maxsa{$_[1]},$rsa); | |
| 700 if ($rsa<=0.02) {return "A";} | |
| 701 elsif ($rsa<=0.14) {return "B";} | |
| 702 elsif ($rsa<=0.33) {return "C";} | |
| 703 elsif ($rsa<=0.55) {return "D";} | |
| 704 else {return "E";} | |
| 705 } | |
| 706 | |
| 707 # Find the pdb file with $pdbcode in pdb directory | |
| 708 sub OpenPDBfile() { | |
| 709 | |
| 710 my $pdbcode=lc($_[0]); | |
| 711 if (! -e "$pdbdir") { | |
| 712 if ($v>=3) {print(STDERR "Warning in $program: pdb directory '$pdbdir' does not exist!\n");} | |
| 713 return 1; | |
| 714 } | |
| 715 if (-e "$pdbdir/all") {$pdbfile="$pdbdir/all/";} | |
| 716 elsif (-e "$pdbdir/divided") {$pdbfile="$pdbdir/divided/".substr($pdbcode,1,2)."/";} | |
| 717 else {$pdbfile="$pdbdir/";} | |
| 718 if ($pdbdir=~/divided.?$/) {$pdbfile.=substr($pdbcode,1,2)."/";} | |
| 719 if (-e $pdbfile."pdb$pdbcode.ent") {$pdbfile.="pdb$pdbcode.ent";} | |
| 720 elsif (-e $pdbfile."pdb$pdbcode.ent.gz") {$pdbfile="gunzip -c $pdbfile"."pdb$pdbcode.ent.gz |";} | |
| 721 elsif (-e $pdbfile."pdb$pdbcode.ent.Z") {$pdbfile="gunzip -c $pdbfile"."pdb$pdbcode.ent.Z |";} | |
| 722 elsif (-e $pdbfile."$pdbcode.pdb") {$pdbfile."$pdbcode.pdb";} | |
| 723 else { | |
| 724 if ($v>=3) {printf(STDERR "Warning in $program: Cannot find pdb file $pdbfile"."pdb$pdbcode.ent!\n");} | |
| 725 return ""; | |
| 726 } | |
| 727 if (!open (PDBFILE, "$pdbfile")) { | |
| 728 if ($v>=3) {printf(STDERR "Error in $program: Cannot open pdb file: $!\n");} | |
| 729 return ""; | |
| 730 } | |
| 731 return $pdbfile; | |
| 732 } |
