comparison external_tools/linux/lib/hh/scripts/addss.pl @ 6:2277dd59b9f9 draft

Uploaded
author hammock
date Wed, 01 Nov 2017 05:54:28 -0400
parents
children
comparison
equal deleted inserted replaced
5:b7652b7c97bd 6:2277dd59b9f9
1 #!/usr/bin/env perl
2 #
3 # addss.pl
4 # Add PSIPRED secondary structure prediction (and DSSP annotation) to an MSA or HMMER file.
5 # Output format is A3M (for input alignments) or HMMER (see User Guide).
6
7
8 # HHsuite version 2.0.16 (January 2013)
9 #
10 # Reference:
11 # Remmert M., Biegert A., Hauser A., and Soding J.
12 # HHblits: Lightning-fast iterative protein sequence searching by HMM-HMM alignment.
13 # Nat. Methods, epub Dec 25, doi: 10.1038/NMETH.1818 (2011).
14
15 # (C) Johannes Soeding and Michael Remmert, 2012
16
17 # This program is free software: you can redistribute it and/or modify
18 # it under the terms of the GNU General Public License as published by
19 # the Free Software Foundation, either version 3 of the License, or
20 # (at your option) any later version.
21
22 # This program is distributed in the hope that it will be useful,
23 # but WITHOUT ANY WARRANTY; without even the implied warranty of
24 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
25 # GNU General Public License for more details.
26
27 # You should have received a copy of the GNU General Public License
28 # along with this program. If not, see <http://www.gnu.org/licenses/>.
29
30 # We are very grateful for bug reports! Please contact us at soeding@genzentrum.lmu.de
31
32 use lib $ENV{"HHLIB"}."/scripts";
33 use HHPaths; # config file with path variables for nr, blast, psipred, pdb, dssp etc.
34 use Align; # Needleman-Wunsch and Smith-Waterman alignment functions
35 use File::Temp qw/ tempfile tempdir /;
36 use strict;
37
38 my $ss_cit="PSIPRED: Jones DT. (1999) Protein secondary structure prediction based on position-specific scoring matrices. JMB 292:195-202.";
39
40
41 # Module needed for aligning DSSP-sequence
42
43 $|= 1; # Activate autoflushing on STDOUT
44
45 # Default values:
46 our $v=2; # verbose mode
47
48 my $numres=0; # number of residues per line for secondary structure
49 my $informat="a3m"; # input format
50 my $neff = 7; # use alignment with this diversity for PSIPRED prediction
51 my $program=$0; # name of perl script
52 my $pdbfile;
53
54 my $help="
55 addss.pl from HHsuite $VERSION
56 Add PSIPRED secondary structure prediction (and DSSP annotation) to a multiple sequence alignment (MSA)
57 or HMMER (multi-)model file.
58
59 If the input file is an MSA, the predicted secondary structure and confidence values are added as
60 special annotation sequences with names >ss_pred, >ss_conf, and >ss_dssp to the top of the output
61 A3M alignment. If no output file is given, the output file will have the same name as the input file,
62 except for the extension being replaced by '.a3m'. Allowed input formats are A3M (default),
63 A2M/FASTA (-fas, -a2m), CLUSTAL (-clu), STOCKHOLM (-sto), HMMER (-hmm).
64
65 If the input file contains HMMER models, records SSPRD and SSCON containing predicted secondary
66 structure and confidence values are added to each model. In this case the output file name is
67 obligatory and must be different from the input file name.
68
69 Usage: perl addss.pl <ali_file> [<outfile>] [-fas|-a3m|-clu|-sto]
70 or perl addss.pl <hhm_file> <outfile> -hmm
71 \n";
72
73 # Variable declarations
74 my $line;
75 my @seqs; # sequences from infile (except >aa_ and >ss_pred sequences)
76 my $query_length;
77 my $header; # header of MSA: everything before first '>'
78 my $name; # query in fasta format: '>$name [^\n]*\n$qseq\n'
79 my $qseq; # residues of query sequence
80 my $infile;
81 my $outfile;
82 my $ss_pred=""; # psipred ss states
83 my $ss_conf=""; # psipred confidence values
84 my $ss_dssp; # dssp states as string
85 my $sa_dssp; # relative solvent accessibility from dssp as string {A,B,C,D,E} A:absolutely buried, B:buried, E:exposed
86 my $aa_dssp; # residues from dssp file as string
87 my $aa_astr; # residues from infile as string
88 my $q_match; # number of match states in query sequence
89 my $xseq; # sequence x returned from Align.pm
90 my $yseq; # sequence y returned from Align.pm
91 my $Sstr; # match sequence returned from Align.pm
92
93 ###############################################################################################
94 # Processing command line input
95 ###############################################################################################
96
97 if (@ARGV<1) {die ($help);}
98
99 my $options="";
100 for (my $i=0; $i<@ARGV; $i++) {$options.=" $ARGV[$i] ";}
101
102 #Input format fasta?
103 if ($options=~s/ -fas\s/ /g) {$informat="fas";}
104 elsif ($options=~s/ -a2m\s/ /g) {$informat="a2m";}
105 elsif ($options=~s/ -a3m\s/ /g) {$informat="a3m";}
106 elsif ($options=~s/ -clu\s/ /g) {$informat="clu";}
107 elsif ($options=~s/ -sto\s/ /g) {$informat="sto";}
108 elsif ($options=~s/ -hmm\s/ /g) {$informat="hmm";}
109
110 if ($options=~s/ -v\s+(\d+) / /g) {$v=$1;}
111
112 # Set input and output file
113 if ($options=~s/ -i\s+(\S+) //) {$infile=$1;}
114 if ($options=~s/ -o\s+(\S+) //) {$outfile=$1;}
115 if ($options=~s/^\s*([^-]\S*) //) {$infile=$1;}
116 if ($options=~s/^\s*([^-]\S*) //) {$outfile=$1;}
117
118 # Warn if unknown options found or no infile/outfile
119 if ($options!~/^\s*$/) {$options=~s/^\s*(.*?)\s*$/$1/g; die("Error: unknown options '$options'\n");}
120 if (!$infile) {print($help); exit(1);}
121
122 my $v2 = $v-1;
123 if ($v2>2) {$v2--;}
124 if ($v2<0) {$v2=0;}
125
126 if ($informat eq "hmm" && !$outfile) {
127 print("Error: no output file given. With the -hmm option an output file is obligatory\n"); exit(1);
128 }
129
130 ###############################################################################################
131 # Reformat input alignment to a3m and psiblast-readable format and generate file with query sequence
132 ###############################################################################################
133
134 my $inbase; # $inbasename of infile: remove extension
135 my $inroot; # $inbasename of infile: remove path and extension
136 if ($infile=~/(.*)\..*/) {$inbase=$1;} else {$inbase=$infile;} # remove extension
137 if ($inbase=~/.*\/(.*)/) {$inroot=$1;} else {$inroot=$inbase;} # remove path
138
139 # Create tmpfile
140 my $tmpdir;
141 if ($v<=3) {$tmpdir = tempdir( CLEANUP => 1);} else {$tmpdir = tempdir( CLEANUP => 0);}
142 my ($tmpf, $tmpfile) = tempfile( DIR => $tmpdir );
143 my $tmpfile_no_dir;
144 if ($tmpfile=~/.*\/(.*)/) {$tmpfile_no_dir=$1;} else {$tmpfile_no_dir=$tmpfile;} # remove path
145
146
147
148 ############################################################################################
149
150 if ($informat ne "hmm") {
151 if (!$outfile) {$outfile="$inbase.a3m";}
152
153 # Use first sequence to define match states and reformat input file to a3m and psi
154 if ($informat ne "a3m") {
155 &HHPaths::System("$hhscripts/reformat.pl -v $v2 -M first $informat a3m $infile $tmpfile.in.a3m");
156 } else {
157 &HHPaths::System("cp $infile $tmpfile.in.a3m");
158 }
159
160 # Read query sequence
161 open (INFILE, "<$tmpfile.in.a3m") or die ("ERROR: cannot open $tmpfile.in.a3m!\n");
162 $/=">"; # set input field separator
163 my $i=0;
164 $qseq="";
165 $header = <INFILE>;
166 $header =~s />$//;
167 while ($line=<INFILE>) {
168 $line=~s/>$//;
169 if ($line=~/^ss_/ || $line=~/^aa_/) {next;}
170 $seqs[$i++]=">$line";
171 if(!$qseq) {
172 $line=~s/^(.*)[^\n]*//;
173 $name=$1;
174 $qseq=$line;
175 $qseq=~s/\n//g;
176 }
177 }
178 close(INFILE);
179 $/="\n"; # set input field separator
180
181 if ($qseq =~ /\-/) {
182
183 # First sequence contains gaps => calculate consensus sequence
184 &HHPaths::System("hhconsensus -i $tmpfile.in.a3m -s $tmpfile.sq -o $tmpfile.in.a3m > /dev/null");
185
186 } else {
187
188 $query_length = ($qseq=~tr/A-Z/A-Z/);
189 $qseq=~tr/A-Z//cd; # remove everything except capital letters
190
191 # Write query sequence file in FASTA format
192 open (QFILE, ">$tmpfile.sq") or die("ERROR: can't open $tmpfile.sq: $!\n");
193 printf(QFILE ">%s\n%s\n",$name,$qseq);
194 close (QFILE);
195 }
196
197 # Filter alignment to diversity $neff
198 if ($v>=1) {printf ("Filtering alignment to diversity $neff ...\n");}
199 &HHPaths::System("hhfilter -v $v2 -neff $neff -i $tmpfile.in.a3m -o $tmpfile.in.a3m");
200
201 # Reformat into PSI-BLAST readable file for jumpstarting
202 &HHPaths::System("$hhscripts/reformat.pl -v $v2 -r -noss a3m psi $tmpfile.in.a3m $tmpfile.in.psi");
203
204 open (ALIFILE, ">$outfile") || die("ERROR: cannot open $outfile: $!\n");
205 printf (ALIFILE "%s",$header);
206
207 # Add DSSP sequence (if available)
208 if ($dssp ne "") {
209 if (!&AppendDsspSequences("$tmpfile.sq")) {
210 if ($numres) {
211 $ss_dssp=~s/(\S{$numres})/$1\n/g; # insert a line break every $numres residues
212 }
213 printf (ALIFILE ">ss_dssp\n%s\n",$ss_dssp);
214 if ($v>=1) {print("\nAdding DSSP state sequence ...\n");}
215 }
216 }
217
218 # Secondary structure prediction with psipred
219 if ($v>=2) {print("Predicting secondary structure with PSIPRED ... ");}
220 &RunPsipred("$tmpfile.sq");
221
222 if (open (PSIPREDFILE, "<$tmpfile.horiz")) {
223 $ss_conf="";
224 $ss_pred="";
225 # Read Psipred file
226 while ($line=<PSIPREDFILE>) {
227 if ($line=~/^Conf:\s+(\S+)/) {$ss_conf.=$1;}
228 elsif ($line=~/^Pred:\s+(\S+)/) {$ss_pred.=$1;}
229 }
230 close(PSIPREDFILE);
231 $ss_conf=~tr/0-9/0/c; # replace all non-numerical symbols with a 0
232 if ($numres) {
233 $ss_pred=~s/(\S{$numres})/$1\n/g; # insert a line break every $numres residues
234 $ss_conf=~s/(\S{$numres})/$1\n/g; # insert a line break every $numres residues
235 }
236 printf(ALIFILE ">ss_pred PSIPRED predicted secondary structure\n%s\n",$ss_pred);
237 printf(ALIFILE ">ss_conf PSIPRED confidence values\n%s\n",$ss_conf);
238 }
239
240 # Append alignment sequences to psipred sequences
241 for ($i=0; $i<@seqs; $i++) {
242 printf(ALIFILE "%s",$seqs[$i]);
243 }
244 close(ALIFILE);
245 if ($v>=2) {print("done \n");}
246 }
247 ##############################################################
248 # HMMER format
249 else
250 {
251 if (!$outfile) {$outfile="$inbase.hmm";}
252
253 my $log2 = log(2);
254 my @logoddsmat;
255 my @lines;
256 my $length;
257 my $query;
258 my $scale=0.13; # empirically determined scale factor between HMMER bit score and PSI-BLAST score, 0.3 for HMMER3
259 my $acc;
260 my $name;
261 my $desc;
262 my $nmodels=0;
263
264 open (INFILE, "<$infile") || die("ERROR: cannot open $infile: $!\n");
265 open (OUTFILE, ">$outfile") || die("ERROR: cannot open $outfile: $!\n");
266
267 # Read HMMER file model by model
268 while ($line=<INFILE>) {
269 # Search for start of next model
270 while ($line && $line!~/^HMMER/ && $line!~/^NAME /) {
271 $line=<INFILE>;
272 }
273 if ($line=~/^HMMER3/) {
274
275 $scale = 0.3;
276 @logoddsmat=();
277 @lines=($line);
278
279 while ($line=<INFILE>) {push(@lines,$line); if ($line=~/^LENG/) {last;}}
280 $line=~/^LENG\s+(\d+)/;
281 $length=$1; # number of match states in HMM
282 $query=""; # query residues from NULL emission lines
283 while ($line=<INFILE>) {push(@lines,$line); if ($line=~/^\s*m->m/) {last;}}
284 push(@lines,$line=<INFILE>);
285 if ($line !~ /^\s*COMPO/) {
286 die("Error: need null-model probablities (Parameter COMPO)!\n");
287 }
288 $line=~s/^\s*COMPO\s+(\S.*\S)\s*$/$1/;
289 my @nullmodel = split(/\s+/,$line);
290 @nullmodel = map {$_ = exp(-1 * $_)} @nullmodel; # Transform to probabilities
291 push(@lines,$line=<INFILE>); # state 0 insert emission
292 push(@lines,$line=<INFILE>); # transisitions from begin state
293
294 while ($line=<INFILE>) {
295 push(@lines,$line);
296 if ($line=~/^\/\//) {last;}
297 $line=~s/^\s*\d+\s+(\S.*\S)\s+\d+\s+(\S)\s+\S\s*$/$1/;
298 $query .= $2;
299 my @probs = split(/\s+/,$line);
300 @probs = map {$_ = exp(-1 * $_)} @probs; # Transform to probabilities
301 # calculate log-odds
302 my @logodds = ();
303 for (my $a = 0; $a < scalar(@probs); $a++) {
304 my $logodd = (log($probs[$a] / $nullmodel[$a]) / $log2) * 1000;
305 push(@logodds, $logodd);
306 }
307
308 push(@logoddsmat,\@logodds);
309 push(@lines,$line=<INFILE>);
310 push(@lines,$line=<INFILE>);
311 }
312
313 } else {
314
315 $scale=0.13;
316 if ($line!~/^HMMER/ && $line!~/^NAME /) {last;} # first line in each model must begin with 'HMMER...'
317 @logoddsmat=();
318 @lines=($line);
319
320 while ($line=<INFILE>) {push(@lines,$line); if ($line=~/^LENG/) {last;}}
321 $line=~/^LENG\s+(\d+)/;
322 $length=$1; # number of match states in HMM
323 $query=""; # query residues from NULL emission lines
324 while ($line=<INFILE>) {push(@lines,$line); if ($line=~/^\s*m->m/) {last;}}
325 push(@lines,$line=<INFILE>);
326 while ($line=<INFILE>) {
327 push(@lines,$line);
328 if ($line=~/^\/\//) {last;}
329 $line=~s/^\s*\d+\s+(\S.*\S)\s*$/$1/;
330 my @logodds = split(/\s+/,$line);
331 push(@logoddsmat,\@logodds);
332 push(@lines,$line=<INFILE>);
333 $line=~/^\s*(\S)/;
334 $query .= $1;
335 push(@lines,$line=<INFILE>);
336 }
337 }
338
339 # Write mtx matrix
340 open (MTXFILE, ">$tmpfile.mtx") || die("ERROR: cannot open $tmpfile.mtx: $!\n");
341 printf(MTXFILE "%i\n",$length);
342 printf(MTXFILE "%s\n",$query);
343 printf(MTXFILE "2.670000e-03\n4.100000e-02\n-3.194183e+00\n1.400000e-01\n2.670000e-03\n4.420198e-02\n-3.118986e+00\n1.400000e-01\n3.176060e-03\n1.339561e-01\n-2.010243e+00\n4.012145e-01\n");
344 while (@logoddsmat) {
345 my @logodds = @{shift(@logoddsmat)};
346 print(MTXFILE "-32768 ");
347 splice(@logodds, 1,0,-32768/$scale); # insert logodds value for B
348 splice(@logodds,20,0, -100/$scale); # insert logodds value for X
349 splice(@logodds,22,0,-32768/$scale); # insert logodds value for Z
350 for (my $i=0; $i<23; $i++) {
351 printf(MTXFILE "%4.0f ",$scale*$logodds[$i]);
352 }
353 print(MTXFILE "-32768 -400\n");
354 }
355 close(MTXFILE);
356
357 # Start Psiblast from checkpoint file tmp.chk that was generated to build the profile
358 if (-e "$datadir/weights.dat4") { # Psipred version < 3.0
359 &HHPaths::System("$execdir/psipred $tmpfile.mtx $datadir/weights.dat $datadir/weights.dat2 $datadir/weights.dat3 $datadir/weights.dat4 > $tmpfile.ss");
360 } else {
361 &HHPaths::System("$execdir/psipred $tmpfile.mtx $datadir/weights.dat $datadir/weights.dat2 $datadir/weights.dat3 > $tmpfile.ss");
362 }
363
364 # READ PSIPRED file
365 if (open (PSIPRED, "$execdir/psipass2 $datadir/weights_p2.dat 1 0.98 1.09 $tmpfile.ss2 $tmpfile.ss |")) {
366 $ss_conf="";
367 $ss_pred="";
368 # Read Psipred file
369 while ($line=<PSIPRED>) {
370 if ($line=~/^Conf:\s+(\d+)/) {$ss_conf.=$1;}
371 elsif ($line=~/^Pred:\s+(\S+)/) {$ss_pred.=$1;}
372 }
373 close(PSIPREDFILE);
374 }
375
376 # Add secondary structure to HMMER output file and print
377 foreach $line (@lines) {
378 if ($line=~/^SSPRD/ || $line=~/^SSCON/|| $line=~/^SSCIT/) {next;}
379 if ($line=~/^HMM /) {
380 $ss_pred=~s/(\S{80})/$1\nSSPRD /g; # insert a line break every 80 residues
381 $ss_conf=~s/(\S{80})/$1\nSSCON /g; # insert a line break every 80 residues
382 printf(OUTFILE "SSCIT HHsearch-readable PSIPRED secondary structure prediction:\n");
383 printf(OUTFILE "SSPRD %s\n",$ss_pred);
384 printf(OUTFILE "SSCON %s\n",$ss_conf);
385 printf(OUTFILE "SSCIT %s\n",$ss_cit);
386 }
387 printf(OUTFILE $line);
388 }
389 $nmodels++;
390 }
391
392 close(OUTFILE);
393 close(INFILE);
394 &HHPaths::System("rm $tmpfile.mtx $tmpfile.ss $tmpfile.ss2");
395 if ($v>=2) {printf("Added PSIPRED secondary structure to %i models\n",$nmodels);}
396 }
397
398 if ($v<=3) {
399 unlink("$tmpfile.in.a3m");
400 unlink("$tmpfile.in.psi");
401 unlink("$tmpfile.horiz");
402 unlink("$tmpfile.dssp");
403 }
404
405 exit;
406
407 ##############################################################################################
408 # Run SS prediction starting from alignment in $tmpfile.in.psi (called by BuildAlignment)
409 ##############################################################################################
410 sub RunPsipred() {
411 # This is a simple script which will carry out all of the basic steps
412 # required to make a PSIPRED V2 prediction. Note that it assumes that the
413 # following programs are in the appropriate directories:
414 # blastpgp - PSIBLAST executable (from NCBI toolkit)
415 # makemat - IMPALA utility (from NCBI toolkit)
416 # psipred - PSIPRED V2 program
417 # psipass2 - PSIPRED V2 program
418
419 my $infile=$_[0];
420 my $basename; #file name without extension
421 my $rootname; #basename without directory path
422 if ($infile =~/^(.*)\..*?$/) {$basename=$1;} else {$basename=$infile;}
423 if ($basename=~/^.*\/(.*?)$/) {$rootname=$1;} else {$rootname=$basename;}
424
425 # Does dummy database exist?
426 if (!-e "$dummydb.phr") {
427 if (!-e "$dummydb") {die "Error in addss.pl: Could not find $dummydb\n";}
428
429 &HHPaths::System("cp $infile $dummydb");
430 &HHPaths::System("$ncbidir/formatdb -i $dummydb");
431 if (!-e "$dummydb.phr") {die "Error in addss.pl: Could not find nor create index files for $dummydb\n";}
432 }
433
434 # Start Psiblast from checkpoint file tmp.chk that was generated to build the profile
435 &HHPaths::System("$ncbidir/blastpgp -b 1 -j 1 -h 0.001 -d $dummydb -i $infile -B $tmpfile.in.psi -C $tmpfile.chk 1> $tmpfile.blalog 2> $tmpfile.blalog");
436
437 #print("Predicting secondary structure...\n");
438
439 &HHPaths::System("echo "."$tmpfile_no_dir".".chk > $tmpfile.pn\n");
440 &HHPaths::System("echo "."$tmpfile_no_dir".".sq > $tmpfile.sn\n");
441 &HHPaths::System("$ncbidir/makemat -P $tmpfile");
442
443 # Start Psiblast from checkpoint file tmp.chk that was generated to build the profile
444 if (-e "$datadir/weights.dat4") { # Psipred version < 3.0
445 &HHPaths::System("$execdir/psipred $tmpfile.mtx $datadir/weights.dat $datadir/weights.dat2 $datadir/weights.dat3 $datadir/weights.dat4 > $tmpfile.ss");
446 } else {
447 &HHPaths::System("$execdir/psipred $tmpfile.mtx $datadir/weights.dat $datadir/weights.dat2 $datadir/weights.dat3 > $tmpfile.ss");
448 }
449
450 &HHPaths::System("$execdir/psipass2 $datadir/weights_p2.dat 1 0.98 1.09 $tmpfile.ss2 $tmpfile.ss > $tmpfile.horiz");
451
452 # Remove temporary files
453 if ($v<=3) { unlink(split ' ', "$tmpfile.pn $tmpfile.sn $tmpfile.mn $tmpfile.chk $tmpfile.blalog $tmpfile.mtx $tmpfile.aux $tmpfile.ss $tmpfile.ss2 $tmpfile.sq");}
454 return;
455 }
456
457 ##############################################################################################
458 # Read query sequence and extract dssp sequence
459 ##############################################################################################
460 sub AppendDsspSequences() {
461 my $qfile=$_[0];
462
463 my $line; #input line
464 my $name; #name of sequence in in file, e.g. d1g8ma1
465 my $qrange; #chain and residue range of query sequence
466 my $aas=""; #amino acids from in file for each $name
467
468 my $dsspfile;
469 my $pdbfile;
470 my $pdbcode; #pdb code for accessing dssp file; shortened from in code, e.g. 1g8m
471 my @ss_dssp=(); #dssp states for residues (H,E,L)
472 my @sa_dssp=(); #dssp states for residues (H,E,L)
473 my @aa_dssp=(); #residues in dssp file
474 my @aa_astr=(); #residues from infile
475 my $length; #length of sequence
476
477 # Default parameters for Align.pm
478 our $d=3; # gap opening penatlty for Align.pm
479 our $e=0.1; # gap extension penatlty for Align.pm
480 our $g=0.09; # endgap penatlty for Align.pm
481 our $matrix="identity";
482
483 # Read query sequence -> $name, $nameline, $range, $aas
484 open (QFILE, "<$qfile") || die ("cannot open $qfile: $!");
485 while ($line=<QFILE>) {
486 if ($line=~/>(\S+)/) {
487 $name=$1;
488
489 # SCOP ID? (d3lkfa_,d3grs_3,d3pmga1,g1m26.1)
490 if ($line=~/^>[defgh](\d[a-z0-9]{3})[a-z0-9_.][a-z0-9_]\s+[a-z]\.\d+\.\d+\.\d+\s+\((\S+)\)/) {
491 $pdbcode=$1;
492 $qrange=$2;
493 }
494
495 # PDB ID? (8fab_A, 1a0i)
496 elsif ($line=~/^>(\d[a-z0-9]{3})_?(\S?)\s/) {
497 $pdbcode=$1;
498 if ($2 ne "") {$qrange="$2:";} else {$qrange="-";}
499 }
500
501 # DALI ID? (8fabA_0,1a0i_2)
502 elsif ($line=~/^>(\d[a-z0-9]{3})[A-Za-z0-9]?_\d+\s+\d+\.\d+.\d+.\d+.\d+.\d+\s+\((\S+)\)/) {
503 $pdbcode=$1;
504 $qrange=$2;
505 }
506
507 else {
508 if ($v>=3) {print("Warning: no pdb code found in sequence name '$name'\n");}
509 close(QFILE);
510 return 1; # no astral/DALI/pdb sequence => no dssp states available
511 }
512 $aas="";
513
514 }
515 else
516 {
517 chomp($line);
518 $line=~tr/a-z \t/A-Z/d;
519 $aas.=$line;
520 }
521 }
522 close(QFILE);
523 if ($v>=3) {printf("Searching DSSP state assignments: name=%s range=%s\n",$name,$qrange);}
524
525 # Try to open dssp file
526 $dsspfile="$dsspdir/$pdbcode.dssp";
527 if (! open (DSSPFILE, "<$dsspfile")) {
528 if ($v>=3) {printf(STDERR "Warning in $program: Cannot open $dsspfile!\n");}
529 $pdbfile = &OpenPDBfile($pdbcode);
530 if ($pdbfile eq "") {return;}
531
532 system("$dssp $pdbfile $tmpfile.dssp 2> /dev/null");
533 system("cp $tmpfile.dssp $dsspfile 2> /dev/null");
534 $dsspfile="$tmpfile.dssp";
535 if (! open (DSSPFILE, "<$dsspfile")) {
536 if ($v>=3) {printf(STDERR "Warning in $program: dssp couldn't generate file from $pdbfile. Skipping $name\n");}
537 return 1;
538 }
539 }
540
541 #....+....1....+....2....+....3....+....4
542 # # RESIDUE AA STRUCTURE BP1 BP2 ACC etc.
543 # 623 630 A R < 0 0 280 etc.
544 # 624 !* 0 0 0 etc.
545 # 625 8 B A 0 0 105 etc.
546 # 626 9 B P >> - 0 0 71 etc.
547 # 292 28SA K H 4 S+ 0 0 71 etc. (1qdm.dssp)
548 # 293 29SA K H > S+ 0 0 28 etc.
549
550 # Read in whole DSSP file
551 for (my $try = 1; $try<=2; $try++) {
552 $aa_dssp="";
553 $sa_dssp="";
554 $ss_dssp="";
555 while ($line=<DSSPFILE>) {if ($line=~/^\s*\#\s*RESIDUE\s+AA/) {last;}}
556 while ($line=<DSSPFILE>)
557 {
558 if ($line=~/^.{5}(.{5})(.)(.)\s(.).\s(.).{18}(...)/)
559 {
560 my $thisres=$1;
561 my $icode=$2;
562 my $chain=$3;
563 my $aa=$4;
564 my $ss=$5;
565 my $sa=$6;
566 my $contained=0;
567 my $range=$qrange;
568 if ($aa eq "!") {next;} # missing residues!
569 $thisres=~tr/ //d;
570 $chain=~tr/ //d;
571 $icode=~tr/ //d;
572 $sa=~tr/ //d;
573 if ($try==1) {
574 do{
575 if ($range=~s/^(\S):(-?\d+)[A-Z]-(\d+)([A-Z])// && $chain eq $1 && $icode eq $4 && $2<=$thisres && $thisres<=$3) {
576 $contained=1; #syntax (A:56S-135S)
577 }
578 elsif ($range=~s/^(\S):(-?\d+)[A-Z]?-(\d+)[A-Z]?// && $chain eq $1 && $2<=$thisres && $thisres<=$3) {
579 $contained=1; #syntax (R:56-135)
580 }
581 elsif ($range=~s/^(-?\d+)[A-Z]-(\d+)([A-Z])// && $chain eq "" && $icode eq $3 && $1<=$thisres && $thisres<=$2) {
582 $contained=1; #syntax (56-135)
583 }
584 elsif ($range=~s/^(-?\d+)[A-Z]?-(\d+)[A-Z]?// && $chain eq "" && $1<=$thisres && $thisres<=$2) {
585 $contained=1; #syntax (56-135)
586 }
587 elsif ($range=~s/^(\S):// && $chain eq $1) {
588 $contained=1; #syntax (A:) or (A:,2:)
589 }
590 elsif ($range=~s/^-$// && $chain eq "") {
591 $contained=1; #syntax (-)
592 }
593 $range=~s/^,//;
594 # print("qrange=$qrange range='$range' ires=$thisres chain=$chain contained=$contained\n");
595 } while($contained==0 && $range ne "");
596 if ($contained==0) {next;}
597 } # end if try==1
598 $aa_dssp.=$aa;
599 $ss_dssp.=$ss;
600 $sa_dssp.=&sa2c($sa,$aa);
601 }
602 }
603 # if not enough residues were found: chain id is wrong => repeat extraction without checking chain id
604 if (length($aa_dssp)>=10) {last;}
605 close(DSSPFILE);
606 open (DSSPFILE, "<$dsspfile");
607 }
608 close(DSSPFILE);
609
610 if (length($aa_dssp)==0) {print("WARNING: no residues found in $dsspdir/$pdbcode.dssp\n"); return 1;}
611 if (length($aa_dssp)<=20) {printf("WARNING: only %i residues found in $dsspdir/$pdbcode.dssp\n",length($aa_dssp)); return 1;}
612
613 # Postprocess $aa_dssp etc
614 $aa_dssp =~ tr/a-z/CCCCCCCCCCCCCCCCCCCCCCCCCC/;
615 $ss_dssp =~ tr/ I/CC/;
616 $ss_dssp =~ s/ \S / /g;
617 $ss_dssp =~ s/ \S\S / /g;
618
619 # Align query with dssp sequence
620 $aa_astr = $aas;
621 $xseq=$aas;
622 $yseq=$aa_dssp;
623 my ($imax,$imin,$jmax,$jmin);
624 my (@i,@j);
625 my $score=&AlignNW(\$xseq,\$yseq,\@i,\@j,\$imin,\$imax,\$jmin,\$jmax,\$Sstr);
626
627 # Initialize strings (=arrays) for dssp states with "----...-"
628 my @ss_dssp_ali=(); # $ss_dssp_ali[$i] is dssp state aligned to $aa_astr[$i]
629 my @sa_dssp_ali=(); # $sa_dssp_ali[$i] is solvent accessibility string
630 my @aa_dssp_ali=(); # $aa_dssp_ali[$i] is dssp residue aligned to $aa_astr[$i]
631 for (my $i=0; $i<=length($aa_astr); $i++) { # sum up to len+1
632 # because 0'th element in @ss_dssp and @aa_dssp is dummy "-"
633 $ss_dssp_ali[$i]="-";
634 $sa_dssp_ali[$i]="-";
635 $aa_dssp_ali[$i]="-";
636 }
637
638 # To each residue (from i=0 to len-1) of input sequence $aa_astr assign aligned dssp state
639 @ss_dssp = split(//,$ss_dssp);
640 @sa_dssp = split(//,$sa_dssp);
641 @aa_dssp = split(//,$aa_dssp);
642 @aa_astr = split(//,$aa_astr);
643 my $len = 0;
644 unshift(@aa_dssp,"-"); #add a gap symbol at beginning -> first residue is at 1!
645 unshift(@ss_dssp,"-"); #add a gap symbol at beginning -> first residue is at 1!
646 unshift(@sa_dssp,"-"); #add a gap symbol at beginning -> first residue is at 1!
647 unshift(@aa_astr,"-"); #add a gap symbol at beginning -> first residue is at 1!
648 for (my $col=0; $col<@i; $col++) {
649 if ($i[$col]>0) {
650 if ($j[$col]>0) {$len++;} # count match states (for score/len calculation)
651 $ss_dssp_ali[$i[$col]]=$ss_dssp[$j[$col]];
652 $sa_dssp_ali[$i[$col]]=$sa_dssp[$j[$col]];
653 $aa_dssp_ali[$i[$col]]=$aa_dssp[$j[$col]];
654 }
655 if ($v>=4) {
656 printf ("%s %3i %s %3i\n",$aa_astr[$i[$col]],$i[$col],$aa_dssp[$j[$col]],$j[$col]);
657 }
658 }
659 shift (@ss_dssp_ali); # throw out first "-"
660 shift (@sa_dssp_ali); # throw out first "-"
661 shift (@aa_dssp_ali); # throw out first "-"
662 $aa_dssp=join("",@aa_dssp_ali);
663 $ss_dssp=join("",@ss_dssp_ali);
664 $sa_dssp=join("",@sa_dssp_ali);
665
666 # Debugging output
667 if ($v>=4) {printf(STDOUT "DSSP: %s: length=%-3i score/len:%-5.3f\n",$name,$len,$score/$len);}
668 if ($v>=4) {
669 printf("IN: %s\n",$xseq);
670 printf("MATCH: %s\n",$Sstr);
671 printf("DSSP: %s\n",$yseq);
672 printf("\n");
673 printf(">ss_dssp $name\n$ss_dssp\n");
674 printf(">sa_dssp $name\n$sa_dssp\n");
675 printf(">aa_dssp $name\n$aa_dssp\n");
676 printf(">aa_astra $name\n$aa_astr\n\n");
677 }
678 if ($score/$len<0.5) {
679 printf (STDOUT "\nWARNING: in $name: alignment score with dssp residues too low: Score/len=%f.\n\n",$score/$len);
680 printf("IN: %s\n",$xseq);
681 printf("MATCH: %s\n",$Sstr);
682 printf("DSSP: %s\n",$yseq);
683 return 1;
684 }
685
686 return 0;
687 }
688
689 ################################################################################################
690 ### Return solvent accessibility code
691 ################################################################################################
692 sub sa2c ()
693 {
694 my %maxsa = (A=>106, B=>160, C=>135, D=>163, E=>194, F=>197, G=>84, H=>184, I=>169, K=>205, L=>164, M=>188,
695 N=>157, P=>136, Q=>198, R=>248, S=>130, T=>142, V=>142, W=>227, X=>180, Y=>222, Z=>196); # maximum solvent accessiblity
696 if ($_[1]=~/[a-z]/) {return "F";} # disulphide bridge
697 if (!defined $maxsa{$_[1]}) {return "-";} # no amino acid
698 my $rsa=$_[0]/$maxsa{$_[1]};
699 # printf("aa=%s sa=%5.1f max_sa=%5.1f rsa=%5.3f\n",$_[1],$_[0],$maxsa{$_[1]},$rsa);
700 if ($rsa<=0.02) {return "A";}
701 elsif ($rsa<=0.14) {return "B";}
702 elsif ($rsa<=0.33) {return "C";}
703 elsif ($rsa<=0.55) {return "D";}
704 else {return "E";}
705 }
706
707 # Find the pdb file with $pdbcode in pdb directory
708 sub OpenPDBfile() {
709
710 my $pdbcode=lc($_[0]);
711 if (! -e "$pdbdir") {
712 if ($v>=3) {print(STDERR "Warning in $program: pdb directory '$pdbdir' does not exist!\n");}
713 return 1;
714 }
715 if (-e "$pdbdir/all") {$pdbfile="$pdbdir/all/";}
716 elsif (-e "$pdbdir/divided") {$pdbfile="$pdbdir/divided/".substr($pdbcode,1,2)."/";}
717 else {$pdbfile="$pdbdir/";}
718 if ($pdbdir=~/divided.?$/) {$pdbfile.=substr($pdbcode,1,2)."/";}
719 if (-e $pdbfile."pdb$pdbcode.ent") {$pdbfile.="pdb$pdbcode.ent";}
720 elsif (-e $pdbfile."pdb$pdbcode.ent.gz") {$pdbfile="gunzip -c $pdbfile"."pdb$pdbcode.ent.gz |";}
721 elsif (-e $pdbfile."pdb$pdbcode.ent.Z") {$pdbfile="gunzip -c $pdbfile"."pdb$pdbcode.ent.Z |";}
722 elsif (-e $pdbfile."$pdbcode.pdb") {$pdbfile."$pdbcode.pdb";}
723 else {
724 if ($v>=3) {printf(STDERR "Warning in $program: Cannot find pdb file $pdbfile"."pdb$pdbcode.ent!\n");}
725 return "";
726 }
727 if (!open (PDBFILE, "$pdbfile")) {
728 if ($v>=3) {printf(STDERR "Error in $program: Cannot open pdb file: $!\n");}
729 return "";
730 }
731 return $pdbfile;
732 }