6
|
1 #!/usr/bin/env perl
|
|
2 #
|
|
3 # addss.pl
|
|
4 # Add PSIPRED secondary structure prediction (and DSSP annotation) to an MSA or HMMER file.
|
|
5 # Output format is A3M (for input alignments) or HMMER (see User Guide).
|
|
6
|
|
7
|
|
8 # HHsuite version 2.0.16 (January 2013)
|
|
9 #
|
|
10 # Reference:
|
|
11 # Remmert M., Biegert A., Hauser A., and Soding J.
|
|
12 # HHblits: Lightning-fast iterative protein sequence searching by HMM-HMM alignment.
|
|
13 # Nat. Methods, epub Dec 25, doi: 10.1038/NMETH.1818 (2011).
|
|
14
|
|
15 # (C) Johannes Soeding and Michael Remmert, 2012
|
|
16
|
|
17 # This program is free software: you can redistribute it and/or modify
|
|
18 # it under the terms of the GNU General Public License as published by
|
|
19 # the Free Software Foundation, either version 3 of the License, or
|
|
20 # (at your option) any later version.
|
|
21
|
|
22 # This program is distributed in the hope that it will be useful,
|
|
23 # but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
24 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
25 # GNU General Public License for more details.
|
|
26
|
|
27 # You should have received a copy of the GNU General Public License
|
|
28 # along with this program. If not, see <http://www.gnu.org/licenses/>.
|
|
29
|
|
30 # We are very grateful for bug reports! Please contact us at soeding@genzentrum.lmu.de
|
|
31
|
|
32 use lib $ENV{"HHLIB"}."/scripts";
|
|
33 use HHPaths; # config file with path variables for nr, blast, psipred, pdb, dssp etc.
|
|
34 use Align; # Needleman-Wunsch and Smith-Waterman alignment functions
|
|
35 use File::Temp qw/ tempfile tempdir /;
|
|
36 use strict;
|
|
37
|
|
38 my $ss_cit="PSIPRED: Jones DT. (1999) Protein secondary structure prediction based on position-specific scoring matrices. JMB 292:195-202.";
|
|
39
|
|
40
|
|
41 # Module needed for aligning DSSP-sequence
|
|
42
|
|
43 $|= 1; # Activate autoflushing on STDOUT
|
|
44
|
|
45 # Default values:
|
|
46 our $v=2; # verbose mode
|
|
47
|
|
48 my $numres=0; # number of residues per line for secondary structure
|
|
49 my $informat="a3m"; # input format
|
|
50 my $neff = 7; # use alignment with this diversity for PSIPRED prediction
|
|
51 my $program=$0; # name of perl script
|
|
52 my $pdbfile;
|
|
53
|
|
54 my $help="
|
|
55 addss.pl from HHsuite $VERSION
|
|
56 Add PSIPRED secondary structure prediction (and DSSP annotation) to a multiple sequence alignment (MSA)
|
|
57 or HMMER (multi-)model file.
|
|
58
|
|
59 If the input file is an MSA, the predicted secondary structure and confidence values are added as
|
|
60 special annotation sequences with names >ss_pred, >ss_conf, and >ss_dssp to the top of the output
|
|
61 A3M alignment. If no output file is given, the output file will have the same name as the input file,
|
|
62 except for the extension being replaced by '.a3m'. Allowed input formats are A3M (default),
|
|
63 A2M/FASTA (-fas, -a2m), CLUSTAL (-clu), STOCKHOLM (-sto), HMMER (-hmm).
|
|
64
|
|
65 If the input file contains HMMER models, records SSPRD and SSCON containing predicted secondary
|
|
66 structure and confidence values are added to each model. In this case the output file name is
|
|
67 obligatory and must be different from the input file name.
|
|
68
|
|
69 Usage: perl addss.pl <ali_file> [<outfile>] [-fas|-a3m|-clu|-sto]
|
|
70 or perl addss.pl <hhm_file> <outfile> -hmm
|
|
71 \n";
|
|
72
|
|
73 # Variable declarations
|
|
74 my $line;
|
|
75 my @seqs; # sequences from infile (except >aa_ and >ss_pred sequences)
|
|
76 my $query_length;
|
|
77 my $header; # header of MSA: everything before first '>'
|
|
78 my $name; # query in fasta format: '>$name [^\n]*\n$qseq\n'
|
|
79 my $qseq; # residues of query sequence
|
|
80 my $infile;
|
|
81 my $outfile;
|
|
82 my $ss_pred=""; # psipred ss states
|
|
83 my $ss_conf=""; # psipred confidence values
|
|
84 my $ss_dssp; # dssp states as string
|
|
85 my $sa_dssp; # relative solvent accessibility from dssp as string {A,B,C,D,E} A:absolutely buried, B:buried, E:exposed
|
|
86 my $aa_dssp; # residues from dssp file as string
|
|
87 my $aa_astr; # residues from infile as string
|
|
88 my $q_match; # number of match states in query sequence
|
|
89 my $xseq; # sequence x returned from Align.pm
|
|
90 my $yseq; # sequence y returned from Align.pm
|
|
91 my $Sstr; # match sequence returned from Align.pm
|
|
92
|
|
93 ###############################################################################################
|
|
94 # Processing command line input
|
|
95 ###############################################################################################
|
|
96
|
|
97 if (@ARGV<1) {die ($help);}
|
|
98
|
|
99 my $options="";
|
|
100 for (my $i=0; $i<@ARGV; $i++) {$options.=" $ARGV[$i] ";}
|
|
101
|
|
102 #Input format fasta?
|
|
103 if ($options=~s/ -fas\s/ /g) {$informat="fas";}
|
|
104 elsif ($options=~s/ -a2m\s/ /g) {$informat="a2m";}
|
|
105 elsif ($options=~s/ -a3m\s/ /g) {$informat="a3m";}
|
|
106 elsif ($options=~s/ -clu\s/ /g) {$informat="clu";}
|
|
107 elsif ($options=~s/ -sto\s/ /g) {$informat="sto";}
|
|
108 elsif ($options=~s/ -hmm\s/ /g) {$informat="hmm";}
|
|
109
|
|
110 if ($options=~s/ -v\s+(\d+) / /g) {$v=$1;}
|
|
111
|
|
112 # Set input and output file
|
|
113 if ($options=~s/ -i\s+(\S+) //) {$infile=$1;}
|
|
114 if ($options=~s/ -o\s+(\S+) //) {$outfile=$1;}
|
|
115 if ($options=~s/^\s*([^-]\S*) //) {$infile=$1;}
|
|
116 if ($options=~s/^\s*([^-]\S*) //) {$outfile=$1;}
|
|
117
|
|
118 # Warn if unknown options found or no infile/outfile
|
|
119 if ($options!~/^\s*$/) {$options=~s/^\s*(.*?)\s*$/$1/g; die("Error: unknown options '$options'\n");}
|
|
120 if (!$infile) {print($help); exit(1);}
|
|
121
|
|
122 my $v2 = $v-1;
|
|
123 if ($v2>2) {$v2--;}
|
|
124 if ($v2<0) {$v2=0;}
|
|
125
|
|
126 if ($informat eq "hmm" && !$outfile) {
|
|
127 print("Error: no output file given. With the -hmm option an output file is obligatory\n"); exit(1);
|
|
128 }
|
|
129
|
|
130 ###############################################################################################
|
|
131 # Reformat input alignment to a3m and psiblast-readable format and generate file with query sequence
|
|
132 ###############################################################################################
|
|
133
|
|
134 my $inbase; # $inbasename of infile: remove extension
|
|
135 my $inroot; # $inbasename of infile: remove path and extension
|
|
136 if ($infile=~/(.*)\..*/) {$inbase=$1;} else {$inbase=$infile;} # remove extension
|
|
137 if ($inbase=~/.*\/(.*)/) {$inroot=$1;} else {$inroot=$inbase;} # remove path
|
|
138
|
|
139 # Create tmpfile
|
|
140 my $tmpdir;
|
|
141 if ($v<=3) {$tmpdir = tempdir( CLEANUP => 1);} else {$tmpdir = tempdir( CLEANUP => 0);}
|
|
142 my ($tmpf, $tmpfile) = tempfile( DIR => $tmpdir );
|
|
143 my $tmpfile_no_dir;
|
|
144 if ($tmpfile=~/.*\/(.*)/) {$tmpfile_no_dir=$1;} else {$tmpfile_no_dir=$tmpfile;} # remove path
|
|
145
|
|
146
|
|
147
|
|
148 ############################################################################################
|
|
149
|
|
150 if ($informat ne "hmm") {
|
|
151 if (!$outfile) {$outfile="$inbase.a3m";}
|
|
152
|
|
153 # Use first sequence to define match states and reformat input file to a3m and psi
|
|
154 if ($informat ne "a3m") {
|
|
155 &HHPaths::System("$hhscripts/reformat.pl -v $v2 -M first $informat a3m $infile $tmpfile.in.a3m");
|
|
156 } else {
|
|
157 &HHPaths::System("cp $infile $tmpfile.in.a3m");
|
|
158 }
|
|
159
|
|
160 # Read query sequence
|
|
161 open (INFILE, "<$tmpfile.in.a3m") or die ("ERROR: cannot open $tmpfile.in.a3m!\n");
|
|
162 $/=">"; # set input field separator
|
|
163 my $i=0;
|
|
164 $qseq="";
|
|
165 $header = <INFILE>;
|
|
166 $header =~s />$//;
|
|
167 while ($line=<INFILE>) {
|
|
168 $line=~s/>$//;
|
|
169 if ($line=~/^ss_/ || $line=~/^aa_/) {next;}
|
|
170 $seqs[$i++]=">$line";
|
|
171 if(!$qseq) {
|
|
172 $line=~s/^(.*)[^\n]*//;
|
|
173 $name=$1;
|
|
174 $qseq=$line;
|
|
175 $qseq=~s/\n//g;
|
|
176 }
|
|
177 }
|
|
178 close(INFILE);
|
|
179 $/="\n"; # set input field separator
|
|
180
|
|
181 if ($qseq =~ /\-/) {
|
|
182
|
|
183 # First sequence contains gaps => calculate consensus sequence
|
|
184 &HHPaths::System("hhconsensus -i $tmpfile.in.a3m -s $tmpfile.sq -o $tmpfile.in.a3m > /dev/null");
|
|
185
|
|
186 } else {
|
|
187
|
|
188 $query_length = ($qseq=~tr/A-Z/A-Z/);
|
|
189 $qseq=~tr/A-Z//cd; # remove everything except capital letters
|
|
190
|
|
191 # Write query sequence file in FASTA format
|
|
192 open (QFILE, ">$tmpfile.sq") or die("ERROR: can't open $tmpfile.sq: $!\n");
|
|
193 printf(QFILE ">%s\n%s\n",$name,$qseq);
|
|
194 close (QFILE);
|
|
195 }
|
|
196
|
|
197 # Filter alignment to diversity $neff
|
|
198 if ($v>=1) {printf ("Filtering alignment to diversity $neff ...\n");}
|
|
199 &HHPaths::System("hhfilter -v $v2 -neff $neff -i $tmpfile.in.a3m -o $tmpfile.in.a3m");
|
|
200
|
|
201 # Reformat into PSI-BLAST readable file for jumpstarting
|
|
202 &HHPaths::System("$hhscripts/reformat.pl -v $v2 -r -noss a3m psi $tmpfile.in.a3m $tmpfile.in.psi");
|
|
203
|
|
204 open (ALIFILE, ">$outfile") || die("ERROR: cannot open $outfile: $!\n");
|
|
205 printf (ALIFILE "%s",$header);
|
|
206
|
|
207 # Add DSSP sequence (if available)
|
|
208 if ($dssp ne "") {
|
|
209 if (!&AppendDsspSequences("$tmpfile.sq")) {
|
|
210 if ($numres) {
|
|
211 $ss_dssp=~s/(\S{$numres})/$1\n/g; # insert a line break every $numres residues
|
|
212 }
|
|
213 printf (ALIFILE ">ss_dssp\n%s\n",$ss_dssp);
|
|
214 if ($v>=1) {print("\nAdding DSSP state sequence ...\n");}
|
|
215 }
|
|
216 }
|
|
217
|
|
218 # Secondary structure prediction with psipred
|
|
219 if ($v>=2) {print("Predicting secondary structure with PSIPRED ... ");}
|
|
220 &RunPsipred("$tmpfile.sq");
|
|
221
|
|
222 if (open (PSIPREDFILE, "<$tmpfile.horiz")) {
|
|
223 $ss_conf="";
|
|
224 $ss_pred="";
|
|
225 # Read Psipred file
|
|
226 while ($line=<PSIPREDFILE>) {
|
|
227 if ($line=~/^Conf:\s+(\S+)/) {$ss_conf.=$1;}
|
|
228 elsif ($line=~/^Pred:\s+(\S+)/) {$ss_pred.=$1;}
|
|
229 }
|
|
230 close(PSIPREDFILE);
|
|
231 $ss_conf=~tr/0-9/0/c; # replace all non-numerical symbols with a 0
|
|
232 if ($numres) {
|
|
233 $ss_pred=~s/(\S{$numres})/$1\n/g; # insert a line break every $numres residues
|
|
234 $ss_conf=~s/(\S{$numres})/$1\n/g; # insert a line break every $numres residues
|
|
235 }
|
|
236 printf(ALIFILE ">ss_pred PSIPRED predicted secondary structure\n%s\n",$ss_pred);
|
|
237 printf(ALIFILE ">ss_conf PSIPRED confidence values\n%s\n",$ss_conf);
|
|
238 }
|
|
239
|
|
240 # Append alignment sequences to psipred sequences
|
|
241 for ($i=0; $i<@seqs; $i++) {
|
|
242 printf(ALIFILE "%s",$seqs[$i]);
|
|
243 }
|
|
244 close(ALIFILE);
|
|
245 if ($v>=2) {print("done \n");}
|
|
246 }
|
|
247 ##############################################################
|
|
248 # HMMER format
|
|
249 else
|
|
250 {
|
|
251 if (!$outfile) {$outfile="$inbase.hmm";}
|
|
252
|
|
253 my $log2 = log(2);
|
|
254 my @logoddsmat;
|
|
255 my @lines;
|
|
256 my $length;
|
|
257 my $query;
|
|
258 my $scale=0.13; # empirically determined scale factor between HMMER bit score and PSI-BLAST score, 0.3 for HMMER3
|
|
259 my $acc;
|
|
260 my $name;
|
|
261 my $desc;
|
|
262 my $nmodels=0;
|
|
263
|
|
264 open (INFILE, "<$infile") || die("ERROR: cannot open $infile: $!\n");
|
|
265 open (OUTFILE, ">$outfile") || die("ERROR: cannot open $outfile: $!\n");
|
|
266
|
|
267 # Read HMMER file model by model
|
|
268 while ($line=<INFILE>) {
|
|
269 # Search for start of next model
|
|
270 while ($line && $line!~/^HMMER/ && $line!~/^NAME /) {
|
|
271 $line=<INFILE>;
|
|
272 }
|
|
273 if ($line=~/^HMMER3/) {
|
|
274
|
|
275 $scale = 0.3;
|
|
276 @logoddsmat=();
|
|
277 @lines=($line);
|
|
278
|
|
279 while ($line=<INFILE>) {push(@lines,$line); if ($line=~/^LENG/) {last;}}
|
|
280 $line=~/^LENG\s+(\d+)/;
|
|
281 $length=$1; # number of match states in HMM
|
|
282 $query=""; # query residues from NULL emission lines
|
|
283 while ($line=<INFILE>) {push(@lines,$line); if ($line=~/^\s*m->m/) {last;}}
|
|
284 push(@lines,$line=<INFILE>);
|
|
285 if ($line !~ /^\s*COMPO/) {
|
|
286 die("Error: need null-model probablities (Parameter COMPO)!\n");
|
|
287 }
|
|
288 $line=~s/^\s*COMPO\s+(\S.*\S)\s*$/$1/;
|
|
289 my @nullmodel = split(/\s+/,$line);
|
|
290 @nullmodel = map {$_ = exp(-1 * $_)} @nullmodel; # Transform to probabilities
|
|
291 push(@lines,$line=<INFILE>); # state 0 insert emission
|
|
292 push(@lines,$line=<INFILE>); # transisitions from begin state
|
|
293
|
|
294 while ($line=<INFILE>) {
|
|
295 push(@lines,$line);
|
|
296 if ($line=~/^\/\//) {last;}
|
|
297 $line=~s/^\s*\d+\s+(\S.*\S)\s+\d+\s+(\S)\s+\S\s*$/$1/;
|
|
298 $query .= $2;
|
|
299 my @probs = split(/\s+/,$line);
|
|
300 @probs = map {$_ = exp(-1 * $_)} @probs; # Transform to probabilities
|
|
301 # calculate log-odds
|
|
302 my @logodds = ();
|
|
303 for (my $a = 0; $a < scalar(@probs); $a++) {
|
|
304 my $logodd = (log($probs[$a] / $nullmodel[$a]) / $log2) * 1000;
|
|
305 push(@logodds, $logodd);
|
|
306 }
|
|
307
|
|
308 push(@logoddsmat,\@logodds);
|
|
309 push(@lines,$line=<INFILE>);
|
|
310 push(@lines,$line=<INFILE>);
|
|
311 }
|
|
312
|
|
313 } else {
|
|
314
|
|
315 $scale=0.13;
|
|
316 if ($line!~/^HMMER/ && $line!~/^NAME /) {last;} # first line in each model must begin with 'HMMER...'
|
|
317 @logoddsmat=();
|
|
318 @lines=($line);
|
|
319
|
|
320 while ($line=<INFILE>) {push(@lines,$line); if ($line=~/^LENG/) {last;}}
|
|
321 $line=~/^LENG\s+(\d+)/;
|
|
322 $length=$1; # number of match states in HMM
|
|
323 $query=""; # query residues from NULL emission lines
|
|
324 while ($line=<INFILE>) {push(@lines,$line); if ($line=~/^\s*m->m/) {last;}}
|
|
325 push(@lines,$line=<INFILE>);
|
|
326 while ($line=<INFILE>) {
|
|
327 push(@lines,$line);
|
|
328 if ($line=~/^\/\//) {last;}
|
|
329 $line=~s/^\s*\d+\s+(\S.*\S)\s*$/$1/;
|
|
330 my @logodds = split(/\s+/,$line);
|
|
331 push(@logoddsmat,\@logodds);
|
|
332 push(@lines,$line=<INFILE>);
|
|
333 $line=~/^\s*(\S)/;
|
|
334 $query .= $1;
|
|
335 push(@lines,$line=<INFILE>);
|
|
336 }
|
|
337 }
|
|
338
|
|
339 # Write mtx matrix
|
|
340 open (MTXFILE, ">$tmpfile.mtx") || die("ERROR: cannot open $tmpfile.mtx: $!\n");
|
|
341 printf(MTXFILE "%i\n",$length);
|
|
342 printf(MTXFILE "%s\n",$query);
|
|
343 printf(MTXFILE "2.670000e-03\n4.100000e-02\n-3.194183e+00\n1.400000e-01\n2.670000e-03\n4.420198e-02\n-3.118986e+00\n1.400000e-01\n3.176060e-03\n1.339561e-01\n-2.010243e+00\n4.012145e-01\n");
|
|
344 while (@logoddsmat) {
|
|
345 my @logodds = @{shift(@logoddsmat)};
|
|
346 print(MTXFILE "-32768 ");
|
|
347 splice(@logodds, 1,0,-32768/$scale); # insert logodds value for B
|
|
348 splice(@logodds,20,0, -100/$scale); # insert logodds value for X
|
|
349 splice(@logodds,22,0,-32768/$scale); # insert logodds value for Z
|
|
350 for (my $i=0; $i<23; $i++) {
|
|
351 printf(MTXFILE "%4.0f ",$scale*$logodds[$i]);
|
|
352 }
|
|
353 print(MTXFILE "-32768 -400\n");
|
|
354 }
|
|
355 close(MTXFILE);
|
|
356
|
|
357 # Start Psiblast from checkpoint file tmp.chk that was generated to build the profile
|
|
358 if (-e "$datadir/weights.dat4") { # Psipred version < 3.0
|
|
359 &HHPaths::System("$execdir/psipred $tmpfile.mtx $datadir/weights.dat $datadir/weights.dat2 $datadir/weights.dat3 $datadir/weights.dat4 > $tmpfile.ss");
|
|
360 } else {
|
|
361 &HHPaths::System("$execdir/psipred $tmpfile.mtx $datadir/weights.dat $datadir/weights.dat2 $datadir/weights.dat3 > $tmpfile.ss");
|
|
362 }
|
|
363
|
|
364 # READ PSIPRED file
|
|
365 if (open (PSIPRED, "$execdir/psipass2 $datadir/weights_p2.dat 1 0.98 1.09 $tmpfile.ss2 $tmpfile.ss |")) {
|
|
366 $ss_conf="";
|
|
367 $ss_pred="";
|
|
368 # Read Psipred file
|
|
369 while ($line=<PSIPRED>) {
|
|
370 if ($line=~/^Conf:\s+(\d+)/) {$ss_conf.=$1;}
|
|
371 elsif ($line=~/^Pred:\s+(\S+)/) {$ss_pred.=$1;}
|
|
372 }
|
|
373 close(PSIPREDFILE);
|
|
374 }
|
|
375
|
|
376 # Add secondary structure to HMMER output file and print
|
|
377 foreach $line (@lines) {
|
|
378 if ($line=~/^SSPRD/ || $line=~/^SSCON/|| $line=~/^SSCIT/) {next;}
|
|
379 if ($line=~/^HMM /) {
|
|
380 $ss_pred=~s/(\S{80})/$1\nSSPRD /g; # insert a line break every 80 residues
|
|
381 $ss_conf=~s/(\S{80})/$1\nSSCON /g; # insert a line break every 80 residues
|
|
382 printf(OUTFILE "SSCIT HHsearch-readable PSIPRED secondary structure prediction:\n");
|
|
383 printf(OUTFILE "SSPRD %s\n",$ss_pred);
|
|
384 printf(OUTFILE "SSCON %s\n",$ss_conf);
|
|
385 printf(OUTFILE "SSCIT %s\n",$ss_cit);
|
|
386 }
|
|
387 printf(OUTFILE $line);
|
|
388 }
|
|
389 $nmodels++;
|
|
390 }
|
|
391
|
|
392 close(OUTFILE);
|
|
393 close(INFILE);
|
|
394 &HHPaths::System("rm $tmpfile.mtx $tmpfile.ss $tmpfile.ss2");
|
|
395 if ($v>=2) {printf("Added PSIPRED secondary structure to %i models\n",$nmodels);}
|
|
396 }
|
|
397
|
|
398 if ($v<=3) {
|
|
399 unlink("$tmpfile.in.a3m");
|
|
400 unlink("$tmpfile.in.psi");
|
|
401 unlink("$tmpfile.horiz");
|
|
402 unlink("$tmpfile.dssp");
|
|
403 }
|
|
404
|
|
405 exit;
|
|
406
|
|
407 ##############################################################################################
|
|
408 # Run SS prediction starting from alignment in $tmpfile.in.psi (called by BuildAlignment)
|
|
409 ##############################################################################################
|
|
410 sub RunPsipred() {
|
|
411 # This is a simple script which will carry out all of the basic steps
|
|
412 # required to make a PSIPRED V2 prediction. Note that it assumes that the
|
|
413 # following programs are in the appropriate directories:
|
|
414 # blastpgp - PSIBLAST executable (from NCBI toolkit)
|
|
415 # makemat - IMPALA utility (from NCBI toolkit)
|
|
416 # psipred - PSIPRED V2 program
|
|
417 # psipass2 - PSIPRED V2 program
|
|
418
|
|
419 my $infile=$_[0];
|
|
420 my $basename; #file name without extension
|
|
421 my $rootname; #basename without directory path
|
|
422 if ($infile =~/^(.*)\..*?$/) {$basename=$1;} else {$basename=$infile;}
|
|
423 if ($basename=~/^.*\/(.*?)$/) {$rootname=$1;} else {$rootname=$basename;}
|
|
424
|
|
425 # Does dummy database exist?
|
|
426 if (!-e "$dummydb.phr") {
|
|
427 if (!-e "$dummydb") {die "Error in addss.pl: Could not find $dummydb\n";}
|
|
428
|
|
429 &HHPaths::System("cp $infile $dummydb");
|
|
430 &HHPaths::System("$ncbidir/formatdb -i $dummydb");
|
|
431 if (!-e "$dummydb.phr") {die "Error in addss.pl: Could not find nor create index files for $dummydb\n";}
|
|
432 }
|
|
433
|
|
434 # Start Psiblast from checkpoint file tmp.chk that was generated to build the profile
|
|
435 &HHPaths::System("$ncbidir/blastpgp -b 1 -j 1 -h 0.001 -d $dummydb -i $infile -B $tmpfile.in.psi -C $tmpfile.chk 1> $tmpfile.blalog 2> $tmpfile.blalog");
|
|
436
|
|
437 #print("Predicting secondary structure...\n");
|
|
438
|
|
439 &HHPaths::System("echo "."$tmpfile_no_dir".".chk > $tmpfile.pn\n");
|
|
440 &HHPaths::System("echo "."$tmpfile_no_dir".".sq > $tmpfile.sn\n");
|
|
441 &HHPaths::System("$ncbidir/makemat -P $tmpfile");
|
|
442
|
|
443 # Start Psiblast from checkpoint file tmp.chk that was generated to build the profile
|
|
444 if (-e "$datadir/weights.dat4") { # Psipred version < 3.0
|
|
445 &HHPaths::System("$execdir/psipred $tmpfile.mtx $datadir/weights.dat $datadir/weights.dat2 $datadir/weights.dat3 $datadir/weights.dat4 > $tmpfile.ss");
|
|
446 } else {
|
|
447 &HHPaths::System("$execdir/psipred $tmpfile.mtx $datadir/weights.dat $datadir/weights.dat2 $datadir/weights.dat3 > $tmpfile.ss");
|
|
448 }
|
|
449
|
|
450 &HHPaths::System("$execdir/psipass2 $datadir/weights_p2.dat 1 0.98 1.09 $tmpfile.ss2 $tmpfile.ss > $tmpfile.horiz");
|
|
451
|
|
452 # Remove temporary files
|
|
453 if ($v<=3) { unlink(split ' ', "$tmpfile.pn $tmpfile.sn $tmpfile.mn $tmpfile.chk $tmpfile.blalog $tmpfile.mtx $tmpfile.aux $tmpfile.ss $tmpfile.ss2 $tmpfile.sq");}
|
|
454 return;
|
|
455 }
|
|
456
|
|
457 ##############################################################################################
|
|
458 # Read query sequence and extract dssp sequence
|
|
459 ##############################################################################################
|
|
460 sub AppendDsspSequences() {
|
|
461 my $qfile=$_[0];
|
|
462
|
|
463 my $line; #input line
|
|
464 my $name; #name of sequence in in file, e.g. d1g8ma1
|
|
465 my $qrange; #chain and residue range of query sequence
|
|
466 my $aas=""; #amino acids from in file for each $name
|
|
467
|
|
468 my $dsspfile;
|
|
469 my $pdbfile;
|
|
470 my $pdbcode; #pdb code for accessing dssp file; shortened from in code, e.g. 1g8m
|
|
471 my @ss_dssp=(); #dssp states for residues (H,E,L)
|
|
472 my @sa_dssp=(); #dssp states for residues (H,E,L)
|
|
473 my @aa_dssp=(); #residues in dssp file
|
|
474 my @aa_astr=(); #residues from infile
|
|
475 my $length; #length of sequence
|
|
476
|
|
477 # Default parameters for Align.pm
|
|
478 our $d=3; # gap opening penatlty for Align.pm
|
|
479 our $e=0.1; # gap extension penatlty for Align.pm
|
|
480 our $g=0.09; # endgap penatlty for Align.pm
|
|
481 our $matrix="identity";
|
|
482
|
|
483 # Read query sequence -> $name, $nameline, $range, $aas
|
|
484 open (QFILE, "<$qfile") || die ("cannot open $qfile: $!");
|
|
485 while ($line=<QFILE>) {
|
|
486 if ($line=~/>(\S+)/) {
|
|
487 $name=$1;
|
|
488
|
|
489 # SCOP ID? (d3lkfa_,d3grs_3,d3pmga1,g1m26.1)
|
|
490 if ($line=~/^>[defgh](\d[a-z0-9]{3})[a-z0-9_.][a-z0-9_]\s+[a-z]\.\d+\.\d+\.\d+\s+\((\S+)\)/) {
|
|
491 $pdbcode=$1;
|
|
492 $qrange=$2;
|
|
493 }
|
|
494
|
|
495 # PDB ID? (8fab_A, 1a0i)
|
|
496 elsif ($line=~/^>(\d[a-z0-9]{3})_?(\S?)\s/) {
|
|
497 $pdbcode=$1;
|
|
498 if ($2 ne "") {$qrange="$2:";} else {$qrange="-";}
|
|
499 }
|
|
500
|
|
501 # DALI ID? (8fabA_0,1a0i_2)
|
|
502 elsif ($line=~/^>(\d[a-z0-9]{3})[A-Za-z0-9]?_\d+\s+\d+\.\d+.\d+.\d+.\d+.\d+\s+\((\S+)\)/) {
|
|
503 $pdbcode=$1;
|
|
504 $qrange=$2;
|
|
505 }
|
|
506
|
|
507 else {
|
|
508 if ($v>=3) {print("Warning: no pdb code found in sequence name '$name'\n");}
|
|
509 close(QFILE);
|
|
510 return 1; # no astral/DALI/pdb sequence => no dssp states available
|
|
511 }
|
|
512 $aas="";
|
|
513
|
|
514 }
|
|
515 else
|
|
516 {
|
|
517 chomp($line);
|
|
518 $line=~tr/a-z \t/A-Z/d;
|
|
519 $aas.=$line;
|
|
520 }
|
|
521 }
|
|
522 close(QFILE);
|
|
523 if ($v>=3) {printf("Searching DSSP state assignments: name=%s range=%s\n",$name,$qrange);}
|
|
524
|
|
525 # Try to open dssp file
|
|
526 $dsspfile="$dsspdir/$pdbcode.dssp";
|
|
527 if (! open (DSSPFILE, "<$dsspfile")) {
|
|
528 if ($v>=3) {printf(STDERR "Warning in $program: Cannot open $dsspfile!\n");}
|
|
529 $pdbfile = &OpenPDBfile($pdbcode);
|
|
530 if ($pdbfile eq "") {return;}
|
|
531
|
|
532 system("$dssp $pdbfile $tmpfile.dssp 2> /dev/null");
|
|
533 system("cp $tmpfile.dssp $dsspfile 2> /dev/null");
|
|
534 $dsspfile="$tmpfile.dssp";
|
|
535 if (! open (DSSPFILE, "<$dsspfile")) {
|
|
536 if ($v>=3) {printf(STDERR "Warning in $program: dssp couldn't generate file from $pdbfile. Skipping $name\n");}
|
|
537 return 1;
|
|
538 }
|
|
539 }
|
|
540
|
|
541 #....+....1....+....2....+....3....+....4
|
|
542 # # RESIDUE AA STRUCTURE BP1 BP2 ACC etc.
|
|
543 # 623 630 A R < 0 0 280 etc.
|
|
544 # 624 !* 0 0 0 etc.
|
|
545 # 625 8 B A 0 0 105 etc.
|
|
546 # 626 9 B P >> - 0 0 71 etc.
|
|
547 # 292 28SA K H 4 S+ 0 0 71 etc. (1qdm.dssp)
|
|
548 # 293 29SA K H > S+ 0 0 28 etc.
|
|
549
|
|
550 # Read in whole DSSP file
|
|
551 for (my $try = 1; $try<=2; $try++) {
|
|
552 $aa_dssp="";
|
|
553 $sa_dssp="";
|
|
554 $ss_dssp="";
|
|
555 while ($line=<DSSPFILE>) {if ($line=~/^\s*\#\s*RESIDUE\s+AA/) {last;}}
|
|
556 while ($line=<DSSPFILE>)
|
|
557 {
|
|
558 if ($line=~/^.{5}(.{5})(.)(.)\s(.).\s(.).{18}(...)/)
|
|
559 {
|
|
560 my $thisres=$1;
|
|
561 my $icode=$2;
|
|
562 my $chain=$3;
|
|
563 my $aa=$4;
|
|
564 my $ss=$5;
|
|
565 my $sa=$6;
|
|
566 my $contained=0;
|
|
567 my $range=$qrange;
|
|
568 if ($aa eq "!") {next;} # missing residues!
|
|
569 $thisres=~tr/ //d;
|
|
570 $chain=~tr/ //d;
|
|
571 $icode=~tr/ //d;
|
|
572 $sa=~tr/ //d;
|
|
573 if ($try==1) {
|
|
574 do{
|
|
575 if ($range=~s/^(\S):(-?\d+)[A-Z]-(\d+)([A-Z])// && $chain eq $1 && $icode eq $4 && $2<=$thisres && $thisres<=$3) {
|
|
576 $contained=1; #syntax (A:56S-135S)
|
|
577 }
|
|
578 elsif ($range=~s/^(\S):(-?\d+)[A-Z]?-(\d+)[A-Z]?// && $chain eq $1 && $2<=$thisres && $thisres<=$3) {
|
|
579 $contained=1; #syntax (R:56-135)
|
|
580 }
|
|
581 elsif ($range=~s/^(-?\d+)[A-Z]-(\d+)([A-Z])// && $chain eq "" && $icode eq $3 && $1<=$thisres && $thisres<=$2) {
|
|
582 $contained=1; #syntax (56-135)
|
|
583 }
|
|
584 elsif ($range=~s/^(-?\d+)[A-Z]?-(\d+)[A-Z]?// && $chain eq "" && $1<=$thisres && $thisres<=$2) {
|
|
585 $contained=1; #syntax (56-135)
|
|
586 }
|
|
587 elsif ($range=~s/^(\S):// && $chain eq $1) {
|
|
588 $contained=1; #syntax (A:) or (A:,2:)
|
|
589 }
|
|
590 elsif ($range=~s/^-$// && $chain eq "") {
|
|
591 $contained=1; #syntax (-)
|
|
592 }
|
|
593 $range=~s/^,//;
|
|
594 # print("qrange=$qrange range='$range' ires=$thisres chain=$chain contained=$contained\n");
|
|
595 } while($contained==0 && $range ne "");
|
|
596 if ($contained==0) {next;}
|
|
597 } # end if try==1
|
|
598 $aa_dssp.=$aa;
|
|
599 $ss_dssp.=$ss;
|
|
600 $sa_dssp.=&sa2c($sa,$aa);
|
|
601 }
|
|
602 }
|
|
603 # if not enough residues were found: chain id is wrong => repeat extraction without checking chain id
|
|
604 if (length($aa_dssp)>=10) {last;}
|
|
605 close(DSSPFILE);
|
|
606 open (DSSPFILE, "<$dsspfile");
|
|
607 }
|
|
608 close(DSSPFILE);
|
|
609
|
|
610 if (length($aa_dssp)==0) {print("WARNING: no residues found in $dsspdir/$pdbcode.dssp\n"); return 1;}
|
|
611 if (length($aa_dssp)<=20) {printf("WARNING: only %i residues found in $dsspdir/$pdbcode.dssp\n",length($aa_dssp)); return 1;}
|
|
612
|
|
613 # Postprocess $aa_dssp etc
|
|
614 $aa_dssp =~ tr/a-z/CCCCCCCCCCCCCCCCCCCCCCCCCC/;
|
|
615 $ss_dssp =~ tr/ I/CC/;
|
|
616 $ss_dssp =~ s/ \S / /g;
|
|
617 $ss_dssp =~ s/ \S\S / /g;
|
|
618
|
|
619 # Align query with dssp sequence
|
|
620 $aa_astr = $aas;
|
|
621 $xseq=$aas;
|
|
622 $yseq=$aa_dssp;
|
|
623 my ($imax,$imin,$jmax,$jmin);
|
|
624 my (@i,@j);
|
|
625 my $score=&AlignNW(\$xseq,\$yseq,\@i,\@j,\$imin,\$imax,\$jmin,\$jmax,\$Sstr);
|
|
626
|
|
627 # Initialize strings (=arrays) for dssp states with "----...-"
|
|
628 my @ss_dssp_ali=(); # $ss_dssp_ali[$i] is dssp state aligned to $aa_astr[$i]
|
|
629 my @sa_dssp_ali=(); # $sa_dssp_ali[$i] is solvent accessibility string
|
|
630 my @aa_dssp_ali=(); # $aa_dssp_ali[$i] is dssp residue aligned to $aa_astr[$i]
|
|
631 for (my $i=0; $i<=length($aa_astr); $i++) { # sum up to len+1
|
|
632 # because 0'th element in @ss_dssp and @aa_dssp is dummy "-"
|
|
633 $ss_dssp_ali[$i]="-";
|
|
634 $sa_dssp_ali[$i]="-";
|
|
635 $aa_dssp_ali[$i]="-";
|
|
636 }
|
|
637
|
|
638 # To each residue (from i=0 to len-1) of input sequence $aa_astr assign aligned dssp state
|
|
639 @ss_dssp = split(//,$ss_dssp);
|
|
640 @sa_dssp = split(//,$sa_dssp);
|
|
641 @aa_dssp = split(//,$aa_dssp);
|
|
642 @aa_astr = split(//,$aa_astr);
|
|
643 my $len = 0;
|
|
644 unshift(@aa_dssp,"-"); #add a gap symbol at beginning -> first residue is at 1!
|
|
645 unshift(@ss_dssp,"-"); #add a gap symbol at beginning -> first residue is at 1!
|
|
646 unshift(@sa_dssp,"-"); #add a gap symbol at beginning -> first residue is at 1!
|
|
647 unshift(@aa_astr,"-"); #add a gap symbol at beginning -> first residue is at 1!
|
|
648 for (my $col=0; $col<@i; $col++) {
|
|
649 if ($i[$col]>0) {
|
|
650 if ($j[$col]>0) {$len++;} # count match states (for score/len calculation)
|
|
651 $ss_dssp_ali[$i[$col]]=$ss_dssp[$j[$col]];
|
|
652 $sa_dssp_ali[$i[$col]]=$sa_dssp[$j[$col]];
|
|
653 $aa_dssp_ali[$i[$col]]=$aa_dssp[$j[$col]];
|
|
654 }
|
|
655 if ($v>=4) {
|
|
656 printf ("%s %3i %s %3i\n",$aa_astr[$i[$col]],$i[$col],$aa_dssp[$j[$col]],$j[$col]);
|
|
657 }
|
|
658 }
|
|
659 shift (@ss_dssp_ali); # throw out first "-"
|
|
660 shift (@sa_dssp_ali); # throw out first "-"
|
|
661 shift (@aa_dssp_ali); # throw out first "-"
|
|
662 $aa_dssp=join("",@aa_dssp_ali);
|
|
663 $ss_dssp=join("",@ss_dssp_ali);
|
|
664 $sa_dssp=join("",@sa_dssp_ali);
|
|
665
|
|
666 # Debugging output
|
|
667 if ($v>=4) {printf(STDOUT "DSSP: %s: length=%-3i score/len:%-5.3f\n",$name,$len,$score/$len);}
|
|
668 if ($v>=4) {
|
|
669 printf("IN: %s\n",$xseq);
|
|
670 printf("MATCH: %s\n",$Sstr);
|
|
671 printf("DSSP: %s\n",$yseq);
|
|
672 printf("\n");
|
|
673 printf(">ss_dssp $name\n$ss_dssp\n");
|
|
674 printf(">sa_dssp $name\n$sa_dssp\n");
|
|
675 printf(">aa_dssp $name\n$aa_dssp\n");
|
|
676 printf(">aa_astra $name\n$aa_astr\n\n");
|
|
677 }
|
|
678 if ($score/$len<0.5) {
|
|
679 printf (STDOUT "\nWARNING: in $name: alignment score with dssp residues too low: Score/len=%f.\n\n",$score/$len);
|
|
680 printf("IN: %s\n",$xseq);
|
|
681 printf("MATCH: %s\n",$Sstr);
|
|
682 printf("DSSP: %s\n",$yseq);
|
|
683 return 1;
|
|
684 }
|
|
685
|
|
686 return 0;
|
|
687 }
|
|
688
|
|
689 ################################################################################################
|
|
690 ### Return solvent accessibility code
|
|
691 ################################################################################################
|
|
692 sub sa2c ()
|
|
693 {
|
|
694 my %maxsa = (A=>106, B=>160, C=>135, D=>163, E=>194, F=>197, G=>84, H=>184, I=>169, K=>205, L=>164, M=>188,
|
|
695 N=>157, P=>136, Q=>198, R=>248, S=>130, T=>142, V=>142, W=>227, X=>180, Y=>222, Z=>196); # maximum solvent accessiblity
|
|
696 if ($_[1]=~/[a-z]/) {return "F";} # disulphide bridge
|
|
697 if (!defined $maxsa{$_[1]}) {return "-";} # no amino acid
|
|
698 my $rsa=$_[0]/$maxsa{$_[1]};
|
|
699 # printf("aa=%s sa=%5.1f max_sa=%5.1f rsa=%5.3f\n",$_[1],$_[0],$maxsa{$_[1]},$rsa);
|
|
700 if ($rsa<=0.02) {return "A";}
|
|
701 elsif ($rsa<=0.14) {return "B";}
|
|
702 elsif ($rsa<=0.33) {return "C";}
|
|
703 elsif ($rsa<=0.55) {return "D";}
|
|
704 else {return "E";}
|
|
705 }
|
|
706
|
|
707 # Find the pdb file with $pdbcode in pdb directory
|
|
708 sub OpenPDBfile() {
|
|
709
|
|
710 my $pdbcode=lc($_[0]);
|
|
711 if (! -e "$pdbdir") {
|
|
712 if ($v>=3) {print(STDERR "Warning in $program: pdb directory '$pdbdir' does not exist!\n");}
|
|
713 return 1;
|
|
714 }
|
|
715 if (-e "$pdbdir/all") {$pdbfile="$pdbdir/all/";}
|
|
716 elsif (-e "$pdbdir/divided") {$pdbfile="$pdbdir/divided/".substr($pdbcode,1,2)."/";}
|
|
717 else {$pdbfile="$pdbdir/";}
|
|
718 if ($pdbdir=~/divided.?$/) {$pdbfile.=substr($pdbcode,1,2)."/";}
|
|
719 if (-e $pdbfile."pdb$pdbcode.ent") {$pdbfile.="pdb$pdbcode.ent";}
|
|
720 elsif (-e $pdbfile."pdb$pdbcode.ent.gz") {$pdbfile="gunzip -c $pdbfile"."pdb$pdbcode.ent.gz |";}
|
|
721 elsif (-e $pdbfile."pdb$pdbcode.ent.Z") {$pdbfile="gunzip -c $pdbfile"."pdb$pdbcode.ent.Z |";}
|
|
722 elsif (-e $pdbfile."$pdbcode.pdb") {$pdbfile."$pdbcode.pdb";}
|
|
723 else {
|
|
724 if ($v>=3) {printf(STDERR "Warning in $program: Cannot find pdb file $pdbfile"."pdb$pdbcode.ent!\n");}
|
|
725 return "";
|
|
726 }
|
|
727 if (!open (PDBFILE, "$pdbfile")) {
|
|
728 if ($v>=3) {printf(STDERR "Error in $program: Cannot open pdb file: $!\n");}
|
|
729 return "";
|
|
730 }
|
|
731 return $pdbfile;
|
|
732 }
|