Mercurial > repos > hammock > hammock
comparison external_tools/linux/lib/hh/scripts/addss.pl @ 6:2277dd59b9f9 draft
Uploaded
author | hammock |
---|---|
date | Wed, 01 Nov 2017 05:54:28 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
5:b7652b7c97bd | 6:2277dd59b9f9 |
---|---|
1 #!/usr/bin/env perl | |
2 # | |
3 # addss.pl | |
4 # Add PSIPRED secondary structure prediction (and DSSP annotation) to an MSA or HMMER file. | |
5 # Output format is A3M (for input alignments) or HMMER (see User Guide). | |
6 | |
7 | |
8 # HHsuite version 2.0.16 (January 2013) | |
9 # | |
10 # Reference: | |
11 # Remmert M., Biegert A., Hauser A., and Soding J. | |
12 # HHblits: Lightning-fast iterative protein sequence searching by HMM-HMM alignment. | |
13 # Nat. Methods, epub Dec 25, doi: 10.1038/NMETH.1818 (2011). | |
14 | |
15 # (C) Johannes Soeding and Michael Remmert, 2012 | |
16 | |
17 # This program is free software: you can redistribute it and/or modify | |
18 # it under the terms of the GNU General Public License as published by | |
19 # the Free Software Foundation, either version 3 of the License, or | |
20 # (at your option) any later version. | |
21 | |
22 # This program is distributed in the hope that it will be useful, | |
23 # but WITHOUT ANY WARRANTY; without even the implied warranty of | |
24 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
25 # GNU General Public License for more details. | |
26 | |
27 # You should have received a copy of the GNU General Public License | |
28 # along with this program. If not, see <http://www.gnu.org/licenses/>. | |
29 | |
30 # We are very grateful for bug reports! Please contact us at soeding@genzentrum.lmu.de | |
31 | |
32 use lib $ENV{"HHLIB"}."/scripts"; | |
33 use HHPaths; # config file with path variables for nr, blast, psipred, pdb, dssp etc. | |
34 use Align; # Needleman-Wunsch and Smith-Waterman alignment functions | |
35 use File::Temp qw/ tempfile tempdir /; | |
36 use strict; | |
37 | |
38 my $ss_cit="PSIPRED: Jones DT. (1999) Protein secondary structure prediction based on position-specific scoring matrices. JMB 292:195-202."; | |
39 | |
40 | |
41 # Module needed for aligning DSSP-sequence | |
42 | |
43 $|= 1; # Activate autoflushing on STDOUT | |
44 | |
45 # Default values: | |
46 our $v=2; # verbose mode | |
47 | |
48 my $numres=0; # number of residues per line for secondary structure | |
49 my $informat="a3m"; # input format | |
50 my $neff = 7; # use alignment with this diversity for PSIPRED prediction | |
51 my $program=$0; # name of perl script | |
52 my $pdbfile; | |
53 | |
54 my $help=" | |
55 addss.pl from HHsuite $VERSION | |
56 Add PSIPRED secondary structure prediction (and DSSP annotation) to a multiple sequence alignment (MSA) | |
57 or HMMER (multi-)model file. | |
58 | |
59 If the input file is an MSA, the predicted secondary structure and confidence values are added as | |
60 special annotation sequences with names >ss_pred, >ss_conf, and >ss_dssp to the top of the output | |
61 A3M alignment. If no output file is given, the output file will have the same name as the input file, | |
62 except for the extension being replaced by '.a3m'. Allowed input formats are A3M (default), | |
63 A2M/FASTA (-fas, -a2m), CLUSTAL (-clu), STOCKHOLM (-sto), HMMER (-hmm). | |
64 | |
65 If the input file contains HMMER models, records SSPRD and SSCON containing predicted secondary | |
66 structure and confidence values are added to each model. In this case the output file name is | |
67 obligatory and must be different from the input file name. | |
68 | |
69 Usage: perl addss.pl <ali_file> [<outfile>] [-fas|-a3m|-clu|-sto] | |
70 or perl addss.pl <hhm_file> <outfile> -hmm | |
71 \n"; | |
72 | |
73 # Variable declarations | |
74 my $line; | |
75 my @seqs; # sequences from infile (except >aa_ and >ss_pred sequences) | |
76 my $query_length; | |
77 my $header; # header of MSA: everything before first '>' | |
78 my $name; # query in fasta format: '>$name [^\n]*\n$qseq\n' | |
79 my $qseq; # residues of query sequence | |
80 my $infile; | |
81 my $outfile; | |
82 my $ss_pred=""; # psipred ss states | |
83 my $ss_conf=""; # psipred confidence values | |
84 my $ss_dssp; # dssp states as string | |
85 my $sa_dssp; # relative solvent accessibility from dssp as string {A,B,C,D,E} A:absolutely buried, B:buried, E:exposed | |
86 my $aa_dssp; # residues from dssp file as string | |
87 my $aa_astr; # residues from infile as string | |
88 my $q_match; # number of match states in query sequence | |
89 my $xseq; # sequence x returned from Align.pm | |
90 my $yseq; # sequence y returned from Align.pm | |
91 my $Sstr; # match sequence returned from Align.pm | |
92 | |
93 ############################################################################################### | |
94 # Processing command line input | |
95 ############################################################################################### | |
96 | |
97 if (@ARGV<1) {die ($help);} | |
98 | |
99 my $options=""; | |
100 for (my $i=0; $i<@ARGV; $i++) {$options.=" $ARGV[$i] ";} | |
101 | |
102 #Input format fasta? | |
103 if ($options=~s/ -fas\s/ /g) {$informat="fas";} | |
104 elsif ($options=~s/ -a2m\s/ /g) {$informat="a2m";} | |
105 elsif ($options=~s/ -a3m\s/ /g) {$informat="a3m";} | |
106 elsif ($options=~s/ -clu\s/ /g) {$informat="clu";} | |
107 elsif ($options=~s/ -sto\s/ /g) {$informat="sto";} | |
108 elsif ($options=~s/ -hmm\s/ /g) {$informat="hmm";} | |
109 | |
110 if ($options=~s/ -v\s+(\d+) / /g) {$v=$1;} | |
111 | |
112 # Set input and output file | |
113 if ($options=~s/ -i\s+(\S+) //) {$infile=$1;} | |
114 if ($options=~s/ -o\s+(\S+) //) {$outfile=$1;} | |
115 if ($options=~s/^\s*([^-]\S*) //) {$infile=$1;} | |
116 if ($options=~s/^\s*([^-]\S*) //) {$outfile=$1;} | |
117 | |
118 # Warn if unknown options found or no infile/outfile | |
119 if ($options!~/^\s*$/) {$options=~s/^\s*(.*?)\s*$/$1/g; die("Error: unknown options '$options'\n");} | |
120 if (!$infile) {print($help); exit(1);} | |
121 | |
122 my $v2 = $v-1; | |
123 if ($v2>2) {$v2--;} | |
124 if ($v2<0) {$v2=0;} | |
125 | |
126 if ($informat eq "hmm" && !$outfile) { | |
127 print("Error: no output file given. With the -hmm option an output file is obligatory\n"); exit(1); | |
128 } | |
129 | |
130 ############################################################################################### | |
131 # Reformat input alignment to a3m and psiblast-readable format and generate file with query sequence | |
132 ############################################################################################### | |
133 | |
134 my $inbase; # $inbasename of infile: remove extension | |
135 my $inroot; # $inbasename of infile: remove path and extension | |
136 if ($infile=~/(.*)\..*/) {$inbase=$1;} else {$inbase=$infile;} # remove extension | |
137 if ($inbase=~/.*\/(.*)/) {$inroot=$1;} else {$inroot=$inbase;} # remove path | |
138 | |
139 # Create tmpfile | |
140 my $tmpdir; | |
141 if ($v<=3) {$tmpdir = tempdir( CLEANUP => 1);} else {$tmpdir = tempdir( CLEANUP => 0);} | |
142 my ($tmpf, $tmpfile) = tempfile( DIR => $tmpdir ); | |
143 my $tmpfile_no_dir; | |
144 if ($tmpfile=~/.*\/(.*)/) {$tmpfile_no_dir=$1;} else {$tmpfile_no_dir=$tmpfile;} # remove path | |
145 | |
146 | |
147 | |
148 ############################################################################################ | |
149 | |
150 if ($informat ne "hmm") { | |
151 if (!$outfile) {$outfile="$inbase.a3m";} | |
152 | |
153 # Use first sequence to define match states and reformat input file to a3m and psi | |
154 if ($informat ne "a3m") { | |
155 &HHPaths::System("$hhscripts/reformat.pl -v $v2 -M first $informat a3m $infile $tmpfile.in.a3m"); | |
156 } else { | |
157 &HHPaths::System("cp $infile $tmpfile.in.a3m"); | |
158 } | |
159 | |
160 # Read query sequence | |
161 open (INFILE, "<$tmpfile.in.a3m") or die ("ERROR: cannot open $tmpfile.in.a3m!\n"); | |
162 $/=">"; # set input field separator | |
163 my $i=0; | |
164 $qseq=""; | |
165 $header = <INFILE>; | |
166 $header =~s />$//; | |
167 while ($line=<INFILE>) { | |
168 $line=~s/>$//; | |
169 if ($line=~/^ss_/ || $line=~/^aa_/) {next;} | |
170 $seqs[$i++]=">$line"; | |
171 if(!$qseq) { | |
172 $line=~s/^(.*)[^\n]*//; | |
173 $name=$1; | |
174 $qseq=$line; | |
175 $qseq=~s/\n//g; | |
176 } | |
177 } | |
178 close(INFILE); | |
179 $/="\n"; # set input field separator | |
180 | |
181 if ($qseq =~ /\-/) { | |
182 | |
183 # First sequence contains gaps => calculate consensus sequence | |
184 &HHPaths::System("hhconsensus -i $tmpfile.in.a3m -s $tmpfile.sq -o $tmpfile.in.a3m > /dev/null"); | |
185 | |
186 } else { | |
187 | |
188 $query_length = ($qseq=~tr/A-Z/A-Z/); | |
189 $qseq=~tr/A-Z//cd; # remove everything except capital letters | |
190 | |
191 # Write query sequence file in FASTA format | |
192 open (QFILE, ">$tmpfile.sq") or die("ERROR: can't open $tmpfile.sq: $!\n"); | |
193 printf(QFILE ">%s\n%s\n",$name,$qseq); | |
194 close (QFILE); | |
195 } | |
196 | |
197 # Filter alignment to diversity $neff | |
198 if ($v>=1) {printf ("Filtering alignment to diversity $neff ...\n");} | |
199 &HHPaths::System("hhfilter -v $v2 -neff $neff -i $tmpfile.in.a3m -o $tmpfile.in.a3m"); | |
200 | |
201 # Reformat into PSI-BLAST readable file for jumpstarting | |
202 &HHPaths::System("$hhscripts/reformat.pl -v $v2 -r -noss a3m psi $tmpfile.in.a3m $tmpfile.in.psi"); | |
203 | |
204 open (ALIFILE, ">$outfile") || die("ERROR: cannot open $outfile: $!\n"); | |
205 printf (ALIFILE "%s",$header); | |
206 | |
207 # Add DSSP sequence (if available) | |
208 if ($dssp ne "") { | |
209 if (!&AppendDsspSequences("$tmpfile.sq")) { | |
210 if ($numres) { | |
211 $ss_dssp=~s/(\S{$numres})/$1\n/g; # insert a line break every $numres residues | |
212 } | |
213 printf (ALIFILE ">ss_dssp\n%s\n",$ss_dssp); | |
214 if ($v>=1) {print("\nAdding DSSP state sequence ...\n");} | |
215 } | |
216 } | |
217 | |
218 # Secondary structure prediction with psipred | |
219 if ($v>=2) {print("Predicting secondary structure with PSIPRED ... ");} | |
220 &RunPsipred("$tmpfile.sq"); | |
221 | |
222 if (open (PSIPREDFILE, "<$tmpfile.horiz")) { | |
223 $ss_conf=""; | |
224 $ss_pred=""; | |
225 # Read Psipred file | |
226 while ($line=<PSIPREDFILE>) { | |
227 if ($line=~/^Conf:\s+(\S+)/) {$ss_conf.=$1;} | |
228 elsif ($line=~/^Pred:\s+(\S+)/) {$ss_pred.=$1;} | |
229 } | |
230 close(PSIPREDFILE); | |
231 $ss_conf=~tr/0-9/0/c; # replace all non-numerical symbols with a 0 | |
232 if ($numres) { | |
233 $ss_pred=~s/(\S{$numres})/$1\n/g; # insert a line break every $numres residues | |
234 $ss_conf=~s/(\S{$numres})/$1\n/g; # insert a line break every $numres residues | |
235 } | |
236 printf(ALIFILE ">ss_pred PSIPRED predicted secondary structure\n%s\n",$ss_pred); | |
237 printf(ALIFILE ">ss_conf PSIPRED confidence values\n%s\n",$ss_conf); | |
238 } | |
239 | |
240 # Append alignment sequences to psipred sequences | |
241 for ($i=0; $i<@seqs; $i++) { | |
242 printf(ALIFILE "%s",$seqs[$i]); | |
243 } | |
244 close(ALIFILE); | |
245 if ($v>=2) {print("done \n");} | |
246 } | |
247 ############################################################## | |
248 # HMMER format | |
249 else | |
250 { | |
251 if (!$outfile) {$outfile="$inbase.hmm";} | |
252 | |
253 my $log2 = log(2); | |
254 my @logoddsmat; | |
255 my @lines; | |
256 my $length; | |
257 my $query; | |
258 my $scale=0.13; # empirically determined scale factor between HMMER bit score and PSI-BLAST score, 0.3 for HMMER3 | |
259 my $acc; | |
260 my $name; | |
261 my $desc; | |
262 my $nmodels=0; | |
263 | |
264 open (INFILE, "<$infile") || die("ERROR: cannot open $infile: $!\n"); | |
265 open (OUTFILE, ">$outfile") || die("ERROR: cannot open $outfile: $!\n"); | |
266 | |
267 # Read HMMER file model by model | |
268 while ($line=<INFILE>) { | |
269 # Search for start of next model | |
270 while ($line && $line!~/^HMMER/ && $line!~/^NAME /) { | |
271 $line=<INFILE>; | |
272 } | |
273 if ($line=~/^HMMER3/) { | |
274 | |
275 $scale = 0.3; | |
276 @logoddsmat=(); | |
277 @lines=($line); | |
278 | |
279 while ($line=<INFILE>) {push(@lines,$line); if ($line=~/^LENG/) {last;}} | |
280 $line=~/^LENG\s+(\d+)/; | |
281 $length=$1; # number of match states in HMM | |
282 $query=""; # query residues from NULL emission lines | |
283 while ($line=<INFILE>) {push(@lines,$line); if ($line=~/^\s*m->m/) {last;}} | |
284 push(@lines,$line=<INFILE>); | |
285 if ($line !~ /^\s*COMPO/) { | |
286 die("Error: need null-model probablities (Parameter COMPO)!\n"); | |
287 } | |
288 $line=~s/^\s*COMPO\s+(\S.*\S)\s*$/$1/; | |
289 my @nullmodel = split(/\s+/,$line); | |
290 @nullmodel = map {$_ = exp(-1 * $_)} @nullmodel; # Transform to probabilities | |
291 push(@lines,$line=<INFILE>); # state 0 insert emission | |
292 push(@lines,$line=<INFILE>); # transisitions from begin state | |
293 | |
294 while ($line=<INFILE>) { | |
295 push(@lines,$line); | |
296 if ($line=~/^\/\//) {last;} | |
297 $line=~s/^\s*\d+\s+(\S.*\S)\s+\d+\s+(\S)\s+\S\s*$/$1/; | |
298 $query .= $2; | |
299 my @probs = split(/\s+/,$line); | |
300 @probs = map {$_ = exp(-1 * $_)} @probs; # Transform to probabilities | |
301 # calculate log-odds | |
302 my @logodds = (); | |
303 for (my $a = 0; $a < scalar(@probs); $a++) { | |
304 my $logodd = (log($probs[$a] / $nullmodel[$a]) / $log2) * 1000; | |
305 push(@logodds, $logodd); | |
306 } | |
307 | |
308 push(@logoddsmat,\@logodds); | |
309 push(@lines,$line=<INFILE>); | |
310 push(@lines,$line=<INFILE>); | |
311 } | |
312 | |
313 } else { | |
314 | |
315 $scale=0.13; | |
316 if ($line!~/^HMMER/ && $line!~/^NAME /) {last;} # first line in each model must begin with 'HMMER...' | |
317 @logoddsmat=(); | |
318 @lines=($line); | |
319 | |
320 while ($line=<INFILE>) {push(@lines,$line); if ($line=~/^LENG/) {last;}} | |
321 $line=~/^LENG\s+(\d+)/; | |
322 $length=$1; # number of match states in HMM | |
323 $query=""; # query residues from NULL emission lines | |
324 while ($line=<INFILE>) {push(@lines,$line); if ($line=~/^\s*m->m/) {last;}} | |
325 push(@lines,$line=<INFILE>); | |
326 while ($line=<INFILE>) { | |
327 push(@lines,$line); | |
328 if ($line=~/^\/\//) {last;} | |
329 $line=~s/^\s*\d+\s+(\S.*\S)\s*$/$1/; | |
330 my @logodds = split(/\s+/,$line); | |
331 push(@logoddsmat,\@logodds); | |
332 push(@lines,$line=<INFILE>); | |
333 $line=~/^\s*(\S)/; | |
334 $query .= $1; | |
335 push(@lines,$line=<INFILE>); | |
336 } | |
337 } | |
338 | |
339 # Write mtx matrix | |
340 open (MTXFILE, ">$tmpfile.mtx") || die("ERROR: cannot open $tmpfile.mtx: $!\n"); | |
341 printf(MTXFILE "%i\n",$length); | |
342 printf(MTXFILE "%s\n",$query); | |
343 printf(MTXFILE "2.670000e-03\n4.100000e-02\n-3.194183e+00\n1.400000e-01\n2.670000e-03\n4.420198e-02\n-3.118986e+00\n1.400000e-01\n3.176060e-03\n1.339561e-01\n-2.010243e+00\n4.012145e-01\n"); | |
344 while (@logoddsmat) { | |
345 my @logodds = @{shift(@logoddsmat)}; | |
346 print(MTXFILE "-32768 "); | |
347 splice(@logodds, 1,0,-32768/$scale); # insert logodds value for B | |
348 splice(@logodds,20,0, -100/$scale); # insert logodds value for X | |
349 splice(@logodds,22,0,-32768/$scale); # insert logodds value for Z | |
350 for (my $i=0; $i<23; $i++) { | |
351 printf(MTXFILE "%4.0f ",$scale*$logodds[$i]); | |
352 } | |
353 print(MTXFILE "-32768 -400\n"); | |
354 } | |
355 close(MTXFILE); | |
356 | |
357 # Start Psiblast from checkpoint file tmp.chk that was generated to build the profile | |
358 if (-e "$datadir/weights.dat4") { # Psipred version < 3.0 | |
359 &HHPaths::System("$execdir/psipred $tmpfile.mtx $datadir/weights.dat $datadir/weights.dat2 $datadir/weights.dat3 $datadir/weights.dat4 > $tmpfile.ss"); | |
360 } else { | |
361 &HHPaths::System("$execdir/psipred $tmpfile.mtx $datadir/weights.dat $datadir/weights.dat2 $datadir/weights.dat3 > $tmpfile.ss"); | |
362 } | |
363 | |
364 # READ PSIPRED file | |
365 if (open (PSIPRED, "$execdir/psipass2 $datadir/weights_p2.dat 1 0.98 1.09 $tmpfile.ss2 $tmpfile.ss |")) { | |
366 $ss_conf=""; | |
367 $ss_pred=""; | |
368 # Read Psipred file | |
369 while ($line=<PSIPRED>) { | |
370 if ($line=~/^Conf:\s+(\d+)/) {$ss_conf.=$1;} | |
371 elsif ($line=~/^Pred:\s+(\S+)/) {$ss_pred.=$1;} | |
372 } | |
373 close(PSIPREDFILE); | |
374 } | |
375 | |
376 # Add secondary structure to HMMER output file and print | |
377 foreach $line (@lines) { | |
378 if ($line=~/^SSPRD/ || $line=~/^SSCON/|| $line=~/^SSCIT/) {next;} | |
379 if ($line=~/^HMM /) { | |
380 $ss_pred=~s/(\S{80})/$1\nSSPRD /g; # insert a line break every 80 residues | |
381 $ss_conf=~s/(\S{80})/$1\nSSCON /g; # insert a line break every 80 residues | |
382 printf(OUTFILE "SSCIT HHsearch-readable PSIPRED secondary structure prediction:\n"); | |
383 printf(OUTFILE "SSPRD %s\n",$ss_pred); | |
384 printf(OUTFILE "SSCON %s\n",$ss_conf); | |
385 printf(OUTFILE "SSCIT %s\n",$ss_cit); | |
386 } | |
387 printf(OUTFILE $line); | |
388 } | |
389 $nmodels++; | |
390 } | |
391 | |
392 close(OUTFILE); | |
393 close(INFILE); | |
394 &HHPaths::System("rm $tmpfile.mtx $tmpfile.ss $tmpfile.ss2"); | |
395 if ($v>=2) {printf("Added PSIPRED secondary structure to %i models\n",$nmodels);} | |
396 } | |
397 | |
398 if ($v<=3) { | |
399 unlink("$tmpfile.in.a3m"); | |
400 unlink("$tmpfile.in.psi"); | |
401 unlink("$tmpfile.horiz"); | |
402 unlink("$tmpfile.dssp"); | |
403 } | |
404 | |
405 exit; | |
406 | |
407 ############################################################################################## | |
408 # Run SS prediction starting from alignment in $tmpfile.in.psi (called by BuildAlignment) | |
409 ############################################################################################## | |
410 sub RunPsipred() { | |
411 # This is a simple script which will carry out all of the basic steps | |
412 # required to make a PSIPRED V2 prediction. Note that it assumes that the | |
413 # following programs are in the appropriate directories: | |
414 # blastpgp - PSIBLAST executable (from NCBI toolkit) | |
415 # makemat - IMPALA utility (from NCBI toolkit) | |
416 # psipred - PSIPRED V2 program | |
417 # psipass2 - PSIPRED V2 program | |
418 | |
419 my $infile=$_[0]; | |
420 my $basename; #file name without extension | |
421 my $rootname; #basename without directory path | |
422 if ($infile =~/^(.*)\..*?$/) {$basename=$1;} else {$basename=$infile;} | |
423 if ($basename=~/^.*\/(.*?)$/) {$rootname=$1;} else {$rootname=$basename;} | |
424 | |
425 # Does dummy database exist? | |
426 if (!-e "$dummydb.phr") { | |
427 if (!-e "$dummydb") {die "Error in addss.pl: Could not find $dummydb\n";} | |
428 | |
429 &HHPaths::System("cp $infile $dummydb"); | |
430 &HHPaths::System("$ncbidir/formatdb -i $dummydb"); | |
431 if (!-e "$dummydb.phr") {die "Error in addss.pl: Could not find nor create index files for $dummydb\n";} | |
432 } | |
433 | |
434 # Start Psiblast from checkpoint file tmp.chk that was generated to build the profile | |
435 &HHPaths::System("$ncbidir/blastpgp -b 1 -j 1 -h 0.001 -d $dummydb -i $infile -B $tmpfile.in.psi -C $tmpfile.chk 1> $tmpfile.blalog 2> $tmpfile.blalog"); | |
436 | |
437 #print("Predicting secondary structure...\n"); | |
438 | |
439 &HHPaths::System("echo "."$tmpfile_no_dir".".chk > $tmpfile.pn\n"); | |
440 &HHPaths::System("echo "."$tmpfile_no_dir".".sq > $tmpfile.sn\n"); | |
441 &HHPaths::System("$ncbidir/makemat -P $tmpfile"); | |
442 | |
443 # Start Psiblast from checkpoint file tmp.chk that was generated to build the profile | |
444 if (-e "$datadir/weights.dat4") { # Psipred version < 3.0 | |
445 &HHPaths::System("$execdir/psipred $tmpfile.mtx $datadir/weights.dat $datadir/weights.dat2 $datadir/weights.dat3 $datadir/weights.dat4 > $tmpfile.ss"); | |
446 } else { | |
447 &HHPaths::System("$execdir/psipred $tmpfile.mtx $datadir/weights.dat $datadir/weights.dat2 $datadir/weights.dat3 > $tmpfile.ss"); | |
448 } | |
449 | |
450 &HHPaths::System("$execdir/psipass2 $datadir/weights_p2.dat 1 0.98 1.09 $tmpfile.ss2 $tmpfile.ss > $tmpfile.horiz"); | |
451 | |
452 # Remove temporary files | |
453 if ($v<=3) { unlink(split ' ', "$tmpfile.pn $tmpfile.sn $tmpfile.mn $tmpfile.chk $tmpfile.blalog $tmpfile.mtx $tmpfile.aux $tmpfile.ss $tmpfile.ss2 $tmpfile.sq");} | |
454 return; | |
455 } | |
456 | |
457 ############################################################################################## | |
458 # Read query sequence and extract dssp sequence | |
459 ############################################################################################## | |
460 sub AppendDsspSequences() { | |
461 my $qfile=$_[0]; | |
462 | |
463 my $line; #input line | |
464 my $name; #name of sequence in in file, e.g. d1g8ma1 | |
465 my $qrange; #chain and residue range of query sequence | |
466 my $aas=""; #amino acids from in file for each $name | |
467 | |
468 my $dsspfile; | |
469 my $pdbfile; | |
470 my $pdbcode; #pdb code for accessing dssp file; shortened from in code, e.g. 1g8m | |
471 my @ss_dssp=(); #dssp states for residues (H,E,L) | |
472 my @sa_dssp=(); #dssp states for residues (H,E,L) | |
473 my @aa_dssp=(); #residues in dssp file | |
474 my @aa_astr=(); #residues from infile | |
475 my $length; #length of sequence | |
476 | |
477 # Default parameters for Align.pm | |
478 our $d=3; # gap opening penatlty for Align.pm | |
479 our $e=0.1; # gap extension penatlty for Align.pm | |
480 our $g=0.09; # endgap penatlty for Align.pm | |
481 our $matrix="identity"; | |
482 | |
483 # Read query sequence -> $name, $nameline, $range, $aas | |
484 open (QFILE, "<$qfile") || die ("cannot open $qfile: $!"); | |
485 while ($line=<QFILE>) { | |
486 if ($line=~/>(\S+)/) { | |
487 $name=$1; | |
488 | |
489 # SCOP ID? (d3lkfa_,d3grs_3,d3pmga1,g1m26.1) | |
490 if ($line=~/^>[defgh](\d[a-z0-9]{3})[a-z0-9_.][a-z0-9_]\s+[a-z]\.\d+\.\d+\.\d+\s+\((\S+)\)/) { | |
491 $pdbcode=$1; | |
492 $qrange=$2; | |
493 } | |
494 | |
495 # PDB ID? (8fab_A, 1a0i) | |
496 elsif ($line=~/^>(\d[a-z0-9]{3})_?(\S?)\s/) { | |
497 $pdbcode=$1; | |
498 if ($2 ne "") {$qrange="$2:";} else {$qrange="-";} | |
499 } | |
500 | |
501 # DALI ID? (8fabA_0,1a0i_2) | |
502 elsif ($line=~/^>(\d[a-z0-9]{3})[A-Za-z0-9]?_\d+\s+\d+\.\d+.\d+.\d+.\d+.\d+\s+\((\S+)\)/) { | |
503 $pdbcode=$1; | |
504 $qrange=$2; | |
505 } | |
506 | |
507 else { | |
508 if ($v>=3) {print("Warning: no pdb code found in sequence name '$name'\n");} | |
509 close(QFILE); | |
510 return 1; # no astral/DALI/pdb sequence => no dssp states available | |
511 } | |
512 $aas=""; | |
513 | |
514 } | |
515 else | |
516 { | |
517 chomp($line); | |
518 $line=~tr/a-z \t/A-Z/d; | |
519 $aas.=$line; | |
520 } | |
521 } | |
522 close(QFILE); | |
523 if ($v>=3) {printf("Searching DSSP state assignments: name=%s range=%s\n",$name,$qrange);} | |
524 | |
525 # Try to open dssp file | |
526 $dsspfile="$dsspdir/$pdbcode.dssp"; | |
527 if (! open (DSSPFILE, "<$dsspfile")) { | |
528 if ($v>=3) {printf(STDERR "Warning in $program: Cannot open $dsspfile!\n");} | |
529 $pdbfile = &OpenPDBfile($pdbcode); | |
530 if ($pdbfile eq "") {return;} | |
531 | |
532 system("$dssp $pdbfile $tmpfile.dssp 2> /dev/null"); | |
533 system("cp $tmpfile.dssp $dsspfile 2> /dev/null"); | |
534 $dsspfile="$tmpfile.dssp"; | |
535 if (! open (DSSPFILE, "<$dsspfile")) { | |
536 if ($v>=3) {printf(STDERR "Warning in $program: dssp couldn't generate file from $pdbfile. Skipping $name\n");} | |
537 return 1; | |
538 } | |
539 } | |
540 | |
541 #....+....1....+....2....+....3....+....4 | |
542 # # RESIDUE AA STRUCTURE BP1 BP2 ACC etc. | |
543 # 623 630 A R < 0 0 280 etc. | |
544 # 624 !* 0 0 0 etc. | |
545 # 625 8 B A 0 0 105 etc. | |
546 # 626 9 B P >> - 0 0 71 etc. | |
547 # 292 28SA K H 4 S+ 0 0 71 etc. (1qdm.dssp) | |
548 # 293 29SA K H > S+ 0 0 28 etc. | |
549 | |
550 # Read in whole DSSP file | |
551 for (my $try = 1; $try<=2; $try++) { | |
552 $aa_dssp=""; | |
553 $sa_dssp=""; | |
554 $ss_dssp=""; | |
555 while ($line=<DSSPFILE>) {if ($line=~/^\s*\#\s*RESIDUE\s+AA/) {last;}} | |
556 while ($line=<DSSPFILE>) | |
557 { | |
558 if ($line=~/^.{5}(.{5})(.)(.)\s(.).\s(.).{18}(...)/) | |
559 { | |
560 my $thisres=$1; | |
561 my $icode=$2; | |
562 my $chain=$3; | |
563 my $aa=$4; | |
564 my $ss=$5; | |
565 my $sa=$6; | |
566 my $contained=0; | |
567 my $range=$qrange; | |
568 if ($aa eq "!") {next;} # missing residues! | |
569 $thisres=~tr/ //d; | |
570 $chain=~tr/ //d; | |
571 $icode=~tr/ //d; | |
572 $sa=~tr/ //d; | |
573 if ($try==1) { | |
574 do{ | |
575 if ($range=~s/^(\S):(-?\d+)[A-Z]-(\d+)([A-Z])// && $chain eq $1 && $icode eq $4 && $2<=$thisres && $thisres<=$3) { | |
576 $contained=1; #syntax (A:56S-135S) | |
577 } | |
578 elsif ($range=~s/^(\S):(-?\d+)[A-Z]?-(\d+)[A-Z]?// && $chain eq $1 && $2<=$thisres && $thisres<=$3) { | |
579 $contained=1; #syntax (R:56-135) | |
580 } | |
581 elsif ($range=~s/^(-?\d+)[A-Z]-(\d+)([A-Z])// && $chain eq "" && $icode eq $3 && $1<=$thisres && $thisres<=$2) { | |
582 $contained=1; #syntax (56-135) | |
583 } | |
584 elsif ($range=~s/^(-?\d+)[A-Z]?-(\d+)[A-Z]?// && $chain eq "" && $1<=$thisres && $thisres<=$2) { | |
585 $contained=1; #syntax (56-135) | |
586 } | |
587 elsif ($range=~s/^(\S):// && $chain eq $1) { | |
588 $contained=1; #syntax (A:) or (A:,2:) | |
589 } | |
590 elsif ($range=~s/^-$// && $chain eq "") { | |
591 $contained=1; #syntax (-) | |
592 } | |
593 $range=~s/^,//; | |
594 # print("qrange=$qrange range='$range' ires=$thisres chain=$chain contained=$contained\n"); | |
595 } while($contained==0 && $range ne ""); | |
596 if ($contained==0) {next;} | |
597 } # end if try==1 | |
598 $aa_dssp.=$aa; | |
599 $ss_dssp.=$ss; | |
600 $sa_dssp.=&sa2c($sa,$aa); | |
601 } | |
602 } | |
603 # if not enough residues were found: chain id is wrong => repeat extraction without checking chain id | |
604 if (length($aa_dssp)>=10) {last;} | |
605 close(DSSPFILE); | |
606 open (DSSPFILE, "<$dsspfile"); | |
607 } | |
608 close(DSSPFILE); | |
609 | |
610 if (length($aa_dssp)==0) {print("WARNING: no residues found in $dsspdir/$pdbcode.dssp\n"); return 1;} | |
611 if (length($aa_dssp)<=20) {printf("WARNING: only %i residues found in $dsspdir/$pdbcode.dssp\n",length($aa_dssp)); return 1;} | |
612 | |
613 # Postprocess $aa_dssp etc | |
614 $aa_dssp =~ tr/a-z/CCCCCCCCCCCCCCCCCCCCCCCCCC/; | |
615 $ss_dssp =~ tr/ I/CC/; | |
616 $ss_dssp =~ s/ \S / /g; | |
617 $ss_dssp =~ s/ \S\S / /g; | |
618 | |
619 # Align query with dssp sequence | |
620 $aa_astr = $aas; | |
621 $xseq=$aas; | |
622 $yseq=$aa_dssp; | |
623 my ($imax,$imin,$jmax,$jmin); | |
624 my (@i,@j); | |
625 my $score=&AlignNW(\$xseq,\$yseq,\@i,\@j,\$imin,\$imax,\$jmin,\$jmax,\$Sstr); | |
626 | |
627 # Initialize strings (=arrays) for dssp states with "----...-" | |
628 my @ss_dssp_ali=(); # $ss_dssp_ali[$i] is dssp state aligned to $aa_astr[$i] | |
629 my @sa_dssp_ali=(); # $sa_dssp_ali[$i] is solvent accessibility string | |
630 my @aa_dssp_ali=(); # $aa_dssp_ali[$i] is dssp residue aligned to $aa_astr[$i] | |
631 for (my $i=0; $i<=length($aa_astr); $i++) { # sum up to len+1 | |
632 # because 0'th element in @ss_dssp and @aa_dssp is dummy "-" | |
633 $ss_dssp_ali[$i]="-"; | |
634 $sa_dssp_ali[$i]="-"; | |
635 $aa_dssp_ali[$i]="-"; | |
636 } | |
637 | |
638 # To each residue (from i=0 to len-1) of input sequence $aa_astr assign aligned dssp state | |
639 @ss_dssp = split(//,$ss_dssp); | |
640 @sa_dssp = split(//,$sa_dssp); | |
641 @aa_dssp = split(//,$aa_dssp); | |
642 @aa_astr = split(//,$aa_astr); | |
643 my $len = 0; | |
644 unshift(@aa_dssp,"-"); #add a gap symbol at beginning -> first residue is at 1! | |
645 unshift(@ss_dssp,"-"); #add a gap symbol at beginning -> first residue is at 1! | |
646 unshift(@sa_dssp,"-"); #add a gap symbol at beginning -> first residue is at 1! | |
647 unshift(@aa_astr,"-"); #add a gap symbol at beginning -> first residue is at 1! | |
648 for (my $col=0; $col<@i; $col++) { | |
649 if ($i[$col]>0) { | |
650 if ($j[$col]>0) {$len++;} # count match states (for score/len calculation) | |
651 $ss_dssp_ali[$i[$col]]=$ss_dssp[$j[$col]]; | |
652 $sa_dssp_ali[$i[$col]]=$sa_dssp[$j[$col]]; | |
653 $aa_dssp_ali[$i[$col]]=$aa_dssp[$j[$col]]; | |
654 } | |
655 if ($v>=4) { | |
656 printf ("%s %3i %s %3i\n",$aa_astr[$i[$col]],$i[$col],$aa_dssp[$j[$col]],$j[$col]); | |
657 } | |
658 } | |
659 shift (@ss_dssp_ali); # throw out first "-" | |
660 shift (@sa_dssp_ali); # throw out first "-" | |
661 shift (@aa_dssp_ali); # throw out first "-" | |
662 $aa_dssp=join("",@aa_dssp_ali); | |
663 $ss_dssp=join("",@ss_dssp_ali); | |
664 $sa_dssp=join("",@sa_dssp_ali); | |
665 | |
666 # Debugging output | |
667 if ($v>=4) {printf(STDOUT "DSSP: %s: length=%-3i score/len:%-5.3f\n",$name,$len,$score/$len);} | |
668 if ($v>=4) { | |
669 printf("IN: %s\n",$xseq); | |
670 printf("MATCH: %s\n",$Sstr); | |
671 printf("DSSP: %s\n",$yseq); | |
672 printf("\n"); | |
673 printf(">ss_dssp $name\n$ss_dssp\n"); | |
674 printf(">sa_dssp $name\n$sa_dssp\n"); | |
675 printf(">aa_dssp $name\n$aa_dssp\n"); | |
676 printf(">aa_astra $name\n$aa_astr\n\n"); | |
677 } | |
678 if ($score/$len<0.5) { | |
679 printf (STDOUT "\nWARNING: in $name: alignment score with dssp residues too low: Score/len=%f.\n\n",$score/$len); | |
680 printf("IN: %s\n",$xseq); | |
681 printf("MATCH: %s\n",$Sstr); | |
682 printf("DSSP: %s\n",$yseq); | |
683 return 1; | |
684 } | |
685 | |
686 return 0; | |
687 } | |
688 | |
689 ################################################################################################ | |
690 ### Return solvent accessibility code | |
691 ################################################################################################ | |
692 sub sa2c () | |
693 { | |
694 my %maxsa = (A=>106, B=>160, C=>135, D=>163, E=>194, F=>197, G=>84, H=>184, I=>169, K=>205, L=>164, M=>188, | |
695 N=>157, P=>136, Q=>198, R=>248, S=>130, T=>142, V=>142, W=>227, X=>180, Y=>222, Z=>196); # maximum solvent accessiblity | |
696 if ($_[1]=~/[a-z]/) {return "F";} # disulphide bridge | |
697 if (!defined $maxsa{$_[1]}) {return "-";} # no amino acid | |
698 my $rsa=$_[0]/$maxsa{$_[1]}; | |
699 # printf("aa=%s sa=%5.1f max_sa=%5.1f rsa=%5.3f\n",$_[1],$_[0],$maxsa{$_[1]},$rsa); | |
700 if ($rsa<=0.02) {return "A";} | |
701 elsif ($rsa<=0.14) {return "B";} | |
702 elsif ($rsa<=0.33) {return "C";} | |
703 elsif ($rsa<=0.55) {return "D";} | |
704 else {return "E";} | |
705 } | |
706 | |
707 # Find the pdb file with $pdbcode in pdb directory | |
708 sub OpenPDBfile() { | |
709 | |
710 my $pdbcode=lc($_[0]); | |
711 if (! -e "$pdbdir") { | |
712 if ($v>=3) {print(STDERR "Warning in $program: pdb directory '$pdbdir' does not exist!\n");} | |
713 return 1; | |
714 } | |
715 if (-e "$pdbdir/all") {$pdbfile="$pdbdir/all/";} | |
716 elsif (-e "$pdbdir/divided") {$pdbfile="$pdbdir/divided/".substr($pdbcode,1,2)."/";} | |
717 else {$pdbfile="$pdbdir/";} | |
718 if ($pdbdir=~/divided.?$/) {$pdbfile.=substr($pdbcode,1,2)."/";} | |
719 if (-e $pdbfile."pdb$pdbcode.ent") {$pdbfile.="pdb$pdbcode.ent";} | |
720 elsif (-e $pdbfile."pdb$pdbcode.ent.gz") {$pdbfile="gunzip -c $pdbfile"."pdb$pdbcode.ent.gz |";} | |
721 elsif (-e $pdbfile."pdb$pdbcode.ent.Z") {$pdbfile="gunzip -c $pdbfile"."pdb$pdbcode.ent.Z |";} | |
722 elsif (-e $pdbfile."$pdbcode.pdb") {$pdbfile."$pdbcode.pdb";} | |
723 else { | |
724 if ($v>=3) {printf(STDERR "Warning in $program: Cannot find pdb file $pdbfile"."pdb$pdbcode.ent!\n");} | |
725 return ""; | |
726 } | |
727 if (!open (PDBFILE, "$pdbfile")) { | |
728 if ($v>=3) {printf(STDERR "Error in $program: Cannot open pdb file: $!\n");} | |
729 return ""; | |
730 } | |
731 return $pdbfile; | |
732 } |