comparison external_tools/darwin/lib/hh/scripts/hhmakemodel.pl @ 6:2277dd59b9f9 draft

Uploaded
author hammock
date Wed, 01 Nov 2017 05:54:28 -0400
parents
children
comparison
equal deleted inserted replaced
5:b7652b7c97bd 6:2277dd59b9f9
1 #! /usr/bin/env perl
2 #
3 # hhmakemodel.pl
4 # Generate a model from an output alignment of hhsearch.
5 # Usage: hhmakemodel.pl -i file.out (-ts file.pdb|-al file.al) [-m int|-m name|-m auto] [-pdb pdbdir]
6
7 # (C) Johannes Soeding 2012
8
9 # HHsuite version 2.0.16 (January 2013)
10 #
11 # Reference:
12 # Remmert M., Biegert A., Hauser A., and Soding J.
13 # HHblits: Lightning-fast iterative protein sequence searching by HMM-HMM alignment.
14 # Nat. Methods, epub Dec 25, doi: 10.1038/NMETH.1818 (2011).
15
16 # This program is free software: you can redistribute it and/or modify
17 # it under the terms of the GNU General Public License as published by
18 # the Free Software Foundation, either version 3 of the License, or
19 # (at your option) any later version.
20
21 # This program is distributed in the hope that it will be useful,
22 # but WITHOUT ANY WARRANTY; without even the implied warranty of
23 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
24 # GNU General Public License for more details.
25
26 # You should have received a copy of the GNU General Public License
27 # along with this program. If not, see <http://www.gnu.org/licenses/>.
28
29 # We are very grateful for bug reports! Please contact us at soeding@genzentrum.lmu.de
30
31 use lib $ENV{"HHLIB"}."/scripts";
32 use HHPaths; # config file with path variables for nr, blast, psipred, pdb, dssp etc.
33 use strict;
34 use Align;
35
36
37 $|=1; # force flush after each print
38
39 # Default parameters
40 our $d=7; # gap opening penalty for Align.pm; more than 2 mismatches - 2 matches ## previously: 1
41 our $e=0.01; # gap extension penatlty for Align.pm; allow to leave large gaps bridging uncrystallized regions ## previously: 0.1
42 our $g=0.1; # endgap penalty for Align.pm; allow to shift SEQRES residues for uncrystallized aas to ends of alignment ## previously: 0.9
43 my $v=2; # 3: DEBUG
44
45 my $formatting="CASP"; # CASP or LIVEBENCH
46 my $servername="My server"; #
47 my $MINRES=30; # minumum number of new query residues required for a hit to be used as additional parent
48 my $infile="";
49 my $outfile="";
50 my $outformat="fas";
51 my $pickhits="1 "; # default: build one model from best hit
52 my $Pthr=0;
53 my $Ethr=0;
54 my $Prob=0;
55 my $shift=0; # ATTENTION: set to 0 as default!
56 my $NLEN=14; # length of the name field in alignments of hhsearch-output
57 my $NUMRES=100; # number of residues per line in FASTA, A2M, PIR format
58 my $program=$0; # name of perl script
59 my $usage="
60 hhmakemodel.pl from HHsuite $VERSION
61 From the top hits in an hhsearch output file (hhr), you can
62 * generate a MSA (multiple sequence alignment) containing all representative
63 template sequences from all selected alignments (options -fas, -a2m, -a3m, -pir)
64 * generate several concatenated pairwise alignments in AL format (option -al)
65 * generate several concatenated coarse 3D models in PDB format (option -ts)
66 In PIR, PDB and AL format, the pdb files are required in order to read the pdb residue numbers
67 and ATOM records.
68 The PIR formatted file can be used directly as input to the MODELLER homology modelling package.
69 Usage: $program [-i] file.hhr [options]
70
71 Options:
72 -i <file.hhr> results file from hhsearch with hit list and alignments
73 -fas <file.fas> write a FASTA-formatted multiple alignment to file.fas
74 -a2m <file.a2m> write an A2M-formatted multiple alignment to file.a2m
75 -a3m <file.a3m> write an A3M-formatted multiple alignment to file.a3m
76 -m <int> [<int> ...] pick hits with specified indices (default='-m 1')
77 -p <probability> minimum probability threshold (default=$Pthr)
78 -e <E-value> maximum E-value threshold (default=$Ethr)
79 -q <query_ali> use the full-length query sequence in the alignment
80 (not only the aligned part);
81 the query alignment file must be in HHM, FASTA, A2M,
82 or A3M format.
83 -N use query name from hhr filename (default: use same
84 name as in hhr file)
85 -first include only first Q or T sequence of each hit in MSA
86 -v verbose mode (default=$v)
87
88 Options when database matches in hhr file are PDB or SCOP sequences
89 -pir <file.pir> write a PIR-formatted multiple alignment to file.pir
90 -ts <file.pdb> write the PDB-formatted models based on *pairwise*
91 alignments into file.pdb
92 -al <file.al> write the AL-formatted *pairwise* alignments into file.al
93 -d <pdbdirs> directories containing the pdb files (for PDB, SCOP, or DALI
94 sequences) (default=$pdbdir)
95 -s <int> shift the residue indices up/down by an integer (default=$shift);
96 -CASP formatting for CASP (for -ts, -al options) (default: LIVEBENCH
97 formatting)
98 Options when query is compared to itself (for repeat detection)
99 -conj include also conjugate alignments in MSA (with query and
100 template exchanged)
101 -conjs include conjugate alignments and sort by ascending diagonal
102 value (i.e. i0-j0)
103 \n";
104
105 # Options to help extract repeats from self-alignments:
106 # 1. default 2. -conj 3. -conj_diag 4. -conj_compact
107 # ABCD ABCD ---A ABCD
108 # BCD- BCD- --AB BCDA
109 # D--- CD-- -ABC CDAB
110 # CD-- D--- ABCD DABC
111 # ---A BCD-
112 # --AB CD--
113 # -ABC D---
114
115
116 # Variable declarations
117 my $line; # input line
118 my $score=-1; # score of the best model; at the moment: Probability
119 my $qname=""; # name of query from hhsearch output file (infile)
120 my $tname=""; # name of template (hit) from hhsearch output file (infile)
121 my $qnameline=""; # nameline of query
122 my $tnameline; # nameline of template
123 my $pdbfile; # name of pdbfile to read
124 my $pdbcode; # four-letter pdb code in lower case and _A if chain A (e.g. 1h7w_A)
125 my $aaq; # query amino acids from hhsearch output
126 my @aaq; # query amino acids from hhsearch output
127 my @qname; # query names in present alignment as returned from ReadAlignment()
128 my @qfirst; # indices of first residues in present alignmet as returned from ReadAlignment()
129 my @qlast; # indices of last residues in present alignmet as returned from ReadAlignment()
130 my @qseq; # sequences of querys in present alignment as returned from ReadAlignment()
131 my @tname; # template names in present alignment as returned from ReadAlignment()
132 my @tfirst; # indices of first residues in present alignmet as returned from ReadAlignment()
133 my @tlast; # indices of last residues in present alignmet as returned from ReadAlignment()
134 my @tseq; # sequences of templates in present alignment as returned from ReadAlignment()
135 my $aat; # template amino acids from hhsearch output
136 my @aat; # template amino acids from hhsearch output
137 my $aapdb; # template amino acids from pdb file
138 my @aapdb; # template amino acids from pdb file
139 my $qfirst=0; # first residue of query
140 my $qlast=0; # last residue of query
141 my $qlength; # length of query sequence
142 my $tfirst=0; # first residue of template
143 my $tlast=0; # first residue of template
144 my $tlength; # length of template sequence
145 my $l=1; # counts template residues from pdb file (first=1, like for i[col2] and j[col2]
146 my $col1=0; # counts columns from hhsearch alignment
147 my $col2=0; # counts columns from alignment (by function &AlignNW) of $aat versus $aapdb
148 my @i1; # $i1[$col1] = index of query residue in column $col1 of hhsearch-alignment
149 my @j1; # $j1[$col1] = index of template residue in column $col1 of hhsearch-alignment
150 my @j2; # $j2[$col2] = index of hhsearch template seq in $col2 of alignment against pdb template sequence
151 my @l2; # $l2[$col2] = index of pdb template seq in $col2 of alignment against hhsearch template sequence
152 my @l1; # $l1[$col1] = $l2[$col2]
153 my $res; # residue name
154 my $chain; # pdb chain from template name
155 my $qfile; # name of query sequence file (for -q option)
156 my $qmatch; # number of match states in alignment
157 my $hit; # index of hit in hit list
158 my $k; # index of hit sorted by position in alignment with query (k=1,...,k=@first-2)
159 my %picked=(); # $picked{$hit} is defined and =$k for hits that will be transformed into model
160 my @remarks;
161 my @printblock; # block 0: header block k: k'th hit
162 my $keyword=""; # either METHOD for CASP format or REMARK for LIVEBENCH format
163 my $conj=0; # include conjugate sequences? Sort in which way?
164 my $conjugate=0; # when query is compared to itself: do not include conjugate alignments
165 my $onlyfirst=0; # include only first representative sequence of each Q/T alignment
166 my $dummy; # dummy
167 my $addchain=1; # 1: PDB files contain chain-id as in 1abc_A.pdb (not 1abc.pdb or pdb1abc.pdb etc.)
168 my $pdbdirs=$pdbdir; # default pdb directory with *.pdb files
169 my $options="";
170
171 # Processing command line options
172 if (@ARGV<1) {die $usage;}
173 for (my $i=0; $i<@ARGV; $i++) {$options.=" $ARGV[$i] ";}
174
175
176 # Set options
177 if ($options=~s/ -i\s+(\S+) / /g) {$infile=$1;}
178 if ($options=~s/ -q\s+(\S+) / /g) {$qfile=$1;}
179 if ($options=~s/ -ts\s+(\S+) / /ig) {$outfile=$1; $outformat="TS";}
180 if ($options=~s/ -pdb\s+(\S+) / /ig) {$outfile=$1; $outformat="TS";}
181 if ($options=~s/ -al\s+(\S+) / /ig) {$outfile=$1; $outformat="AL";}
182 if ($options=~s/ -pir\s+(\S+) / /ig) {$outfile=$1; $outformat="PIR";}
183 if ($options=~s/ -fas\s+(\S+) / /ig) {$outfile=$1; $outformat="FASTA";}
184 if ($options=~s/ -a2m\s+(\S+) / /ig) {$outfile=$1; $outformat="A2M";}
185 if ($options=~s/ -a3m\s+(\S+) / /ig) {$outfile=$1; $outformat="A3M";}
186 if ($options=~s/ -p\s+(\S+) / /g) {$Pthr=$1;}
187 if ($options=~s/ -e\s+(\S+) / /g) {$Ethr=$1;}
188 if ($options=~s/ -s\s+(\S+) / /g) {$shift=$1;}
189 if ($options=~s/ -d\s+(([^-\s]\S*\s+)*)/ /g) {$pdbdirs=$1;}
190 if ($options=~s/ -m\s+((\d+\s+)+)/ /g) {$pickhits=$1; }
191 if ($options=~s/ -first\s+/ /ig) {$onlyfirst=1;}
192
193 # Self-alignment options
194 if ($options=~s/ -conj\s+/ /ig) {$conj=1;}
195 if ($options=~s/ -conjs\s+/ /ig) {$conj=2;}
196
197 # Switch formatting and method description
198 if ($options=~s/ -CASP\s+/ /ig) {$formatting="CASP";}
199 if ($options=~s/ -LIVEBENCH\s+/ /ig) {$formatting="LIVEBENCH";}
200 if ($options=~s/ -server\s+(\S+)/ /g) {$servername=$1;}
201
202 # Set verbose mode?
203 if ($options=~s/ -v\s+(\d+) / /g) {$v=$1;}
204 elsif ($options=~s/ -v\s+/ /g) {$v=1;}
205
206 # Read infile and outfile
207 if (!$infile && $options=~s/^\s*([^-]\S+)\s*/ /) {$infile=$1;}
208 if (!$outfile && $options=~s/^\s*([^-]\S+)\s*/ /) {$outfile=$1;}
209 if ($options=~s/ -N / /ig) {
210 $qname=$infile;
211 $qname=~s/^.*?([^\/]+)$/$1/; # remove path
212 $qname=~s/^(.*)\.[^\.]*$/$1/; # remove extension
213 $qnameline=$qname;
214 }
215
216 # Warn if unknown options found or no infile/outfile
217 if ($options!~/^\s*$/) {$options=~s/^\s*(.*?)\s*$/$1/g; die("Error: unknown options '$options'\n");}
218 if ($infile eq "") {die("$usage\nError in $program: input file missing: $!\n");}
219 if ($outfile eq "") {die("$usage\nError in $program: output file missing: $!\n");}
220
221 my @pdbdirs = split(/\s+/,$pdbdirs);
222
223 # Find query name in input file
224 open (INFILE, "<$infile") || die "Error in $program: Couldn't open $infile: $!\n";
225 while ($line=<INFILE>) {
226 if ($v>=3) {print("$line");}
227 if ($qname eq "" && $line=~/^Query:?\s*(\S+)(.*)/) {$qname=$1; $qnameline=$1.$2;}
228 if ($line=~/^Match_columns:?\s*(\S+)/) {$qmatch=$1; last;}
229 }
230 if (!($line=<INFILE>)) {die ("Error in $program: wrong format in $infile: $!\n");}
231
232
233 # Prepare hash %pick with indices of hits that will be transformed into model
234 # No Hit Prob E-value P-value Score SS Cols Query HMM Template HMM
235 # 1 153l Lysozyme (E.C.3.2.1.17) 100.0 0 0 381.0 19.4 185 1-185 1-185 (185)
236 # 2 1qsa_A Soluble lytic transglyc 100.0 2.1E-39 2.5E-43 225.8 8.3 149 21-182 423-600 (618)
237 # 3 1ltm 36 kDa soluble lytic tr 95.9 3.5E-06 4.1E-10 50.3 11.0 95 28-122 76-221 (320)
238
239 # option '-m m1 m2 m3': pick models manually
240 my @pickhits = split(/\s+/,$pickhits);
241 $k=1;
242 foreach $hit (@pickhits) {
243 if (!defined $picked{$hit}) {$picked{$hit}=$k;}
244 $k++;
245 }
246
247 if ($outformat eq "AL" || $outformat eq "TS") {
248 &MakePairwiseAlignments();
249 } else {
250 &MakeMultipleAlignment();
251 }
252 exit;
253
254
255 ##################################################################################
256 # Construct AL or TS formatted alignment as a list of pairwise alignments
257 ##################################################################################
258 sub MakePairwiseAlignments()
259 {
260 # Scan through query-vs-template-alignments from infile and create first (combination) model
261 $hit=0; # counts hits in hit list
262 my $models=0;
263 while ($line=<INFILE>) {
264 if ($line=~/^>(\S+)/) {
265 $hit++;
266 if ($Pthr || $Ethr || defined $picked{$hit}) {
267 # Found right alignment (hit)
268 if (defined $picked{$hit}) {$k=$picked{$hit};} else {$k=$hit;}
269
270 if ($line=~/^>(.*?)\s+E=.*$/) {
271 $line=$1; # remove E=1.3E-30 etc. at the end
272 } else {
273 $line=~/^>(.*)/;
274 $line=$1;
275 }
276 my $nameline=$line;
277 my $evalue;
278 $line=<INFILE>;
279 if ($line=~/Probab\s*=\s*(\S+).*E-value\s*=\s*(\S+)/) {$score=$1; $evalue=$2}
280 else {$score=0; warn("WARNING: could not print score $line");}
281 if ($line=~/Aligned_cols=\s*(\S+)/) {;} else {warn("WARNING: could not find aligned_cols\n");}
282 if ($Pthr && $score<$Pthr) {last;} # Probability too low -> finished
283 if ($Ethr && $evalue>$Ethr) {last;} # Evalue too high > finished
284
285
286 # Commented out in CASP format
287 if ($formatting eq "LIVEBENCH") {
288 $printblock[$k] ="PFRMAT $outformat\n";
289 $printblock[$k].="TARGET $qname\n";
290 }
291
292 $remarks[$k]="REMARK $k: $nameline\n";
293 $remarks[$k].="REMARK $line";
294
295 &ReadAlignment();
296 $qfirst = $qfirst[0];
297 $qlast = $qlast[0];
298 $aaq = $qseq[0];
299 $tfirst = $tfirst[0];
300 $aat = $tseq[0];
301 $tname = $tname[0];
302
303 if ($v>=3) {
304 for (my $i=0; $i<@qfirst; $i++) {
305 printf("Q %-14.14s %s\n",$qname[$i],$qseq[$i]);
306 }
307 printf("\n");
308 for (my $i=0; $i<@tfirst; $i++) {
309 printf("T %-14.14s %s\n",$tname[$i],$tseq[$i]);
310 }
311 printf("\n");
312 }
313
314 # Extract pdbcode and construct name of pdbfile and return in global variables $pdbid and $chain
315 if (&ExtractPdbcodeAndChain($tname[0])) {next;}
316 if ($chain eq "[A ]") {$pdbcode.="_A";} elsif ($chain eq ".") {;} else {$pdbcode.="_$chain";}
317
318 # Read score (=probability)
319 $printblock[$k].="REMARK $nameline\n";
320 $printblock[$k].="REMARK $line";
321 $printblock[$k].="SCORE $score\n";
322 $printblock[$k].="PARENT $pdbcode\n";
323 $printblock[$k].="MODEL $k\n";
324
325 &WritePairwiseAlignments();
326 $printblock[$k].="END\n";
327 $models++;
328
329 }
330 }
331 }
332 $k=$#printblock; # set $k to last index in @printblock
333 if ($k<0) {
334 $printblock[1]="PARENT NONE\nTER\n";
335 $printblock[1].="END\n";
336 if ($v>=1) {print("WARNING: no hits found for model!\n");}
337 }
338 close (INFILE);
339
340 if ($v>=2) {
341 printf("$models models built\n");
342 }
343
344
345
346 # Write model file header
347
348 #---+----1----+----2----+----3----+----4----+----5----+----6----+----7----+----8
349
350 # Print header
351 my $date = scalar(localtime);
352 if ($formatting eq "CASP") {
353 $printblock[0]="PFRMAT $outformat\n";
354 $printblock[0].="TARGET $qname\n";
355 }
356 $printblock[0].="REMARK AUTHOR $servername\n";
357 $printblock[0].="REMARK $date\n";
358 # $printblock[0].="REMARK J. Soeding \n";
359
360 # Add remarks
361 for ($k=0; $k<@remarks; $k++) {
362 if (defined $remarks[$k]) {
363 $printblock[0].=$remarks[$k];
364 }
365 }
366 $printblock[0].="REMARK \n";
367
368
369 # Print @printblock into outfile
370
371 open (OUTFILE, ">$outfile") || die "Error in $program: Couldn't open $outfile: $!\n";
372 foreach my $printstr (@printblock) {
373 my @printarr=split(/\n/,$printstr);
374 if ($outformat eq "TS") {
375 foreach $printstr (@printarr) {
376 printf(OUTFILE "%-80.80s\n",$printstr);
377 }
378 } else {
379 foreach $printstr (@printarr) {
380 printf(OUTFILE "%s\n",$printstr);
381 }
382 }
383 }
384 close (OUTFILE);
385
386 if ($outformat eq "TS") {
387 # Call MaxSprout to generate sidechains
388 }
389 return;
390 }
391
392
393 ##################################################################################
394 # Construct multiple alignment in FASTA, A2M, or PIR format
395 ##################################################################################
396 sub MakeMultipleAlignment()
397 {
398 my @hitnames=(); # $hitnames[$k] is the nameline of the ihit'th hit
399 my @hitseqs=(); # $hitseqs[$k] contains the residues of the ihit'th hit
400 my @hitdiag=(); # $hitdiag[$k] = $qfirst[0]-$tfirst[0]
401 my @conjnames=(); # $hitnames[$k] is the nameline of the ihit'th conjugate hit
402 my @conjseqs=(); # $hitseqs[$k] contains the residues of the ihit'th conjugate hit
403 my @conjdiag=(); # $hitdiag[$k] = $qfirst[0]-$tfirst[0] for conjugate alignments
404
405 my $new_hit; # residues of new hit
406 my $i; # residue index
407 my $j; # residue index
408 my $k; # sequence index
409
410 $hitnames[0]="";
411 $hitseqs[0]="";
412 $hitdiag[0]=0;
413 $conjnames[0]="";
414 $conjseqs[0]="";
415 $conjdiag[0]=0;
416
417 open (INFILE, "<$infile") || die "Error in $program: Couldn't open $infile: $!\n";
418 $hit=0; # counts hits in hit list
419
420 # Read one alignment after the other
421 while ($line=<INFILE>) {
422
423 # Found new aligment
424 if ($line=~/^>(\S+)/) {
425 $hit++;
426
427 # Is alignment selected by user?
428 if ($Pthr || $Ethr || defined $picked{$hit}) {
429
430 if ($line=~/^>(\S+)(.*)/) {$tname=$1; $tnameline=$1.$2;}
431 else {die("\nError: bad format in $infile, line $.: code 1\n");}
432
433 $line = <INFILE>;
434 if ($line=~/Probab\s*=\s*(\S+).*E-value\s*=\s*(\S+)/) {
435 if ($Pthr && $1<$Pthr) {last;} # Probability too low -> finished
436 if ($Ethr && $2>$Ethr) {last;} # Evalue too high > finished
437 } else { die("\nError: bad format in $infile, line $.: code 2\n"); }
438
439 # Read next alignment with $aaq, $qfirst, @tseq, @first, and @tname
440 &ReadAlignment();
441 chomp($tnameline);
442 if ($tnameline=~/\S+\s+(.*)/) {$tname[0].=" $1";} # template seed sequence gets its description
443
444 # Format sequences into @hitseqs and @hitnames
445 &FormatSequences(\@hitnames,\@hitseqs,\@hitdiag,\@qname,\@qseq,\@qfirst,\@qlast,\$qlength,\@tname,\@tseq,\@tfirst,\@tlast,\$tlength);
446
447 # Use conjugate alignments?
448 if ($conj>0) {
449 &FormatSequences(\@conjnames,\@conjseqs,\@conjdiag,\@tname,\@tseq,\@tfirst,\@tlast,\$tlength,\@qname,\@qseq,\@qfirst,\@qlast,\$qlength);
450 }
451
452 } # end: if ($Pthr>0 || defined $picked{$hit})
453
454 } # end: if ($line=~/^>(\S+)/) # found new alignment
455
456 } # end while
457 close (INFILE);
458
459 # Insert full-length query sequence?
460 if ($qfile) {
461 $hitseqs[0]="";
462 open (QFILE, "<$qfile") || die "Error in $program: Couldn't open $qfile: $!\n";
463 while ($line=<QFILE>) {
464 if ($line=~/^>/ && $line!~/^>ss_/ && $line!~/^>sa_/ && $line!~/^>aa_/ && $line!~/^>Consensus/) {last;}
465 }
466 while ($line=<QFILE>) {
467 if ($line=~/^>/ || $line=~/^\#/) {last;}
468 $line=~tr/\n\.-//d;
469 $line=~tr/a-z/A-Z/;
470 $hitseqs[0].=$line;
471 }
472 close(QFILE);
473 if ($v>=2) {printf("\nQ(full) %-14.14s %s\n",$qname,$hitseqs[0]);}
474 }
475
476
477 # DEBUG
478 if ($v>=3) {
479 printf("\nQuery %-14.14s %s\n",$qname,$hitseqs[0]);
480 for ($k=1; $k<@hitnames; $k++) {
481 printf("T hit %3i %-14.14s %s\n",$k,$hitnames[$k],$hitseqs[$k]);
482 }
483 printf("\n");
484 printf("\nQuery %-14.14s %s\n",$qname,$conjseqs[0]);
485 for ($k=1; $k<@conjnames; $k++) {
486 printf("T conj %3i %-14.14s %s\n",$k,$conjnames[$k],$conjseqs[$k]);
487 }
488 printf("\n");
489 }
490
491
492 # Include conjugate sequences?
493 if ($conj>0) {
494 shift(@conjseqs); # delete zeroth ("query") sequence of @conjseqs
495 shift(@conjnames); #
496 shift(@conjdiag); #
497
498 # Sort by diagonals $hitdiag[], $conjdiag[]
499 &Sort(\@hitdiag,\@hitseqs,\@hitnames);
500 &Sort(\@conjdiag,\@conjseqs,\@conjnames);
501
502 # Append conjugate sequences to hitseqs
503 splice(@hitseqs,scalar(@hitseqs),0,@conjseqs);
504 splice(@hitnames,scalar(@hitnames),0,@conjnames);
505
506 if ($v>=3) {
507 printf("\nQuery %-14.14s %s\n",$qname,$hitseqs[0]);
508 for ($k=1; $k<@hitnames; $k++) {
509 chomp($hitnames[$k]);
510 printf("T tot %3i %-14.14s %s\n",$k,$hitnames[$k],$hitseqs[$k]);
511 $hitnames[$k].="\n";
512 }
513 }
514 }
515
516 # Insert gaps:
517
518 my @len_ins; # $len_ins[$j] will count the maximum number of inserted residues after match state $j.
519 my @inserts; # $inserts[$j] contains the insert (in small case) of sequence $k after the $j'th match state
520 my $insert;
521 my $ngap;
522
523 # For each match state determine length of LONGEST insert after this match state and store in @len_ins
524 for ($k=0; $k<@hitnames; $k++) {
525 # split into list of single match states and variable-length inserts
526 # ([A-Z]|-) is the split pattern. The parenthesis indicate that split patterns are to be included as list elements
527 # The '#' symbol is prepended to get rid of a perl bug in split
528 $j=0;
529 @inserts = split(/([A-Z]|-)/,"#".$hitseqs[$k]."#");
530 # printf("Sequence $k: $hitseqs[$k]\n");
531 # printf("Sequence $k: @inserts\n");
532 foreach $insert (@inserts) {
533 if( !defined $len_ins[$j] || length($insert)>$len_ins[$j]) {
534 $len_ins[$j]=length($insert);
535 }
536 $j++;
537 # printf("$insert|");
538 }
539 # printf("\n");
540 }
541
542
543
544 # After each match state insert residues and fill up with gaps to $len_ins[$i] characters
545 for ($k=0; $k<@hitnames; $k++) {
546 # split into list of single match states and variable-length inserts
547 @inserts = split(/([A-Z]|-)/,"#".$hitseqs[$k]."#");
548 $j=0;
549
550 # append the missing number of gaps after each match state
551 foreach $insert (@inserts) {
552 if($outformat eq "FASTA") {
553 for ($i=length($insert); $i<$len_ins[$j]; $i++) {$insert.="-";}
554 }
555 else {
556 for ($i=length($insert); $i<$len_ins[$j]; $i++) {$insert.=".";}
557 }
558 $j++;
559 }
560 $hitseqs[$k] = join("",@inserts);
561 $hitseqs[$k] =~ tr/\#//d; # remove the '#' symbols inserted at the beginning and end
562 }
563
564 # Remove columns at beginning and end with gaps in all sequences
565 my $remove_start;
566 my $remove_end;
567 my $len;
568 $hitseqs[0]=~/^(-*)/;
569 $remove_start=length($1);
570 $hitseqs[0]=~/(-*)$/;
571 $remove_end=length($1);
572 for ($k=0; $k<@hitnames; $k++) {
573 $hitseqs[$k]=~s/^.{$remove_start}(.*).{$remove_end}/$1/;
574 }
575 $len=($hitseqs[0]=~tr/a-zA-Z/a-zA-Z/);
576
577 # Prepare name line of query
578 if ($outformat eq "PIR") {
579 my $qnametmp=$qname;
580 $qnametmp=~tr/:/;/;
581 $qnameline=~/^\S+\s*(.*)/;
582 my $qnamelinetmp=$1;
583 $qnamelinetmp=~tr/:/;/;
584 $hitnames[0] = sprintf(">P1;%s\nsequence:%s:%4i: :%4i: :%s: : 0.00: 0.00\n",$qnametmp,$qnametmp,$remove_start+1,$len+$remove_start,$qnamelinetmp);
585 } else {
586 # outformat is "FASTA" or "A2M" or "A3M" or ...
587 $hitnames[0] = ">$qnameline\n";
588 }
589
590 # If pretty diagonally sorted order is wanted...
591 if ($conj>0) {
592 if ($conj==2) {
593 my $center = 0.5*(scalar(@hitseqs)-1);
594 @conjseqs = splice(@hitseqs,$center+1,$center);
595 splice(@hitseqs,0,0,@conjseqs);
596 @hitseqs = reverse(@hitseqs);
597
598 @conjnames = splice(@hitnames,$center+1,$center);
599 splice(@hitnames,0,0,@conjnames);
600 @hitnames = reverse(@hitnames);
601 # Shorten namelines of all but first sequence
602 my %count;
603 for ($k=0; $k<@hitnames; $k++) {
604 if ($k==$center) {$k++;}
605 $hitnames[$k]=~/(\S{1,14})/;
606 if (!defined $count{$1}) {$count{$1}=0;}
607 my $count = ++$count{$1};
608 # printf("vorher: %s ",$hitnames[$k]);
609 $hitnames[$k]=~s/^(\S{1,14}).*/$1:$count/;
610 # printf("nachher: %s\n",$hitnames[$k]);
611 }
612 } else {
613 for ($k=0; $k<@hitnames; $k++) {$hitnames[$k]=">$qname\n";}
614 }
615 }
616
617 # Remove gaps? Captialize?
618 if ($outformat eq "PIR") {
619 for ($k=0; $k<@hitnames; $k++) {
620 $hitseqs[$k].="*";; # Transform to upper case
621 $hitseqs[$k]=~tr/a-z./A-Z-/; # Transform to upper case
622 $hitseqs[$k]=~s/(.{1,$NUMRES})/$1\n/g; # insert newlines every NUMRES positions
623 }
624 } elsif ($outformat eq "FASTA") {
625 for ($k=0; $k<@hitnames; $k++) {
626 $hitseqs[$k]=~tr/a-z./A-Z-/; # Transform to upper case
627 $hitseqs[$k]=~s/(.{1,$NUMRES})/$1\n/g; # insert newlines every NUMRES positions
628 }
629 } elsif ($outformat eq "A2M") {
630 for ($k=0; $k<@hitnames; $k++) {$hitseqs[$k]=~s/(.{1,$NUMRES})/$1\n/g;} # insert newlines every NUMRES positions
631 } elsif ($outformat eq "A3M") {
632 for ($k=0; $k<@hitnames; $k++) {$hitseqs[$k]=~tr/.//d;$hitseqs[$k].="\n";} # Remove gaps aligned to inserts
633 }
634
635 # Write sequences into output file
636 open (OUTFILE, ">$outfile") || die ("cannot open $outfile:$!");
637 for ($k=0; $k<@hitnames; $k++) {
638 print(OUTFILE "$hitnames[$k]$hitseqs[$k]");
639 }
640 close OUTFILE;
641
642
643 if ($v>=2) {
644 printf("%i sequences written to $outfile\n",scalar(@hitnames));
645 }
646 }
647
648
649 # Format sequences into @hitseqs and @hitnames
650 # & Call with FormatSequences(\@hitnames,\@hitseqs,\@qname,\@qseq,\@qfirst,\@qlast,\$qlength,\@tname,\@tseq,\@tfirst,\@tlast,\$tlength);
651 sub FormatSequences()
652 {
653 my $p_hitnames = $_[0]; # typeglob to $hitname
654 my $p_hitseqs = $_[1]; # ...
655 my $p_hitdiag = $_[2]; # ...
656
657 my $p_qname = $_[3]; #
658 my $p_qseq = $_[4]; #
659 my $p_qfirst = $_[5]; #
660 my $p_qlast = $_[6]; #
661 my $p_qlength = $_[7]; #
662
663 my $p_tname = $_[8]; #
664 my $p_tseq = $_[9]; #
665 my $p_tfirst = $_[10]; #
666 my $p_tlast = $_[11]; #
667 my $p_tlength = $_[12]; #
668 my $i;
669
670 if ($v>=2) {
671 if (defined $picked{$hit}) {
672 print("hit=$hit picked=$picked{$hit} tname=$tname[0]");
673 } else {
674 print("hit=$hit picked=evalue<$Ethr tname=$tname[0]");
675 }
676 for (my $i=1; $i<@{$p_tname}; $i++) {
677 print(", $tname[$i]");
678 }
679 print("\n");
680 }
681
682
683 my $qfirst = ${$p_qfirst}[0];
684 my $qlast = ${$p_qlast}[0];
685 my $qlength = ${$p_qlength};
686 my $aaq = ${$p_qseq}[0];
687
688 @aaq = unpack("C*",$aaq); # needed for transforming template sequences into a3m based on query residues (NOT HMM match states!)
689 $aaq=~tr/.-//d; # remove all gaps from query sequence
690
691 # For all template sequences in the present alignment
692 for (my $k=0; $k<@{$p_tname}; $k++) {
693
694 $tname =${$p_tname}[$k];
695 $tfirst=${$p_tfirst}[$k];
696 $aat =${$p_tseq}[$k];
697
698 # Transform template residues into a3m format:
699 # match states are those where query has residue (NOT where HMM has match state!)
700 # This makes sense since we want to build a model for the query sequence.
701 @aat = unpack("C*",$aat);
702 $aat="";
703
704 # Transform all columns with residue in query into match/delete states, all others to inserts
705 for ($i=0; $i<scalar(@aaq); $i++) {
706 if ($aaq[$i]!=45 && $aaq[$i]!=46) { # no gap in query
707 if($aat[$i]==46) {
708 $aat.="-"; # transform '.' to '-' if aligned with a query residue
709 } else {
710 $aat .= uc(chr($aat[$i])); # UPPER case if aligned with a query residue (match state)
711 }
712 } else {
713 if($aat[$i]!=45 && $aat[$i]!=46) { # no gap in template?
714 $aat.=lc(chr($aat[$i])); # lower case if aligned with a gap in the query (insert state)
715 }
716 }
717 }
718
719 if ($v>=2) {
720 printf("\nQ %-14.14s %s\n",$qname,$aaq);
721 printf("T %-14.14s %s\n",$tname,$aat);
722 }
723
724 # Outformat is PIR? => read residues and indices from PDB ATOM records
725 if ($outformat eq "PIR") {
726
727 # Extract pdbcode and construct name of pdbfile and return in global variables $pdbid and $chain
728 if (&ExtractPdbcodeAndChain($tname)) {next;}
729
730 # Read sequence from pdb file
731 if (!open (PDBFILE, "$pdbfile")) {
732 die ("Error in $program: Couldn't open $pdbfile: $!\n");
733 }
734 $aapdb="";
735 $l=0;
736 my @nres; # $nres[$l] = pdb residue index for residue $aapdb[$l]
737 my $nres=-1e6;
738 my $resolution=-1.00;
739 my $rvalue=-1.00;
740 while ($line=<PDBFILE>) {
741 if ($line=~/^REMARK.*RESOLUTION\.\s+(\d+\.?\d*)/) {$resolution=$1;}
742 if ($line=~/^REMARK.*R VALUE\s+\(WORKING SET\)\s+:\s+(\d+\.?\d*)/) {$rvalue=$1;}
743 if ($line=~/^ENDMDL/) {last;} # if file contains NMR models read only first one
744 if (($line=~/^ATOM\s+\d+ .. [ A](\w{3}) $chain\s*(-?\d+.)/ ||
745 ($line=~/^HETATM\s+\d+ .. [ A](\w{3}) $chain\s*(-?\d+.)/ && &Three2OneLetter($1) ne "X") ) &&
746 $2 ne $nres ) {
747 $res=$1;
748 $nres=$2;
749 $nres[$l]=$2;
750 $res=&Three2OneLetter($res);
751 $aapdb[$l++]=$res;
752 $aapdb.=$res;
753 }
754 }
755 close (PDBFILE);
756 if (length($aapdb)<=0) {die("Error: chain $chain not found in pdb file $pdbfile\n");}
757
758 # Align template in hh-alignment ($aat) with template sequence in pdb ($aapdb)
759 my $xseq=$aat;
760 $xseq=~tr/-/~/; # transform Deletes to '~' to distinguish them from gaps '-' inserted by Align.pm
761 my $yseq=$aapdb;
762 my ($jmin,$jmax,$lmin,$lmax);
763 my $Sstr;
764 my $score;
765 # The aligned characters are returend in $j2[$col2] and $l2[$col2]
766 $score=&AlignNW(\$xseq,\$yseq,\@j2,\@l2,\$jmin,\$jmax,\$lmin,\$lmax,\$Sstr);
767
768 # DEBUG
769 if ($v>=3) {
770 printf("Template (hh) $xseq\n");
771 printf("Identities $Sstr\n");
772 printf("Template (pdb) $yseq\n");
773 printf("\n");
774 if ($v>=4) {
775 for ($col2=0; $col2<@l2 && $col2<1000; $col2++) {
776 printf("%3i %3i:%s %3i:%s -> %i\n",$col2,$j2[$col2],substr($aat,$j2[$col2]-1,1),$l2[$col2],substr($aapdb,$l2[$col2]-1,1),$nres[$l2[$col2]-1]);
777 }
778 }
779 }
780
781 # check for reasonable alignment
782 my $num_match = 0;
783 for ($i=0; $i<@l2; $i++) {
784 if ($j2[$i] > 0 && $l2[$i] > 0) {
785 $num_match++;
786 }
787 }
788 if (($score/$num_match) < 1) {
789 print "WARNING! Match score with PDBfile (score: $score num: $num_match score/num:".($score/$num_match).") to low => $pdbfile not included!\n";
790 next;
791 }
792
793 # Assign a3m-formatted amino acid sequence from pdb file to $aapdb
794 $aapdb="";
795 my @xseq=unpack("C*",$xseq);
796 my @yseq=unpack("C*",$yseq);
797 for ($i=0; $i<@yseq; $i++) {
798 if(($xseq[$i]>=65 && $xseq[$i]<=90) || $xseq[$i]==ord('~')) { # if $aat has upper case residue or Delete state
799 # Match state
800 $aapdb.=uc(chr($yseq[$i]));
801 } else {
802 # Insert state
803 if ($yseq[$i]!=45) {$aapdb.=lc(chr($yseq[$i]));} # add only if not a gap '-'
804 }
805 }
806
807 # Remove overlapping ends of $aapdb
808 $aapdb=~s/^[a-z]*(.*?)[a-z]*$/$1/;
809
810 # Glue gaps at beginning and end of aligned pdb sequence and add sequence to alignment
811 push (@{$p_hitseqs}, ("-" x ($qfirst-1)).$aapdb.("-" x ($qlength-$qlast)) ); # use ATOM record residues $aapdb!
812
813 # Write hitname in PIR format into @hitnames
814 my $descr;
815 my $organism;
816 my $struc=$pdbcode;
817 if ($tnameline=~/^(\S+)\s+(.*)/) {$descr=$2; $descr=~tr/://d;} else {$descr=" ";}
818 if ($tnameline=~/^(\S+)\s+.*\s+\{(.*)\}/) {$organism=$2;} else {$organism=" ";}
819 if (length($chain)>1 || $chain eq ".") { # MODELLER's special symbol for 'chain unspecified'
820 $chain=".";
821 } elsif ($addchain && $chain ne " ") {
822 $struc.="_$chain";
823 }
824 # push (@{$p_hitnames}, sprintf(">P1;%s\nstructureX:%4s:%4i:%1s:%4i:%1s:%s:%s:%-.2f:%-.2f\n",$struc,$struc,$nres[$lmin-1],$chain,$nres[$lmax-1],$chain,$descr,$organism,$resolution,$rvalue) );
825
826 my $descrtmp=$descr;
827 $descrtmp=~tr/:/;/;
828 $organism=~tr/://d;
829 push (@{$p_hitnames}, sprintf(">P1;%s\nstructureX:%4s: :%1s: :%1s:%s:%s:%-.2f:%-.2f\n",$struc,$struc,$chain,$chain,$descrtmp,$organism,$resolution,$rvalue) );
830 push (@{$p_hitdiag}, $tfirst-$qfirst);
831 } else {
832 # outformat is "FASTA" or "A2M" or "A3M" or ...
833 # Write hitname in FASTA format into @hitnames
834 push (@{$p_hitseqs}, ("-" x ($qfirst-1)).$aat.("-" x ($qlength-$qlast)) );
835 push (@{$p_hitnames}, ">$tname\n" );
836 push (@{$p_hitdiag}, $tfirst-$qfirst);
837 }
838
839 if ($onlyfirst>0) {last;} # extract only first (seed?) sequence in each alignment
840
841 } # end: for (my $k=0; $k<@{$tname}; $k++)
842
843 # Paste aligned subsequence of query over $hitseqs[0]
844 if (${$p_hitseqs}[0] eq "") {${$p_hitseqs}[0] = "-" x $qlength;}
845 if (!$qfile) {substr(${$p_hitseqs}[0],$qfirst-1,length($aaq),$aaq);}
846
847 return;
848 }
849
850
851 ##################################################################################
852 # Read Alignment from infile (*.hhr file)
853 # Results:
854 # $aaq: query residues in present alignment
855 # $qfirst: index of first query residue in present alignment
856 # @tname: template names in present alignmen
857 # @tfirst: indices of first residues in present alignmet
858 # @tseq: sequences of templates in present alignment
859 ##################################################################################
860 sub ReadAlignment() {
861
862 @qname=(); # name of $it'th query in this alignment
863 @qfirst=(); # index of first residue in $it'th query in this alignment
864 @qlast=(); # index of last residue in $it'th query in this alignment
865 @qseq=(); # residues of $it'th query in this alignment
866
867 @tname=(); # name of $it'th template in this alignment
868 @tfirst=(); # index of first residue in $it'th template in this alignment
869 @tlast=(); # index of last residue in $it'th template in this alignment
870 @tseq=(); # residues of $it'th template in this alignment
871
872 if ($v>=3) {printf("Searching for Q $qname vs T $tname\n");}
873 $line=<INFILE>;
874
875 # Search for first line beginning with Q ot T and not followed by aa_, ss_pred, ss_conf, or Consensus
876 while (1) {
877 my $i; # index for query sequence in this alignment
878 # Scan up to first line starting with Q; stop when line 'No\s+\d+' or 'Done' is found
879 while (defined $line && $line!~/^Q\s(\S+)/) {
880 if ($line=~/^No\s+\d/ || $line=~/^Done/) {last;}
881 $line=<INFILE>; next;
882 }
883 if (!defined $line || $line=~/^No\s+\d/ || $line=~/^Done/) {last;}
884
885 # Scan up to first line that is not secondary structure line or consensus line
886 while (defined $line && $line=~/^Q\s+(ss_|sa_|aa_|Consens|Cons-)/) {$line=<INFILE>;}
887
888 # Read next block of query sequences
889 $i=0;
890 while ($line=~/^Q\s+/) {
891 if ($line!~/^Q\s+(ss_|sa_|aa_|Consens|Cons-)/ && $line=~/^Q\s*(\S+)\s+(\d+)\s+(\S+)\s+(\d+)\s+\((\d+)/) {
892 $qname[$i]=$1;
893 if (!$qfirst[$i]) {$qfirst[$i]=$2;} # if $qfirst is undefined then this is the first alignment block -> set $qfirst to $1
894 if (!$qseq[$i]) {$qseq[$i]=$3;} else {$qseq[$i].=$3;}
895 $qlast[$i]=$4;
896 if ($i==0) {$qlength=$5}
897 $i++;
898 }
899 $line=<INFILE>;
900 }
901 if ($i==0) {
902 die("\nError in $program: bad format in $infile, line $.: query block\n");
903 }
904
905 # Scan up to first line starting with T
906 while (defined $line && $line!~/^T\s+(\S+)/) {$line=<INFILE>;}
907
908 # Scan up to first line that is not secondary structure line or consensus line
909 while (defined $line && $line=~/^T\s+(ss_|sa_|aa_|Consens|Cons-)/) {$line=<INFILE>;}
910
911 # Read next block of template sequences
912 $i=0;
913 while ($line=~/^T\s+/) {
914 if ($line!~/^T\s+(ss_|sa_|aa_|Consens|Cons-)/ && $line=~/^T\s*(\S+)\s+(\d+)\s+(\S+)\s+(\d+)\s+\((\d+)/){
915 $tname[$i]=$1;
916 if (!$tfirst[$i]) {$tfirst[$i]=$2;} # if $tfirst is undefined then this is the first alignment block -> set $tfirst to $1
917 if (!$tseq[$i]) {$tseq[$i]=$3;} else {$tseq[$i].=$3;}
918 $tlast[$i]=$4;
919 if ($i==0) {$tlength=$5}
920 $i++;
921 }
922 $line=<INFILE>;
923 }
924 if ($i==0) {
925 die("\nError in $program: bad format in $infile, line $.: template block\n");
926 }
927
928 } # end while ($line=<INFILE>)
929
930 # if (!$qfirst) {$qfirst=1;} # if still $qfirst==0 then set $qfirst to 1
931 # for (my $i=0; $i<@tfirst; $i++) {
932 # if (!$tfirst[$i]) {$tfirst[$i]=1;} # if still $tfirst[$i]==0 then set $tfirst to 1
933 # }
934
935 # Check lengths
936 if (length($qseq[0])!=length($tseq[0])) {
937 print("\nError: query and template lines do not have the same length in $infile, line $.\n");
938 for (my $i=0; $i<@qfirst; $i++) {
939 printf("Q %-14.14s %s\n",$qname[$i],$qseq[$i]);
940 }
941 printf("\n");
942 for (my $i=0; $i<@tfirst; $i++) {
943 printf("T %-14.14s %s\n",$tname[$i],$tseq[$i]);
944 }
945 printf("\n");
946 exit 1;
947 }
948
949 if ($v>=3) {
950 for (my $i=0; $i<@qfirst; $i++) {
951 printf("Q %-14.14s %s\n",$qname[$i],$qseq[$i]);
952 }
953 printf("\n");
954 for (my $i=0; $i<@tfirst; $i++) {
955 printf("T %-14.14s %s\n",$tname[$i],$tseq[$i]);
956 }
957 printf("\n");
958 }
959 return;
960 }
961
962
963 ##################################################################################
964 # Write Alignment to $printblock[$k]
965 ##################################################################################
966 sub WritePairwiseAlignments() {
967
968 #Delete columns with gaps in both sequences
969 $aaq=uc($aaq);
970 $aat=uc($aat);
971 @aaq=split(//,$aaq);
972 @aat=split(//,$aat);
973 my $col=0;
974 for ($col1=0; $col1<@aaq; $col1++) {
975 if ($aaq[$col1]=~tr/a-zA-Z/a-zA-Z/ || $aat[$col1]=~tr/a-zA-Z/a-zA-Z/) {
976 $aaq[$col]=$aaq[$col1];
977 $aat[$col]=$aat[$col1];
978 $col++;
979 }
980 }
981 splice(@aaq,$col); # delete end of @aaq;
982 splice(@aat,$col);
983 $aaq=join("",@aaq);
984 $aat=join("",@aat);
985
986
987 # Count query and template residues into @i1 and @j1
988 for ($col1=0; $col1<@aaq; $col1++) {
989 if ($aaq[$col1]=~tr/a-zA-Z/a-zA-Z/) {
990 $i1[$col1]=$qfirst++; #found query residue in $col1
991 } else {
992 $i1[$col1]=0; #found gap in $col1
993 }
994 if ($aat[$col1]=~tr/a-zA-Z/a-zA-Z/) {
995 $j1[$col1]=$tfirst++; #found template residue in $col1
996 } else {
997 $j1[$col1]=0; #found gap in $col1
998 }
999 }
1000
1001
1002 # DEBUG
1003 if ($v>=3) {
1004 printf ("col Q i1 T j1\n");
1005 for ($col1=0; $col1<@aaq; $col1++) {
1006 printf ("%3i %s %3i %s %3i\n",$col1,$aaq[$col1],$i1[$col1],$aat[$col1],$j1[$col1]);
1007 }
1008 printf ("\n");
1009 }
1010
1011
1012 # Read protein chain from pdb file
1013 # ----+----1----+----2----+----3----+----4----+----5----+----6----+----7----+----8
1014 # ATOM 1 N SER A 27 38.637 79.034 59.693 1.00 79.70 # ATOM 2083 CD1 LEU A 22S 15.343 -12.020 43.761 1.00 5.00 C
1015
1016 # Extract pdbcode and construct name of pdbfile and return in global variables $pdbid and $chain
1017 if (&ExtractPdbcodeAndChain($tname)) {next;}
1018
1019 # Read sequence from pdb file
1020 if (! defined $pdbfile) {die ("Error in $program: Couldn't find pdb code in $tname\n");}
1021 open (PDBFILE, "$pdbfile") || die ("Error in $program: Couldn't open $pdbfile: $!\n");
1022 if ($chain eq "[A ]") {$pdbcode.="_A";} elsif ($chain eq ".") {;} else {$pdbcode.="_$chain";}
1023 $aapdb=""; $l=1;
1024 $line=<PDBFILE>;
1025 while ($line) {if ($line=~/^ATOM/) {last;} $line=<PDBFILE>;} # advance to ATOM records
1026 my @nres; # $nres[$l] = pdb residue index for residue $aapdb[$l]
1027 my @coord; # $coord[$l] = coordinates of CA atom of residue $aapdb[$l]
1028 while ($line) {
1029 if ($line=~/^ATOM\s+\d+ CA [ AB](\w{3}) $chain\s*(-?\d+.) (\s*\S+\s+\S+\s+\S+)/ ||
1030 ($line=~/^HETATM\s+\d+ CA [ AB](\w{3}) $chain\s*(-?\d+.) (\s*\S+\s+\S+\s+\S+)/ && &Three2OneLetter($1) ne "X") ) {
1031 $res=$1;
1032 $nres[$l]=$2;
1033 $coord[$l]=$3." 1.00";
1034 $res=&Three2OneLetter($res);
1035 $aapdb[$l]=$res;
1036 $aapdb.=$res;
1037 $l++;
1038 }
1039 elsif ($l>10 && $line=~/^ATOM\s+\d+ CA/) {last;}
1040 elsif ($line=~/^ENDMDL/) {last;} # if file contains NMR models read only first one
1041 $line=<PDBFILE>;
1042 }
1043 close (PDBFILE);
1044
1045 # Align template in hh-alignment ($aat) with template sequence in pdb ($aapdb)
1046
1047 my $xseq=$aat;
1048 my $yseq=$aapdb;
1049 my ($jmin,$jmax,$lmin,$lmax);
1050 my $Sstr;
1051 my $score;
1052 $xseq=~tr/-/~/d; # transform Deletes to '~' to distinguish them from gaps inserted by Align.pm
1053 #the aligned characters are returend in $j2[$col2] and $l2[$col2]
1054
1055 if ($v>=3) {
1056 printf("Template (hh) $xseq\n");
1057 printf("Identities $Sstr\n");
1058 printf("Template (pdb) $yseq\n");
1059 printf("\n");
1060 }
1061
1062 $score=&AlignNW(\$xseq,\$yseq,\@j2,\@l2,\$jmin,\$jmax,\$lmin,\$lmax,\$Sstr);
1063
1064 # DEBUG
1065 if ($v>=3) {
1066 printf("Template (hh) $xseq\n");
1067 printf("Identities $Sstr\n");
1068 printf("Template (pdb) $yseq\n");
1069 printf("\n");
1070 if ($v>=4) {
1071 for ($col2=0; $col2<@l2 && $col2<200; $col2++) {
1072 printf("%3i %3i %3i\n",$col2,$j2[$col2],$l2[$col2]);
1073 }
1074 }
1075 }
1076
1077 # DEBUG
1078
1079 # Construct alignment of $aaq <-> $aapdb via alignments $aaq <-> $aat and $aat <-> $aapdb:
1080 # Find $l1[$col1] = line of pdb file corresponding to residue $aat[$col1] and $aaq[$col1]
1081 $col2=0;
1082 for ($col1=0; $col1<@aaq; $col1++) {
1083 if ($j1[$col1]==0 || $i1[$col1]==0) {$l1[$col1]=0; next;} # skip gaps in query and gaps in template
1084 while ($j2[$col2]<$col1+1) {$col2++;} # in $j2[col2] first index is 1, in $col1 first column is 0
1085 $l1[$col1] = $l2[$col2];
1086 if ($v>=4) {printf("l1[%i]=%i l2[%i]=%i\n",$col1,$l1[$col1],$col2,$l2[$col2]);}
1087 }
1088
1089
1090 if ($pdbcode ne "NONE") {
1091 if ($outformat eq "TS") {
1092 for ($col1=0; $col1<@aat; $col1++) {
1093 if ($i1[$col1]==0) {next;} # skip gaps in query
1094 if ($j1[$col1]==0) {next;} # skip gaps in template sequence
1095 if ($l1[$col1]==0) {next;} # skip if corresponding residue was skipped in pdb file
1096
1097 $printblock[$k].=sprintf("ATOM %5i CA %3s %4i %-50.50s\n",$i1[$col1],&One2ThreeLetter($aaq[$col1]),$i1[$col1]+$shift,$coord[$l1[$col1]]);
1098 if ($v>=4) {
1099 printf("ATOM %5i CA %3s %4i %-50.50s\n",$i1[$col1],&One2ThreeLetter($aaq[$col1]),$i1[$col1]+$shift,$coord[$l1[$col1]]);
1100 }
1101 }
1102 } else {
1103 for ($col1=0; $col1<@aat; $col1++) {
1104 if ($i1[$col1]==0) {next;} # skip gaps in query
1105 if ($j1[$col1]==0) {next;} # skip gaps in template sequence
1106 if ($l1[$col1]==0) {next;} # skip if corresponding residue was skipped in pdb file
1107 $printblock[$k].=sprintf("%1s %3i %1s %s\n",$aaq[$col1],$i1[$col1],$aat[$col1],$nres[$l1[$col1]]);
1108 if ($v>=4) {printf("%1s %3i %1s %s\n",$aaq[$col1],$i1[$col1],$aat[$col1],$nres[$l1[$col1]]);}
1109 }
1110 }
1111 }
1112 $printblock[$k].=sprintf("TER\n");
1113 return;
1114 }
1115
1116
1117
1118 # Extract pdbcode and construct name of pdbfile and return in global variables $pdbid and $chain
1119 sub ExtractPdbcodeAndChain()
1120 {
1121 my $name=$_[0];
1122 $name=~/^(\S+)/;
1123 $name=$1;
1124
1125 # SCOP ID? (d3lkfa_,d3grs_3,d3pmga1,g1m26.1)
1126 if ($name=~/^[defgh](\d[a-z0-9]{3})([a-z0-9_.])[a-z0-9_]$/) {
1127 $pdbcode=$1;
1128 if ($2 eq "_") {$chain="[A ]";} else {$chain=uc($2);}
1129 }
1130
1131 # PDB ID? (8fab, 1a0i)
1132 elsif ($name=~/^(\d[a-z0-9]{3})$/) {
1133 $pdbcode=$1;
1134 $chain="[A ]";
1135 }
1136
1137 # PDB ID? (8fab_A)
1138 elsif ($name=~/^(\d[a-z0-9]{3})_(\S)$/) {
1139 $pdbcode=$1;
1140 $chain=$2;
1141 }
1142
1143 # PDB ID? (1u1z_ABC)
1144 elsif ($name=~/^(\d[a-z0-9]{3})_(\S\S+)$/) {
1145 $pdbcode=$1;
1146 $chain="[$2]";
1147 }
1148
1149 # DALI ID? (8fabA_0,1a0i_2)
1150 elsif ($name=~/^(\d[a-z0-9]{3})([A-Za-z0-9]?)_\d+$/) {
1151 $pdbcode=$1;
1152 $chain=$2;
1153 }
1154
1155 else {
1156 $pdbcode=$name;
1157 $chain="A";
1158 # return 1; # no SCOP/DALI/pdb sequence
1159 }
1160
1161 &FindPDBfile($pdbcode);
1162
1163 if ($pdbfile eq "") {
1164 if ($v>=2) {print("Warning: no pdb file found for sequence name '$name'\n");}
1165 return 1;
1166 }
1167 return 0;
1168 }
1169
1170
1171 # Resort arrays according to sorting array0:
1172 # Resort(\@array0,\@array1,...,\@arrayN)
1173 sub Sort()
1174 {
1175 my $p_array0 = $_[0];
1176 my @index=();
1177 for (my $i=0; $i<@{$p_array0}; $i++) {$index[$i]=$i;}
1178 @index = sort { ${$p_array0}[$a] <=> ${$p_array0}[$b] } @index;
1179 foreach my $p_array (@_) {
1180 my @dummy = @{$p_array};
1181 @{$p_array}=();
1182 foreach my $i (@index) {
1183 push(@{$p_array}, $dummy[$i]);
1184 }
1185 }
1186 }
1187
1188 ##################################################################################
1189 # Convert three-letter amino acid code into one-letter code
1190 ##################################################################################
1191 sub Three2OneLetter {
1192 my $res=uc($_[0]);
1193 if ($res eq "GLY") {return "G";}
1194 elsif ($res eq "ALA") {return "A";}
1195 elsif ($res eq "VAL") {return "V";}
1196 elsif ($res eq "LEU") {return "L";}
1197 elsif ($res eq "ILE") {return "I";}
1198 elsif ($res eq "MET") {return "M";}
1199 elsif ($res eq "PHE") {return "F";}
1200 elsif ($res eq "TYR") {return "Y";}
1201 elsif ($res eq "TRP") {return "W";}
1202 elsif ($res eq "ASN") {return "N";}
1203 elsif ($res eq "ASP") {return "D";}
1204 elsif ($res eq "GLN") {return "Q";}
1205 elsif ($res eq "GLU") {return "E";}
1206 elsif ($res eq "CYS") {return "C";}
1207 elsif ($res eq "PRO") {return "P";}
1208 elsif ($res eq "SER") {return "S";}
1209 elsif ($res eq "THR") {return "T";}
1210 elsif ($res eq "LYS") {return "K";}
1211 elsif ($res eq "HIS") {return "H";}
1212 elsif ($res eq "ARG") {return "R";}
1213
1214 # The HETATM selenomethionine is read by MODELLER like a normal MET in both its HETATM_IO=off and on mode!
1215 elsif ($res eq "MSE") {return "M";} # SELENOMETHIONINE
1216 elsif ($res eq "ASX") {return "B";}
1217 elsif ($res eq "GLX") {return "Z";}
1218 else {return "X";}
1219
1220 # The following post-translationally modified residues are ignored by MODELLER in its default SET HETATM_IO=off mode
1221 # elsif ($res eq "SEC") {return "C";} # SELENOCYSTEINE
1222 # elsif ($res eq "SEP") {return "S";} # PHOSPHOSERINE
1223 # elsif ($res eq "TPO") {return "T";} # PHOSPHOTHREONINE
1224 # elsif ($res eq "TYS") {return "Y";} # SULFONATED TYROSINE
1225 # elsif ($res eq "KCX") {return "K";} # LYSINE NZ-CARBOXYLIC ACID
1226 }
1227
1228 ##################################################################################
1229 # Convert one-letter amino acid code into three-letter code
1230 ##################################################################################
1231 sub One2ThreeLetter {
1232 my $res=uc($_[0]);
1233 if ($res eq "G") {return "GLY";}
1234 elsif ($res eq "A") {return "ALA";}
1235 elsif ($res eq "V") {return "VAL";}
1236 elsif ($res eq "L") {return "LEU";}
1237 elsif ($res eq "I") {return "ILE";}
1238 elsif ($res eq "M") {return "MET";}
1239 elsif ($res eq "F") {return "PHE";}
1240 elsif ($res eq "Y") {return "TYR";}
1241 elsif ($res eq "W") {return "TRP";}
1242 elsif ($res eq "N") {return "ASN";}
1243 elsif ($res eq "D") {return "ASP";}
1244 elsif ($res eq "Q") {return "GLN";}
1245 elsif ($res eq "E") {return "GLU";}
1246 elsif ($res eq "C") {return "CYS";}
1247 elsif ($res eq "P") {return "PRO";}
1248 elsif ($res eq "S") {return "SER";}
1249 elsif ($res eq "T") {return "THR";}
1250 elsif ($res eq "K") {return "LYS";}
1251 elsif ($res eq "H") {return "HIS";}
1252 elsif ($res eq "R") {return "ARG";}
1253 elsif ($res eq "U") {return "SEC";}
1254 elsif ($res eq "B") {return "ASX";}
1255 elsif ($res eq "Z") {return "GLX";}
1256 else {return "UNK";}
1257 }
1258
1259 # Find the pdb file with $pdbcode in pdb directory
1260 sub FindPDBfile() {
1261
1262 my $pdbcode=lc($_[0]);
1263 foreach $pdbdir (@pdbdirs) {
1264 if (! -e "$pdbdir") {warn("Warning in $program: pdb directory '$pdbdir' does not exist!\n"); next;}
1265 if (-e "$pdbdir/all") {$pdbfile="$pdbdir/all/";}
1266 elsif (-e "$pdbdir/divided") {$pdbfile="$pdbdir/divided/".substr($pdbcode,1,2)."/";}
1267 else {$pdbfile="$pdbdir/";}
1268 if ($pdbdir=~/divided.?$/) {$pdbfile.=substr($pdbcode,1,2)."/";}
1269 if (-e $pdbfile."pdb$pdbcode.ent") {$pdbfile.="pdb$pdbcode.ent";}
1270 elsif (-e $pdbfile."pdb$pdbcode.ent.gz") {$pdbfile="gunzip -c $pdbfile"."pdb$pdbcode.ent.gz |";}
1271 elsif (-e $pdbfile."pdb$pdbcode.ent.Z") {$pdbfile="gunzip -c $pdbfile"."pdb$pdbcode.ent.Z |";}
1272 elsif (-e $pdbfile."$pdbcode.pdb") {$pdbfile.="$pdbcode.pdb";}
1273 else {next;}
1274 return $pdbfile;
1275 }
1276 printf(STDERR "Warning in $program: Cannot find pdb file $pdbfile"."pdb$pdbcode.ent!\n");
1277 return "";
1278 }