6
|
1 #! /usr/bin/env perl
|
|
2 #
|
|
3 # hhmakemodel.pl
|
|
4 # Generate a model from an output alignment of hhsearch.
|
|
5 # Usage: hhmakemodel.pl -i file.out (-ts file.pdb|-al file.al) [-m int|-m name|-m auto] [-pdb pdbdir]
|
|
6
|
|
7 # (C) Johannes Soeding 2012
|
|
8
|
|
9 # HHsuite version 2.0.16 (January 2013)
|
|
10 #
|
|
11 # Reference:
|
|
12 # Remmert M., Biegert A., Hauser A., and Soding J.
|
|
13 # HHblits: Lightning-fast iterative protein sequence searching by HMM-HMM alignment.
|
|
14 # Nat. Methods, epub Dec 25, doi: 10.1038/NMETH.1818 (2011).
|
|
15
|
|
16 # This program is free software: you can redistribute it and/or modify
|
|
17 # it under the terms of the GNU General Public License as published by
|
|
18 # the Free Software Foundation, either version 3 of the License, or
|
|
19 # (at your option) any later version.
|
|
20
|
|
21 # This program is distributed in the hope that it will be useful,
|
|
22 # but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
23 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
24 # GNU General Public License for more details.
|
|
25
|
|
26 # You should have received a copy of the GNU General Public License
|
|
27 # along with this program. If not, see <http://www.gnu.org/licenses/>.
|
|
28
|
|
29 # We are very grateful for bug reports! Please contact us at soeding@genzentrum.lmu.de
|
|
30
|
|
31 use lib $ENV{"HHLIB"}."/scripts";
|
|
32 use HHPaths; # config file with path variables for nr, blast, psipred, pdb, dssp etc.
|
|
33 use strict;
|
|
34 use Align;
|
|
35
|
|
36
|
|
37 $|=1; # force flush after each print
|
|
38
|
|
39 # Default parameters
|
|
40 our $d=7; # gap opening penalty for Align.pm; more than 2 mismatches - 2 matches ## previously: 1
|
|
41 our $e=0.01; # gap extension penatlty for Align.pm; allow to leave large gaps bridging uncrystallized regions ## previously: 0.1
|
|
42 our $g=0.1; # endgap penalty for Align.pm; allow to shift SEQRES residues for uncrystallized aas to ends of alignment ## previously: 0.9
|
|
43 my $v=2; # 3: DEBUG
|
|
44
|
|
45 my $formatting="CASP"; # CASP or LIVEBENCH
|
|
46 my $servername="My server"; #
|
|
47 my $MINRES=30; # minumum number of new query residues required for a hit to be used as additional parent
|
|
48 my $infile="";
|
|
49 my $outfile="";
|
|
50 my $outformat="fas";
|
|
51 my $pickhits="1 "; # default: build one model from best hit
|
|
52 my $Pthr=0;
|
|
53 my $Ethr=0;
|
|
54 my $Prob=0;
|
|
55 my $shift=0; # ATTENTION: set to 0 as default!
|
|
56 my $NLEN=14; # length of the name field in alignments of hhsearch-output
|
|
57 my $NUMRES=100; # number of residues per line in FASTA, A2M, PIR format
|
|
58 my $program=$0; # name of perl script
|
|
59 my $usage="
|
|
60 hhmakemodel.pl from HHsuite $VERSION
|
|
61 From the top hits in an hhsearch output file (hhr), you can
|
|
62 * generate a MSA (multiple sequence alignment) containing all representative
|
|
63 template sequences from all selected alignments (options -fas, -a2m, -a3m, -pir)
|
|
64 * generate several concatenated pairwise alignments in AL format (option -al)
|
|
65 * generate several concatenated coarse 3D models in PDB format (option -ts)
|
|
66 In PIR, PDB and AL format, the pdb files are required in order to read the pdb residue numbers
|
|
67 and ATOM records.
|
|
68 The PIR formatted file can be used directly as input to the MODELLER homology modelling package.
|
|
69 Usage: $program [-i] file.hhr [options]
|
|
70
|
|
71 Options:
|
|
72 -i <file.hhr> results file from hhsearch with hit list and alignments
|
|
73 -fas <file.fas> write a FASTA-formatted multiple alignment to file.fas
|
|
74 -a2m <file.a2m> write an A2M-formatted multiple alignment to file.a2m
|
|
75 -a3m <file.a3m> write an A3M-formatted multiple alignment to file.a3m
|
|
76 -m <int> [<int> ...] pick hits with specified indices (default='-m 1')
|
|
77 -p <probability> minimum probability threshold (default=$Pthr)
|
|
78 -e <E-value> maximum E-value threshold (default=$Ethr)
|
|
79 -q <query_ali> use the full-length query sequence in the alignment
|
|
80 (not only the aligned part);
|
|
81 the query alignment file must be in HHM, FASTA, A2M,
|
|
82 or A3M format.
|
|
83 -N use query name from hhr filename (default: use same
|
|
84 name as in hhr file)
|
|
85 -first include only first Q or T sequence of each hit in MSA
|
|
86 -v verbose mode (default=$v)
|
|
87
|
|
88 Options when database matches in hhr file are PDB or SCOP sequences
|
|
89 -pir <file.pir> write a PIR-formatted multiple alignment to file.pir
|
|
90 -ts <file.pdb> write the PDB-formatted models based on *pairwise*
|
|
91 alignments into file.pdb
|
|
92 -al <file.al> write the AL-formatted *pairwise* alignments into file.al
|
|
93 -d <pdbdirs> directories containing the pdb files (for PDB, SCOP, or DALI
|
|
94 sequences) (default=$pdbdir)
|
|
95 -s <int> shift the residue indices up/down by an integer (default=$shift);
|
|
96 -CASP formatting for CASP (for -ts, -al options) (default: LIVEBENCH
|
|
97 formatting)
|
|
98 Options when query is compared to itself (for repeat detection)
|
|
99 -conj include also conjugate alignments in MSA (with query and
|
|
100 template exchanged)
|
|
101 -conjs include conjugate alignments and sort by ascending diagonal
|
|
102 value (i.e. i0-j0)
|
|
103 \n";
|
|
104
|
|
105 # Options to help extract repeats from self-alignments:
|
|
106 # 1. default 2. -conj 3. -conj_diag 4. -conj_compact
|
|
107 # ABCD ABCD ---A ABCD
|
|
108 # BCD- BCD- --AB BCDA
|
|
109 # D--- CD-- -ABC CDAB
|
|
110 # CD-- D--- ABCD DABC
|
|
111 # ---A BCD-
|
|
112 # --AB CD--
|
|
113 # -ABC D---
|
|
114
|
|
115
|
|
116 # Variable declarations
|
|
117 my $line; # input line
|
|
118 my $score=-1; # score of the best model; at the moment: Probability
|
|
119 my $qname=""; # name of query from hhsearch output file (infile)
|
|
120 my $tname=""; # name of template (hit) from hhsearch output file (infile)
|
|
121 my $qnameline=""; # nameline of query
|
|
122 my $tnameline; # nameline of template
|
|
123 my $pdbfile; # name of pdbfile to read
|
|
124 my $pdbcode; # four-letter pdb code in lower case and _A if chain A (e.g. 1h7w_A)
|
|
125 my $aaq; # query amino acids from hhsearch output
|
|
126 my @aaq; # query amino acids from hhsearch output
|
|
127 my @qname; # query names in present alignment as returned from ReadAlignment()
|
|
128 my @qfirst; # indices of first residues in present alignmet as returned from ReadAlignment()
|
|
129 my @qlast; # indices of last residues in present alignmet as returned from ReadAlignment()
|
|
130 my @qseq; # sequences of querys in present alignment as returned from ReadAlignment()
|
|
131 my @tname; # template names in present alignment as returned from ReadAlignment()
|
|
132 my @tfirst; # indices of first residues in present alignmet as returned from ReadAlignment()
|
|
133 my @tlast; # indices of last residues in present alignmet as returned from ReadAlignment()
|
|
134 my @tseq; # sequences of templates in present alignment as returned from ReadAlignment()
|
|
135 my $aat; # template amino acids from hhsearch output
|
|
136 my @aat; # template amino acids from hhsearch output
|
|
137 my $aapdb; # template amino acids from pdb file
|
|
138 my @aapdb; # template amino acids from pdb file
|
|
139 my $qfirst=0; # first residue of query
|
|
140 my $qlast=0; # last residue of query
|
|
141 my $qlength; # length of query sequence
|
|
142 my $tfirst=0; # first residue of template
|
|
143 my $tlast=0; # first residue of template
|
|
144 my $tlength; # length of template sequence
|
|
145 my $l=1; # counts template residues from pdb file (first=1, like for i[col2] and j[col2]
|
|
146 my $col1=0; # counts columns from hhsearch alignment
|
|
147 my $col2=0; # counts columns from alignment (by function &AlignNW) of $aat versus $aapdb
|
|
148 my @i1; # $i1[$col1] = index of query residue in column $col1 of hhsearch-alignment
|
|
149 my @j1; # $j1[$col1] = index of template residue in column $col1 of hhsearch-alignment
|
|
150 my @j2; # $j2[$col2] = index of hhsearch template seq in $col2 of alignment against pdb template sequence
|
|
151 my @l2; # $l2[$col2] = index of pdb template seq in $col2 of alignment against hhsearch template sequence
|
|
152 my @l1; # $l1[$col1] = $l2[$col2]
|
|
153 my $res; # residue name
|
|
154 my $chain; # pdb chain from template name
|
|
155 my $qfile; # name of query sequence file (for -q option)
|
|
156 my $qmatch; # number of match states in alignment
|
|
157 my $hit; # index of hit in hit list
|
|
158 my $k; # index of hit sorted by position in alignment with query (k=1,...,k=@first-2)
|
|
159 my %picked=(); # $picked{$hit} is defined and =$k for hits that will be transformed into model
|
|
160 my @remarks;
|
|
161 my @printblock; # block 0: header block k: k'th hit
|
|
162 my $keyword=""; # either METHOD for CASP format or REMARK for LIVEBENCH format
|
|
163 my $conj=0; # include conjugate sequences? Sort in which way?
|
|
164 my $conjugate=0; # when query is compared to itself: do not include conjugate alignments
|
|
165 my $onlyfirst=0; # include only first representative sequence of each Q/T alignment
|
|
166 my $dummy; # dummy
|
|
167 my $addchain=1; # 1: PDB files contain chain-id as in 1abc_A.pdb (not 1abc.pdb or pdb1abc.pdb etc.)
|
|
168 my $pdbdirs=$pdbdir; # default pdb directory with *.pdb files
|
|
169 my $options="";
|
|
170
|
|
171 # Processing command line options
|
|
172 if (@ARGV<1) {die $usage;}
|
|
173 for (my $i=0; $i<@ARGV; $i++) {$options.=" $ARGV[$i] ";}
|
|
174
|
|
175
|
|
176 # Set options
|
|
177 if ($options=~s/ -i\s+(\S+) / /g) {$infile=$1;}
|
|
178 if ($options=~s/ -q\s+(\S+) / /g) {$qfile=$1;}
|
|
179 if ($options=~s/ -ts\s+(\S+) / /ig) {$outfile=$1; $outformat="TS";}
|
|
180 if ($options=~s/ -pdb\s+(\S+) / /ig) {$outfile=$1; $outformat="TS";}
|
|
181 if ($options=~s/ -al\s+(\S+) / /ig) {$outfile=$1; $outformat="AL";}
|
|
182 if ($options=~s/ -pir\s+(\S+) / /ig) {$outfile=$1; $outformat="PIR";}
|
|
183 if ($options=~s/ -fas\s+(\S+) / /ig) {$outfile=$1; $outformat="FASTA";}
|
|
184 if ($options=~s/ -a2m\s+(\S+) / /ig) {$outfile=$1; $outformat="A2M";}
|
|
185 if ($options=~s/ -a3m\s+(\S+) / /ig) {$outfile=$1; $outformat="A3M";}
|
|
186 if ($options=~s/ -p\s+(\S+) / /g) {$Pthr=$1;}
|
|
187 if ($options=~s/ -e\s+(\S+) / /g) {$Ethr=$1;}
|
|
188 if ($options=~s/ -s\s+(\S+) / /g) {$shift=$1;}
|
|
189 if ($options=~s/ -d\s+(([^-\s]\S*\s+)*)/ /g) {$pdbdirs=$1;}
|
|
190 if ($options=~s/ -m\s+((\d+\s+)+)/ /g) {$pickhits=$1; }
|
|
191 if ($options=~s/ -first\s+/ /ig) {$onlyfirst=1;}
|
|
192
|
|
193 # Self-alignment options
|
|
194 if ($options=~s/ -conj\s+/ /ig) {$conj=1;}
|
|
195 if ($options=~s/ -conjs\s+/ /ig) {$conj=2;}
|
|
196
|
|
197 # Switch formatting and method description
|
|
198 if ($options=~s/ -CASP\s+/ /ig) {$formatting="CASP";}
|
|
199 if ($options=~s/ -LIVEBENCH\s+/ /ig) {$formatting="LIVEBENCH";}
|
|
200 if ($options=~s/ -server\s+(\S+)/ /g) {$servername=$1;}
|
|
201
|
|
202 # Set verbose mode?
|
|
203 if ($options=~s/ -v\s+(\d+) / /g) {$v=$1;}
|
|
204 elsif ($options=~s/ -v\s+/ /g) {$v=1;}
|
|
205
|
|
206 # Read infile and outfile
|
|
207 if (!$infile && $options=~s/^\s*([^-]\S+)\s*/ /) {$infile=$1;}
|
|
208 if (!$outfile && $options=~s/^\s*([^-]\S+)\s*/ /) {$outfile=$1;}
|
|
209 if ($options=~s/ -N / /ig) {
|
|
210 $qname=$infile;
|
|
211 $qname=~s/^.*?([^\/]+)$/$1/; # remove path
|
|
212 $qname=~s/^(.*)\.[^\.]*$/$1/; # remove extension
|
|
213 $qnameline=$qname;
|
|
214 }
|
|
215
|
|
216 # Warn if unknown options found or no infile/outfile
|
|
217 if ($options!~/^\s*$/) {$options=~s/^\s*(.*?)\s*$/$1/g; die("Error: unknown options '$options'\n");}
|
|
218 if ($infile eq "") {die("$usage\nError in $program: input file missing: $!\n");}
|
|
219 if ($outfile eq "") {die("$usage\nError in $program: output file missing: $!\n");}
|
|
220
|
|
221 my @pdbdirs = split(/\s+/,$pdbdirs);
|
|
222
|
|
223 # Find query name in input file
|
|
224 open (INFILE, "<$infile") || die "Error in $program: Couldn't open $infile: $!\n";
|
|
225 while ($line=<INFILE>) {
|
|
226 if ($v>=3) {print("$line");}
|
|
227 if ($qname eq "" && $line=~/^Query:?\s*(\S+)(.*)/) {$qname=$1; $qnameline=$1.$2;}
|
|
228 if ($line=~/^Match_columns:?\s*(\S+)/) {$qmatch=$1; last;}
|
|
229 }
|
|
230 if (!($line=<INFILE>)) {die ("Error in $program: wrong format in $infile: $!\n");}
|
|
231
|
|
232
|
|
233 # Prepare hash %pick with indices of hits that will be transformed into model
|
|
234 # No Hit Prob E-value P-value Score SS Cols Query HMM Template HMM
|
|
235 # 1 153l Lysozyme (E.C.3.2.1.17) 100.0 0 0 381.0 19.4 185 1-185 1-185 (185)
|
|
236 # 2 1qsa_A Soluble lytic transglyc 100.0 2.1E-39 2.5E-43 225.8 8.3 149 21-182 423-600 (618)
|
|
237 # 3 1ltm 36 kDa soluble lytic tr 95.9 3.5E-06 4.1E-10 50.3 11.0 95 28-122 76-221 (320)
|
|
238
|
|
239 # option '-m m1 m2 m3': pick models manually
|
|
240 my @pickhits = split(/\s+/,$pickhits);
|
|
241 $k=1;
|
|
242 foreach $hit (@pickhits) {
|
|
243 if (!defined $picked{$hit}) {$picked{$hit}=$k;}
|
|
244 $k++;
|
|
245 }
|
|
246
|
|
247 if ($outformat eq "AL" || $outformat eq "TS") {
|
|
248 &MakePairwiseAlignments();
|
|
249 } else {
|
|
250 &MakeMultipleAlignment();
|
|
251 }
|
|
252 exit;
|
|
253
|
|
254
|
|
255 ##################################################################################
|
|
256 # Construct AL or TS formatted alignment as a list of pairwise alignments
|
|
257 ##################################################################################
|
|
258 sub MakePairwiseAlignments()
|
|
259 {
|
|
260 # Scan through query-vs-template-alignments from infile and create first (combination) model
|
|
261 $hit=0; # counts hits in hit list
|
|
262 my $models=0;
|
|
263 while ($line=<INFILE>) {
|
|
264 if ($line=~/^>(\S+)/) {
|
|
265 $hit++;
|
|
266 if ($Pthr || $Ethr || defined $picked{$hit}) {
|
|
267 # Found right alignment (hit)
|
|
268 if (defined $picked{$hit}) {$k=$picked{$hit};} else {$k=$hit;}
|
|
269
|
|
270 if ($line=~/^>(.*?)\s+E=.*$/) {
|
|
271 $line=$1; # remove E=1.3E-30 etc. at the end
|
|
272 } else {
|
|
273 $line=~/^>(.*)/;
|
|
274 $line=$1;
|
|
275 }
|
|
276 my $nameline=$line;
|
|
277 my $evalue;
|
|
278 $line=<INFILE>;
|
|
279 if ($line=~/Probab\s*=\s*(\S+).*E-value\s*=\s*(\S+)/) {$score=$1; $evalue=$2}
|
|
280 else {$score=0; warn("WARNING: could not print score $line");}
|
|
281 if ($line=~/Aligned_cols=\s*(\S+)/) {;} else {warn("WARNING: could not find aligned_cols\n");}
|
|
282 if ($Pthr && $score<$Pthr) {last;} # Probability too low -> finished
|
|
283 if ($Ethr && $evalue>$Ethr) {last;} # Evalue too high > finished
|
|
284
|
|
285
|
|
286 # Commented out in CASP format
|
|
287 if ($formatting eq "LIVEBENCH") {
|
|
288 $printblock[$k] ="PFRMAT $outformat\n";
|
|
289 $printblock[$k].="TARGET $qname\n";
|
|
290 }
|
|
291
|
|
292 $remarks[$k]="REMARK $k: $nameline\n";
|
|
293 $remarks[$k].="REMARK $line";
|
|
294
|
|
295 &ReadAlignment();
|
|
296 $qfirst = $qfirst[0];
|
|
297 $qlast = $qlast[0];
|
|
298 $aaq = $qseq[0];
|
|
299 $tfirst = $tfirst[0];
|
|
300 $aat = $tseq[0];
|
|
301 $tname = $tname[0];
|
|
302
|
|
303 if ($v>=3) {
|
|
304 for (my $i=0; $i<@qfirst; $i++) {
|
|
305 printf("Q %-14.14s %s\n",$qname[$i],$qseq[$i]);
|
|
306 }
|
|
307 printf("\n");
|
|
308 for (my $i=0; $i<@tfirst; $i++) {
|
|
309 printf("T %-14.14s %s\n",$tname[$i],$tseq[$i]);
|
|
310 }
|
|
311 printf("\n");
|
|
312 }
|
|
313
|
|
314 # Extract pdbcode and construct name of pdbfile and return in global variables $pdbid and $chain
|
|
315 if (&ExtractPdbcodeAndChain($tname[0])) {next;}
|
|
316 if ($chain eq "[A ]") {$pdbcode.="_A";} elsif ($chain eq ".") {;} else {$pdbcode.="_$chain";}
|
|
317
|
|
318 # Read score (=probability)
|
|
319 $printblock[$k].="REMARK $nameline\n";
|
|
320 $printblock[$k].="REMARK $line";
|
|
321 $printblock[$k].="SCORE $score\n";
|
|
322 $printblock[$k].="PARENT $pdbcode\n";
|
|
323 $printblock[$k].="MODEL $k\n";
|
|
324
|
|
325 &WritePairwiseAlignments();
|
|
326 $printblock[$k].="END\n";
|
|
327 $models++;
|
|
328
|
|
329 }
|
|
330 }
|
|
331 }
|
|
332 $k=$#printblock; # set $k to last index in @printblock
|
|
333 if ($k<0) {
|
|
334 $printblock[1]="PARENT NONE\nTER\n";
|
|
335 $printblock[1].="END\n";
|
|
336 if ($v>=1) {print("WARNING: no hits found for model!\n");}
|
|
337 }
|
|
338 close (INFILE);
|
|
339
|
|
340 if ($v>=2) {
|
|
341 printf("$models models built\n");
|
|
342 }
|
|
343
|
|
344
|
|
345
|
|
346 # Write model file header
|
|
347
|
|
348 #---+----1----+----2----+----3----+----4----+----5----+----6----+----7----+----8
|
|
349
|
|
350 # Print header
|
|
351 my $date = scalar(localtime);
|
|
352 if ($formatting eq "CASP") {
|
|
353 $printblock[0]="PFRMAT $outformat\n";
|
|
354 $printblock[0].="TARGET $qname\n";
|
|
355 }
|
|
356 $printblock[0].="REMARK AUTHOR $servername\n";
|
|
357 $printblock[0].="REMARK $date\n";
|
|
358 # $printblock[0].="REMARK J. Soeding \n";
|
|
359
|
|
360 # Add remarks
|
|
361 for ($k=0; $k<@remarks; $k++) {
|
|
362 if (defined $remarks[$k]) {
|
|
363 $printblock[0].=$remarks[$k];
|
|
364 }
|
|
365 }
|
|
366 $printblock[0].="REMARK \n";
|
|
367
|
|
368
|
|
369 # Print @printblock into outfile
|
|
370
|
|
371 open (OUTFILE, ">$outfile") || die "Error in $program: Couldn't open $outfile: $!\n";
|
|
372 foreach my $printstr (@printblock) {
|
|
373 my @printarr=split(/\n/,$printstr);
|
|
374 if ($outformat eq "TS") {
|
|
375 foreach $printstr (@printarr) {
|
|
376 printf(OUTFILE "%-80.80s\n",$printstr);
|
|
377 }
|
|
378 } else {
|
|
379 foreach $printstr (@printarr) {
|
|
380 printf(OUTFILE "%s\n",$printstr);
|
|
381 }
|
|
382 }
|
|
383 }
|
|
384 close (OUTFILE);
|
|
385
|
|
386 if ($outformat eq "TS") {
|
|
387 # Call MaxSprout to generate sidechains
|
|
388 }
|
|
389 return;
|
|
390 }
|
|
391
|
|
392
|
|
393 ##################################################################################
|
|
394 # Construct multiple alignment in FASTA, A2M, or PIR format
|
|
395 ##################################################################################
|
|
396 sub MakeMultipleAlignment()
|
|
397 {
|
|
398 my @hitnames=(); # $hitnames[$k] is the nameline of the ihit'th hit
|
|
399 my @hitseqs=(); # $hitseqs[$k] contains the residues of the ihit'th hit
|
|
400 my @hitdiag=(); # $hitdiag[$k] = $qfirst[0]-$tfirst[0]
|
|
401 my @conjnames=(); # $hitnames[$k] is the nameline of the ihit'th conjugate hit
|
|
402 my @conjseqs=(); # $hitseqs[$k] contains the residues of the ihit'th conjugate hit
|
|
403 my @conjdiag=(); # $hitdiag[$k] = $qfirst[0]-$tfirst[0] for conjugate alignments
|
|
404
|
|
405 my $new_hit; # residues of new hit
|
|
406 my $i; # residue index
|
|
407 my $j; # residue index
|
|
408 my $k; # sequence index
|
|
409
|
|
410 $hitnames[0]="";
|
|
411 $hitseqs[0]="";
|
|
412 $hitdiag[0]=0;
|
|
413 $conjnames[0]="";
|
|
414 $conjseqs[0]="";
|
|
415 $conjdiag[0]=0;
|
|
416
|
|
417 open (INFILE, "<$infile") || die "Error in $program: Couldn't open $infile: $!\n";
|
|
418 $hit=0; # counts hits in hit list
|
|
419
|
|
420 # Read one alignment after the other
|
|
421 while ($line=<INFILE>) {
|
|
422
|
|
423 # Found new aligment
|
|
424 if ($line=~/^>(\S+)/) {
|
|
425 $hit++;
|
|
426
|
|
427 # Is alignment selected by user?
|
|
428 if ($Pthr || $Ethr || defined $picked{$hit}) {
|
|
429
|
|
430 if ($line=~/^>(\S+)(.*)/) {$tname=$1; $tnameline=$1.$2;}
|
|
431 else {die("\nError: bad format in $infile, line $.: code 1\n");}
|
|
432
|
|
433 $line = <INFILE>;
|
|
434 if ($line=~/Probab\s*=\s*(\S+).*E-value\s*=\s*(\S+)/) {
|
|
435 if ($Pthr && $1<$Pthr) {last;} # Probability too low -> finished
|
|
436 if ($Ethr && $2>$Ethr) {last;} # Evalue too high > finished
|
|
437 } else { die("\nError: bad format in $infile, line $.: code 2\n"); }
|
|
438
|
|
439 # Read next alignment with $aaq, $qfirst, @tseq, @first, and @tname
|
|
440 &ReadAlignment();
|
|
441 chomp($tnameline);
|
|
442 if ($tnameline=~/\S+\s+(.*)/) {$tname[0].=" $1";} # template seed sequence gets its description
|
|
443
|
|
444 # Format sequences into @hitseqs and @hitnames
|
|
445 &FormatSequences(\@hitnames,\@hitseqs,\@hitdiag,\@qname,\@qseq,\@qfirst,\@qlast,\$qlength,\@tname,\@tseq,\@tfirst,\@tlast,\$tlength);
|
|
446
|
|
447 # Use conjugate alignments?
|
|
448 if ($conj>0) {
|
|
449 &FormatSequences(\@conjnames,\@conjseqs,\@conjdiag,\@tname,\@tseq,\@tfirst,\@tlast,\$tlength,\@qname,\@qseq,\@qfirst,\@qlast,\$qlength);
|
|
450 }
|
|
451
|
|
452 } # end: if ($Pthr>0 || defined $picked{$hit})
|
|
453
|
|
454 } # end: if ($line=~/^>(\S+)/) # found new alignment
|
|
455
|
|
456 } # end while
|
|
457 close (INFILE);
|
|
458
|
|
459 # Insert full-length query sequence?
|
|
460 if ($qfile) {
|
|
461 $hitseqs[0]="";
|
|
462 open (QFILE, "<$qfile") || die "Error in $program: Couldn't open $qfile: $!\n";
|
|
463 while ($line=<QFILE>) {
|
|
464 if ($line=~/^>/ && $line!~/^>ss_/ && $line!~/^>sa_/ && $line!~/^>aa_/ && $line!~/^>Consensus/) {last;}
|
|
465 }
|
|
466 while ($line=<QFILE>) {
|
|
467 if ($line=~/^>/ || $line=~/^\#/) {last;}
|
|
468 $line=~tr/\n\.-//d;
|
|
469 $line=~tr/a-z/A-Z/;
|
|
470 $hitseqs[0].=$line;
|
|
471 }
|
|
472 close(QFILE);
|
|
473 if ($v>=2) {printf("\nQ(full) %-14.14s %s\n",$qname,$hitseqs[0]);}
|
|
474 }
|
|
475
|
|
476
|
|
477 # DEBUG
|
|
478 if ($v>=3) {
|
|
479 printf("\nQuery %-14.14s %s\n",$qname,$hitseqs[0]);
|
|
480 for ($k=1; $k<@hitnames; $k++) {
|
|
481 printf("T hit %3i %-14.14s %s\n",$k,$hitnames[$k],$hitseqs[$k]);
|
|
482 }
|
|
483 printf("\n");
|
|
484 printf("\nQuery %-14.14s %s\n",$qname,$conjseqs[0]);
|
|
485 for ($k=1; $k<@conjnames; $k++) {
|
|
486 printf("T conj %3i %-14.14s %s\n",$k,$conjnames[$k],$conjseqs[$k]);
|
|
487 }
|
|
488 printf("\n");
|
|
489 }
|
|
490
|
|
491
|
|
492 # Include conjugate sequences?
|
|
493 if ($conj>0) {
|
|
494 shift(@conjseqs); # delete zeroth ("query") sequence of @conjseqs
|
|
495 shift(@conjnames); #
|
|
496 shift(@conjdiag); #
|
|
497
|
|
498 # Sort by diagonals $hitdiag[], $conjdiag[]
|
|
499 &Sort(\@hitdiag,\@hitseqs,\@hitnames);
|
|
500 &Sort(\@conjdiag,\@conjseqs,\@conjnames);
|
|
501
|
|
502 # Append conjugate sequences to hitseqs
|
|
503 splice(@hitseqs,scalar(@hitseqs),0,@conjseqs);
|
|
504 splice(@hitnames,scalar(@hitnames),0,@conjnames);
|
|
505
|
|
506 if ($v>=3) {
|
|
507 printf("\nQuery %-14.14s %s\n",$qname,$hitseqs[0]);
|
|
508 for ($k=1; $k<@hitnames; $k++) {
|
|
509 chomp($hitnames[$k]);
|
|
510 printf("T tot %3i %-14.14s %s\n",$k,$hitnames[$k],$hitseqs[$k]);
|
|
511 $hitnames[$k].="\n";
|
|
512 }
|
|
513 }
|
|
514 }
|
|
515
|
|
516 # Insert gaps:
|
|
517
|
|
518 my @len_ins; # $len_ins[$j] will count the maximum number of inserted residues after match state $j.
|
|
519 my @inserts; # $inserts[$j] contains the insert (in small case) of sequence $k after the $j'th match state
|
|
520 my $insert;
|
|
521 my $ngap;
|
|
522
|
|
523 # For each match state determine length of LONGEST insert after this match state and store in @len_ins
|
|
524 for ($k=0; $k<@hitnames; $k++) {
|
|
525 # split into list of single match states and variable-length inserts
|
|
526 # ([A-Z]|-) is the split pattern. The parenthesis indicate that split patterns are to be included as list elements
|
|
527 # The '#' symbol is prepended to get rid of a perl bug in split
|
|
528 $j=0;
|
|
529 @inserts = split(/([A-Z]|-)/,"#".$hitseqs[$k]."#");
|
|
530 # printf("Sequence $k: $hitseqs[$k]\n");
|
|
531 # printf("Sequence $k: @inserts\n");
|
|
532 foreach $insert (@inserts) {
|
|
533 if( !defined $len_ins[$j] || length($insert)>$len_ins[$j]) {
|
|
534 $len_ins[$j]=length($insert);
|
|
535 }
|
|
536 $j++;
|
|
537 # printf("$insert|");
|
|
538 }
|
|
539 # printf("\n");
|
|
540 }
|
|
541
|
|
542
|
|
543
|
|
544 # After each match state insert residues and fill up with gaps to $len_ins[$i] characters
|
|
545 for ($k=0; $k<@hitnames; $k++) {
|
|
546 # split into list of single match states and variable-length inserts
|
|
547 @inserts = split(/([A-Z]|-)/,"#".$hitseqs[$k]."#");
|
|
548 $j=0;
|
|
549
|
|
550 # append the missing number of gaps after each match state
|
|
551 foreach $insert (@inserts) {
|
|
552 if($outformat eq "FASTA") {
|
|
553 for ($i=length($insert); $i<$len_ins[$j]; $i++) {$insert.="-";}
|
|
554 }
|
|
555 else {
|
|
556 for ($i=length($insert); $i<$len_ins[$j]; $i++) {$insert.=".";}
|
|
557 }
|
|
558 $j++;
|
|
559 }
|
|
560 $hitseqs[$k] = join("",@inserts);
|
|
561 $hitseqs[$k] =~ tr/\#//d; # remove the '#' symbols inserted at the beginning and end
|
|
562 }
|
|
563
|
|
564 # Remove columns at beginning and end with gaps in all sequences
|
|
565 my $remove_start;
|
|
566 my $remove_end;
|
|
567 my $len;
|
|
568 $hitseqs[0]=~/^(-*)/;
|
|
569 $remove_start=length($1);
|
|
570 $hitseqs[0]=~/(-*)$/;
|
|
571 $remove_end=length($1);
|
|
572 for ($k=0; $k<@hitnames; $k++) {
|
|
573 $hitseqs[$k]=~s/^.{$remove_start}(.*).{$remove_end}/$1/;
|
|
574 }
|
|
575 $len=($hitseqs[0]=~tr/a-zA-Z/a-zA-Z/);
|
|
576
|
|
577 # Prepare name line of query
|
|
578 if ($outformat eq "PIR") {
|
|
579 my $qnametmp=$qname;
|
|
580 $qnametmp=~tr/:/;/;
|
|
581 $qnameline=~/^\S+\s*(.*)/;
|
|
582 my $qnamelinetmp=$1;
|
|
583 $qnamelinetmp=~tr/:/;/;
|
|
584 $hitnames[0] = sprintf(">P1;%s\nsequence:%s:%4i: :%4i: :%s: : 0.00: 0.00\n",$qnametmp,$qnametmp,$remove_start+1,$len+$remove_start,$qnamelinetmp);
|
|
585 } else {
|
|
586 # outformat is "FASTA" or "A2M" or "A3M" or ...
|
|
587 $hitnames[0] = ">$qnameline\n";
|
|
588 }
|
|
589
|
|
590 # If pretty diagonally sorted order is wanted...
|
|
591 if ($conj>0) {
|
|
592 if ($conj==2) {
|
|
593 my $center = 0.5*(scalar(@hitseqs)-1);
|
|
594 @conjseqs = splice(@hitseqs,$center+1,$center);
|
|
595 splice(@hitseqs,0,0,@conjseqs);
|
|
596 @hitseqs = reverse(@hitseqs);
|
|
597
|
|
598 @conjnames = splice(@hitnames,$center+1,$center);
|
|
599 splice(@hitnames,0,0,@conjnames);
|
|
600 @hitnames = reverse(@hitnames);
|
|
601 # Shorten namelines of all but first sequence
|
|
602 my %count;
|
|
603 for ($k=0; $k<@hitnames; $k++) {
|
|
604 if ($k==$center) {$k++;}
|
|
605 $hitnames[$k]=~/(\S{1,14})/;
|
|
606 if (!defined $count{$1}) {$count{$1}=0;}
|
|
607 my $count = ++$count{$1};
|
|
608 # printf("vorher: %s ",$hitnames[$k]);
|
|
609 $hitnames[$k]=~s/^(\S{1,14}).*/$1:$count/;
|
|
610 # printf("nachher: %s\n",$hitnames[$k]);
|
|
611 }
|
|
612 } else {
|
|
613 for ($k=0; $k<@hitnames; $k++) {$hitnames[$k]=">$qname\n";}
|
|
614 }
|
|
615 }
|
|
616
|
|
617 # Remove gaps? Captialize?
|
|
618 if ($outformat eq "PIR") {
|
|
619 for ($k=0; $k<@hitnames; $k++) {
|
|
620 $hitseqs[$k].="*";; # Transform to upper case
|
|
621 $hitseqs[$k]=~tr/a-z./A-Z-/; # Transform to upper case
|
|
622 $hitseqs[$k]=~s/(.{1,$NUMRES})/$1\n/g; # insert newlines every NUMRES positions
|
|
623 }
|
|
624 } elsif ($outformat eq "FASTA") {
|
|
625 for ($k=0; $k<@hitnames; $k++) {
|
|
626 $hitseqs[$k]=~tr/a-z./A-Z-/; # Transform to upper case
|
|
627 $hitseqs[$k]=~s/(.{1,$NUMRES})/$1\n/g; # insert newlines every NUMRES positions
|
|
628 }
|
|
629 } elsif ($outformat eq "A2M") {
|
|
630 for ($k=0; $k<@hitnames; $k++) {$hitseqs[$k]=~s/(.{1,$NUMRES})/$1\n/g;} # insert newlines every NUMRES positions
|
|
631 } elsif ($outformat eq "A3M") {
|
|
632 for ($k=0; $k<@hitnames; $k++) {$hitseqs[$k]=~tr/.//d;$hitseqs[$k].="\n";} # Remove gaps aligned to inserts
|
|
633 }
|
|
634
|
|
635 # Write sequences into output file
|
|
636 open (OUTFILE, ">$outfile") || die ("cannot open $outfile:$!");
|
|
637 for ($k=0; $k<@hitnames; $k++) {
|
|
638 print(OUTFILE "$hitnames[$k]$hitseqs[$k]");
|
|
639 }
|
|
640 close OUTFILE;
|
|
641
|
|
642
|
|
643 if ($v>=2) {
|
|
644 printf("%i sequences written to $outfile\n",scalar(@hitnames));
|
|
645 }
|
|
646 }
|
|
647
|
|
648
|
|
649 # Format sequences into @hitseqs and @hitnames
|
|
650 # & Call with FormatSequences(\@hitnames,\@hitseqs,\@qname,\@qseq,\@qfirst,\@qlast,\$qlength,\@tname,\@tseq,\@tfirst,\@tlast,\$tlength);
|
|
651 sub FormatSequences()
|
|
652 {
|
|
653 my $p_hitnames = $_[0]; # typeglob to $hitname
|
|
654 my $p_hitseqs = $_[1]; # ...
|
|
655 my $p_hitdiag = $_[2]; # ...
|
|
656
|
|
657 my $p_qname = $_[3]; #
|
|
658 my $p_qseq = $_[4]; #
|
|
659 my $p_qfirst = $_[5]; #
|
|
660 my $p_qlast = $_[6]; #
|
|
661 my $p_qlength = $_[7]; #
|
|
662
|
|
663 my $p_tname = $_[8]; #
|
|
664 my $p_tseq = $_[9]; #
|
|
665 my $p_tfirst = $_[10]; #
|
|
666 my $p_tlast = $_[11]; #
|
|
667 my $p_tlength = $_[12]; #
|
|
668 my $i;
|
|
669
|
|
670 if ($v>=2) {
|
|
671 if (defined $picked{$hit}) {
|
|
672 print("hit=$hit picked=$picked{$hit} tname=$tname[0]");
|
|
673 } else {
|
|
674 print("hit=$hit picked=evalue<$Ethr tname=$tname[0]");
|
|
675 }
|
|
676 for (my $i=1; $i<@{$p_tname}; $i++) {
|
|
677 print(", $tname[$i]");
|
|
678 }
|
|
679 print("\n");
|
|
680 }
|
|
681
|
|
682
|
|
683 my $qfirst = ${$p_qfirst}[0];
|
|
684 my $qlast = ${$p_qlast}[0];
|
|
685 my $qlength = ${$p_qlength};
|
|
686 my $aaq = ${$p_qseq}[0];
|
|
687
|
|
688 @aaq = unpack("C*",$aaq); # needed for transforming template sequences into a3m based on query residues (NOT HMM match states!)
|
|
689 $aaq=~tr/.-//d; # remove all gaps from query sequence
|
|
690
|
|
691 # For all template sequences in the present alignment
|
|
692 for (my $k=0; $k<@{$p_tname}; $k++) {
|
|
693
|
|
694 $tname =${$p_tname}[$k];
|
|
695 $tfirst=${$p_tfirst}[$k];
|
|
696 $aat =${$p_tseq}[$k];
|
|
697
|
|
698 # Transform template residues into a3m format:
|
|
699 # match states are those where query has residue (NOT where HMM has match state!)
|
|
700 # This makes sense since we want to build a model for the query sequence.
|
|
701 @aat = unpack("C*",$aat);
|
|
702 $aat="";
|
|
703
|
|
704 # Transform all columns with residue in query into match/delete states, all others to inserts
|
|
705 for ($i=0; $i<scalar(@aaq); $i++) {
|
|
706 if ($aaq[$i]!=45 && $aaq[$i]!=46) { # no gap in query
|
|
707 if($aat[$i]==46) {
|
|
708 $aat.="-"; # transform '.' to '-' if aligned with a query residue
|
|
709 } else {
|
|
710 $aat .= uc(chr($aat[$i])); # UPPER case if aligned with a query residue (match state)
|
|
711 }
|
|
712 } else {
|
|
713 if($aat[$i]!=45 && $aat[$i]!=46) { # no gap in template?
|
|
714 $aat.=lc(chr($aat[$i])); # lower case if aligned with a gap in the query (insert state)
|
|
715 }
|
|
716 }
|
|
717 }
|
|
718
|
|
719 if ($v>=2) {
|
|
720 printf("\nQ %-14.14s %s\n",$qname,$aaq);
|
|
721 printf("T %-14.14s %s\n",$tname,$aat);
|
|
722 }
|
|
723
|
|
724 # Outformat is PIR? => read residues and indices from PDB ATOM records
|
|
725 if ($outformat eq "PIR") {
|
|
726
|
|
727 # Extract pdbcode and construct name of pdbfile and return in global variables $pdbid and $chain
|
|
728 if (&ExtractPdbcodeAndChain($tname)) {next;}
|
|
729
|
|
730 # Read sequence from pdb file
|
|
731 if (!open (PDBFILE, "$pdbfile")) {
|
|
732 die ("Error in $program: Couldn't open $pdbfile: $!\n");
|
|
733 }
|
|
734 $aapdb="";
|
|
735 $l=0;
|
|
736 my @nres; # $nres[$l] = pdb residue index for residue $aapdb[$l]
|
|
737 my $nres=-1e6;
|
|
738 my $resolution=-1.00;
|
|
739 my $rvalue=-1.00;
|
|
740 while ($line=<PDBFILE>) {
|
|
741 if ($line=~/^REMARK.*RESOLUTION\.\s+(\d+\.?\d*)/) {$resolution=$1;}
|
|
742 if ($line=~/^REMARK.*R VALUE\s+\(WORKING SET\)\s+:\s+(\d+\.?\d*)/) {$rvalue=$1;}
|
|
743 if ($line=~/^ENDMDL/) {last;} # if file contains NMR models read only first one
|
|
744 if (($line=~/^ATOM\s+\d+ .. [ A](\w{3}) $chain\s*(-?\d+.)/ ||
|
|
745 ($line=~/^HETATM\s+\d+ .. [ A](\w{3}) $chain\s*(-?\d+.)/ && &Three2OneLetter($1) ne "X") ) &&
|
|
746 $2 ne $nres ) {
|
|
747 $res=$1;
|
|
748 $nres=$2;
|
|
749 $nres[$l]=$2;
|
|
750 $res=&Three2OneLetter($res);
|
|
751 $aapdb[$l++]=$res;
|
|
752 $aapdb.=$res;
|
|
753 }
|
|
754 }
|
|
755 close (PDBFILE);
|
|
756 if (length($aapdb)<=0) {die("Error: chain $chain not found in pdb file $pdbfile\n");}
|
|
757
|
|
758 # Align template in hh-alignment ($aat) with template sequence in pdb ($aapdb)
|
|
759 my $xseq=$aat;
|
|
760 $xseq=~tr/-/~/; # transform Deletes to '~' to distinguish them from gaps '-' inserted by Align.pm
|
|
761 my $yseq=$aapdb;
|
|
762 my ($jmin,$jmax,$lmin,$lmax);
|
|
763 my $Sstr;
|
|
764 my $score;
|
|
765 # The aligned characters are returend in $j2[$col2] and $l2[$col2]
|
|
766 $score=&AlignNW(\$xseq,\$yseq,\@j2,\@l2,\$jmin,\$jmax,\$lmin,\$lmax,\$Sstr);
|
|
767
|
|
768 # DEBUG
|
|
769 if ($v>=3) {
|
|
770 printf("Template (hh) $xseq\n");
|
|
771 printf("Identities $Sstr\n");
|
|
772 printf("Template (pdb) $yseq\n");
|
|
773 printf("\n");
|
|
774 if ($v>=4) {
|
|
775 for ($col2=0; $col2<@l2 && $col2<1000; $col2++) {
|
|
776 printf("%3i %3i:%s %3i:%s -> %i\n",$col2,$j2[$col2],substr($aat,$j2[$col2]-1,1),$l2[$col2],substr($aapdb,$l2[$col2]-1,1),$nres[$l2[$col2]-1]);
|
|
777 }
|
|
778 }
|
|
779 }
|
|
780
|
|
781 # check for reasonable alignment
|
|
782 my $num_match = 0;
|
|
783 for ($i=0; $i<@l2; $i++) {
|
|
784 if ($j2[$i] > 0 && $l2[$i] > 0) {
|
|
785 $num_match++;
|
|
786 }
|
|
787 }
|
|
788 if (($score/$num_match) < 1) {
|
|
789 print "WARNING! Match score with PDBfile (score: $score num: $num_match score/num:".($score/$num_match).") to low => $pdbfile not included!\n";
|
|
790 next;
|
|
791 }
|
|
792
|
|
793 # Assign a3m-formatted amino acid sequence from pdb file to $aapdb
|
|
794 $aapdb="";
|
|
795 my @xseq=unpack("C*",$xseq);
|
|
796 my @yseq=unpack("C*",$yseq);
|
|
797 for ($i=0; $i<@yseq; $i++) {
|
|
798 if(($xseq[$i]>=65 && $xseq[$i]<=90) || $xseq[$i]==ord('~')) { # if $aat has upper case residue or Delete state
|
|
799 # Match state
|
|
800 $aapdb.=uc(chr($yseq[$i]));
|
|
801 } else {
|
|
802 # Insert state
|
|
803 if ($yseq[$i]!=45) {$aapdb.=lc(chr($yseq[$i]));} # add only if not a gap '-'
|
|
804 }
|
|
805 }
|
|
806
|
|
807 # Remove overlapping ends of $aapdb
|
|
808 $aapdb=~s/^[a-z]*(.*?)[a-z]*$/$1/;
|
|
809
|
|
810 # Glue gaps at beginning and end of aligned pdb sequence and add sequence to alignment
|
|
811 push (@{$p_hitseqs}, ("-" x ($qfirst-1)).$aapdb.("-" x ($qlength-$qlast)) ); # use ATOM record residues $aapdb!
|
|
812
|
|
813 # Write hitname in PIR format into @hitnames
|
|
814 my $descr;
|
|
815 my $organism;
|
|
816 my $struc=$pdbcode;
|
|
817 if ($tnameline=~/^(\S+)\s+(.*)/) {$descr=$2; $descr=~tr/://d;} else {$descr=" ";}
|
|
818 if ($tnameline=~/^(\S+)\s+.*\s+\{(.*)\}/) {$organism=$2;} else {$organism=" ";}
|
|
819 if (length($chain)>1 || $chain eq ".") { # MODELLER's special symbol for 'chain unspecified'
|
|
820 $chain=".";
|
|
821 } elsif ($addchain && $chain ne " ") {
|
|
822 $struc.="_$chain";
|
|
823 }
|
|
824 # push (@{$p_hitnames}, sprintf(">P1;%s\nstructureX:%4s:%4i:%1s:%4i:%1s:%s:%s:%-.2f:%-.2f\n",$struc,$struc,$nres[$lmin-1],$chain,$nres[$lmax-1],$chain,$descr,$organism,$resolution,$rvalue) );
|
|
825
|
|
826 my $descrtmp=$descr;
|
|
827 $descrtmp=~tr/:/;/;
|
|
828 $organism=~tr/://d;
|
|
829 push (@{$p_hitnames}, sprintf(">P1;%s\nstructureX:%4s: :%1s: :%1s:%s:%s:%-.2f:%-.2f\n",$struc,$struc,$chain,$chain,$descrtmp,$organism,$resolution,$rvalue) );
|
|
830 push (@{$p_hitdiag}, $tfirst-$qfirst);
|
|
831 } else {
|
|
832 # outformat is "FASTA" or "A2M" or "A3M" or ...
|
|
833 # Write hitname in FASTA format into @hitnames
|
|
834 push (@{$p_hitseqs}, ("-" x ($qfirst-1)).$aat.("-" x ($qlength-$qlast)) );
|
|
835 push (@{$p_hitnames}, ">$tname\n" );
|
|
836 push (@{$p_hitdiag}, $tfirst-$qfirst);
|
|
837 }
|
|
838
|
|
839 if ($onlyfirst>0) {last;} # extract only first (seed?) sequence in each alignment
|
|
840
|
|
841 } # end: for (my $k=0; $k<@{$tname}; $k++)
|
|
842
|
|
843 # Paste aligned subsequence of query over $hitseqs[0]
|
|
844 if (${$p_hitseqs}[0] eq "") {${$p_hitseqs}[0] = "-" x $qlength;}
|
|
845 if (!$qfile) {substr(${$p_hitseqs}[0],$qfirst-1,length($aaq),$aaq);}
|
|
846
|
|
847 return;
|
|
848 }
|
|
849
|
|
850
|
|
851 ##################################################################################
|
|
852 # Read Alignment from infile (*.hhr file)
|
|
853 # Results:
|
|
854 # $aaq: query residues in present alignment
|
|
855 # $qfirst: index of first query residue in present alignment
|
|
856 # @tname: template names in present alignmen
|
|
857 # @tfirst: indices of first residues in present alignmet
|
|
858 # @tseq: sequences of templates in present alignment
|
|
859 ##################################################################################
|
|
860 sub ReadAlignment() {
|
|
861
|
|
862 @qname=(); # name of $it'th query in this alignment
|
|
863 @qfirst=(); # index of first residue in $it'th query in this alignment
|
|
864 @qlast=(); # index of last residue in $it'th query in this alignment
|
|
865 @qseq=(); # residues of $it'th query in this alignment
|
|
866
|
|
867 @tname=(); # name of $it'th template in this alignment
|
|
868 @tfirst=(); # index of first residue in $it'th template in this alignment
|
|
869 @tlast=(); # index of last residue in $it'th template in this alignment
|
|
870 @tseq=(); # residues of $it'th template in this alignment
|
|
871
|
|
872 if ($v>=3) {printf("Searching for Q $qname vs T $tname\n");}
|
|
873 $line=<INFILE>;
|
|
874
|
|
875 # Search for first line beginning with Q ot T and not followed by aa_, ss_pred, ss_conf, or Consensus
|
|
876 while (1) {
|
|
877 my $i; # index for query sequence in this alignment
|
|
878 # Scan up to first line starting with Q; stop when line 'No\s+\d+' or 'Done' is found
|
|
879 while (defined $line && $line!~/^Q\s(\S+)/) {
|
|
880 if ($line=~/^No\s+\d/ || $line=~/^Done/) {last;}
|
|
881 $line=<INFILE>; next;
|
|
882 }
|
|
883 if (!defined $line || $line=~/^No\s+\d/ || $line=~/^Done/) {last;}
|
|
884
|
|
885 # Scan up to first line that is not secondary structure line or consensus line
|
|
886 while (defined $line && $line=~/^Q\s+(ss_|sa_|aa_|Consens|Cons-)/) {$line=<INFILE>;}
|
|
887
|
|
888 # Read next block of query sequences
|
|
889 $i=0;
|
|
890 while ($line=~/^Q\s+/) {
|
|
891 if ($line!~/^Q\s+(ss_|sa_|aa_|Consens|Cons-)/ && $line=~/^Q\s*(\S+)\s+(\d+)\s+(\S+)\s+(\d+)\s+\((\d+)/) {
|
|
892 $qname[$i]=$1;
|
|
893 if (!$qfirst[$i]) {$qfirst[$i]=$2;} # if $qfirst is undefined then this is the first alignment block -> set $qfirst to $1
|
|
894 if (!$qseq[$i]) {$qseq[$i]=$3;} else {$qseq[$i].=$3;}
|
|
895 $qlast[$i]=$4;
|
|
896 if ($i==0) {$qlength=$5}
|
|
897 $i++;
|
|
898 }
|
|
899 $line=<INFILE>;
|
|
900 }
|
|
901 if ($i==0) {
|
|
902 die("\nError in $program: bad format in $infile, line $.: query block\n");
|
|
903 }
|
|
904
|
|
905 # Scan up to first line starting with T
|
|
906 while (defined $line && $line!~/^T\s+(\S+)/) {$line=<INFILE>;}
|
|
907
|
|
908 # Scan up to first line that is not secondary structure line or consensus line
|
|
909 while (defined $line && $line=~/^T\s+(ss_|sa_|aa_|Consens|Cons-)/) {$line=<INFILE>;}
|
|
910
|
|
911 # Read next block of template sequences
|
|
912 $i=0;
|
|
913 while ($line=~/^T\s+/) {
|
|
914 if ($line!~/^T\s+(ss_|sa_|aa_|Consens|Cons-)/ && $line=~/^T\s*(\S+)\s+(\d+)\s+(\S+)\s+(\d+)\s+\((\d+)/){
|
|
915 $tname[$i]=$1;
|
|
916 if (!$tfirst[$i]) {$tfirst[$i]=$2;} # if $tfirst is undefined then this is the first alignment block -> set $tfirst to $1
|
|
917 if (!$tseq[$i]) {$tseq[$i]=$3;} else {$tseq[$i].=$3;}
|
|
918 $tlast[$i]=$4;
|
|
919 if ($i==0) {$tlength=$5}
|
|
920 $i++;
|
|
921 }
|
|
922 $line=<INFILE>;
|
|
923 }
|
|
924 if ($i==0) {
|
|
925 die("\nError in $program: bad format in $infile, line $.: template block\n");
|
|
926 }
|
|
927
|
|
928 } # end while ($line=<INFILE>)
|
|
929
|
|
930 # if (!$qfirst) {$qfirst=1;} # if still $qfirst==0 then set $qfirst to 1
|
|
931 # for (my $i=0; $i<@tfirst; $i++) {
|
|
932 # if (!$tfirst[$i]) {$tfirst[$i]=1;} # if still $tfirst[$i]==0 then set $tfirst to 1
|
|
933 # }
|
|
934
|
|
935 # Check lengths
|
|
936 if (length($qseq[0])!=length($tseq[0])) {
|
|
937 print("\nError: query and template lines do not have the same length in $infile, line $.\n");
|
|
938 for (my $i=0; $i<@qfirst; $i++) {
|
|
939 printf("Q %-14.14s %s\n",$qname[$i],$qseq[$i]);
|
|
940 }
|
|
941 printf("\n");
|
|
942 for (my $i=0; $i<@tfirst; $i++) {
|
|
943 printf("T %-14.14s %s\n",$tname[$i],$tseq[$i]);
|
|
944 }
|
|
945 printf("\n");
|
|
946 exit 1;
|
|
947 }
|
|
948
|
|
949 if ($v>=3) {
|
|
950 for (my $i=0; $i<@qfirst; $i++) {
|
|
951 printf("Q %-14.14s %s\n",$qname[$i],$qseq[$i]);
|
|
952 }
|
|
953 printf("\n");
|
|
954 for (my $i=0; $i<@tfirst; $i++) {
|
|
955 printf("T %-14.14s %s\n",$tname[$i],$tseq[$i]);
|
|
956 }
|
|
957 printf("\n");
|
|
958 }
|
|
959 return;
|
|
960 }
|
|
961
|
|
962
|
|
963 ##################################################################################
|
|
964 # Write Alignment to $printblock[$k]
|
|
965 ##################################################################################
|
|
966 sub WritePairwiseAlignments() {
|
|
967
|
|
968 #Delete columns with gaps in both sequences
|
|
969 $aaq=uc($aaq);
|
|
970 $aat=uc($aat);
|
|
971 @aaq=split(//,$aaq);
|
|
972 @aat=split(//,$aat);
|
|
973 my $col=0;
|
|
974 for ($col1=0; $col1<@aaq; $col1++) {
|
|
975 if ($aaq[$col1]=~tr/a-zA-Z/a-zA-Z/ || $aat[$col1]=~tr/a-zA-Z/a-zA-Z/) {
|
|
976 $aaq[$col]=$aaq[$col1];
|
|
977 $aat[$col]=$aat[$col1];
|
|
978 $col++;
|
|
979 }
|
|
980 }
|
|
981 splice(@aaq,$col); # delete end of @aaq;
|
|
982 splice(@aat,$col);
|
|
983 $aaq=join("",@aaq);
|
|
984 $aat=join("",@aat);
|
|
985
|
|
986
|
|
987 # Count query and template residues into @i1 and @j1
|
|
988 for ($col1=0; $col1<@aaq; $col1++) {
|
|
989 if ($aaq[$col1]=~tr/a-zA-Z/a-zA-Z/) {
|
|
990 $i1[$col1]=$qfirst++; #found query residue in $col1
|
|
991 } else {
|
|
992 $i1[$col1]=0; #found gap in $col1
|
|
993 }
|
|
994 if ($aat[$col1]=~tr/a-zA-Z/a-zA-Z/) {
|
|
995 $j1[$col1]=$tfirst++; #found template residue in $col1
|
|
996 } else {
|
|
997 $j1[$col1]=0; #found gap in $col1
|
|
998 }
|
|
999 }
|
|
1000
|
|
1001
|
|
1002 # DEBUG
|
|
1003 if ($v>=3) {
|
|
1004 printf ("col Q i1 T j1\n");
|
|
1005 for ($col1=0; $col1<@aaq; $col1++) {
|
|
1006 printf ("%3i %s %3i %s %3i\n",$col1,$aaq[$col1],$i1[$col1],$aat[$col1],$j1[$col1]);
|
|
1007 }
|
|
1008 printf ("\n");
|
|
1009 }
|
|
1010
|
|
1011
|
|
1012 # Read protein chain from pdb file
|
|
1013 # ----+----1----+----2----+----3----+----4----+----5----+----6----+----7----+----8
|
|
1014 # ATOM 1 N SER A 27 38.637 79.034 59.693 1.00 79.70 # ATOM 2083 CD1 LEU A 22S 15.343 -12.020 43.761 1.00 5.00 C
|
|
1015
|
|
1016 # Extract pdbcode and construct name of pdbfile and return in global variables $pdbid and $chain
|
|
1017 if (&ExtractPdbcodeAndChain($tname)) {next;}
|
|
1018
|
|
1019 # Read sequence from pdb file
|
|
1020 if (! defined $pdbfile) {die ("Error in $program: Couldn't find pdb code in $tname\n");}
|
|
1021 open (PDBFILE, "$pdbfile") || die ("Error in $program: Couldn't open $pdbfile: $!\n");
|
|
1022 if ($chain eq "[A ]") {$pdbcode.="_A";} elsif ($chain eq ".") {;} else {$pdbcode.="_$chain";}
|
|
1023 $aapdb=""; $l=1;
|
|
1024 $line=<PDBFILE>;
|
|
1025 while ($line) {if ($line=~/^ATOM/) {last;} $line=<PDBFILE>;} # advance to ATOM records
|
|
1026 my @nres; # $nres[$l] = pdb residue index for residue $aapdb[$l]
|
|
1027 my @coord; # $coord[$l] = coordinates of CA atom of residue $aapdb[$l]
|
|
1028 while ($line) {
|
|
1029 if ($line=~/^ATOM\s+\d+ CA [ AB](\w{3}) $chain\s*(-?\d+.) (\s*\S+\s+\S+\s+\S+)/ ||
|
|
1030 ($line=~/^HETATM\s+\d+ CA [ AB](\w{3}) $chain\s*(-?\d+.) (\s*\S+\s+\S+\s+\S+)/ && &Three2OneLetter($1) ne "X") ) {
|
|
1031 $res=$1;
|
|
1032 $nres[$l]=$2;
|
|
1033 $coord[$l]=$3." 1.00";
|
|
1034 $res=&Three2OneLetter($res);
|
|
1035 $aapdb[$l]=$res;
|
|
1036 $aapdb.=$res;
|
|
1037 $l++;
|
|
1038 }
|
|
1039 elsif ($l>10 && $line=~/^ATOM\s+\d+ CA/) {last;}
|
|
1040 elsif ($line=~/^ENDMDL/) {last;} # if file contains NMR models read only first one
|
|
1041 $line=<PDBFILE>;
|
|
1042 }
|
|
1043 close (PDBFILE);
|
|
1044
|
|
1045 # Align template in hh-alignment ($aat) with template sequence in pdb ($aapdb)
|
|
1046
|
|
1047 my $xseq=$aat;
|
|
1048 my $yseq=$aapdb;
|
|
1049 my ($jmin,$jmax,$lmin,$lmax);
|
|
1050 my $Sstr;
|
|
1051 my $score;
|
|
1052 $xseq=~tr/-/~/d; # transform Deletes to '~' to distinguish them from gaps inserted by Align.pm
|
|
1053 #the aligned characters are returend in $j2[$col2] and $l2[$col2]
|
|
1054
|
|
1055 if ($v>=3) {
|
|
1056 printf("Template (hh) $xseq\n");
|
|
1057 printf("Identities $Sstr\n");
|
|
1058 printf("Template (pdb) $yseq\n");
|
|
1059 printf("\n");
|
|
1060 }
|
|
1061
|
|
1062 $score=&AlignNW(\$xseq,\$yseq,\@j2,\@l2,\$jmin,\$jmax,\$lmin,\$lmax,\$Sstr);
|
|
1063
|
|
1064 # DEBUG
|
|
1065 if ($v>=3) {
|
|
1066 printf("Template (hh) $xseq\n");
|
|
1067 printf("Identities $Sstr\n");
|
|
1068 printf("Template (pdb) $yseq\n");
|
|
1069 printf("\n");
|
|
1070 if ($v>=4) {
|
|
1071 for ($col2=0; $col2<@l2 && $col2<200; $col2++) {
|
|
1072 printf("%3i %3i %3i\n",$col2,$j2[$col2],$l2[$col2]);
|
|
1073 }
|
|
1074 }
|
|
1075 }
|
|
1076
|
|
1077 # DEBUG
|
|
1078
|
|
1079 # Construct alignment of $aaq <-> $aapdb via alignments $aaq <-> $aat and $aat <-> $aapdb:
|
|
1080 # Find $l1[$col1] = line of pdb file corresponding to residue $aat[$col1] and $aaq[$col1]
|
|
1081 $col2=0;
|
|
1082 for ($col1=0; $col1<@aaq; $col1++) {
|
|
1083 if ($j1[$col1]==0 || $i1[$col1]==0) {$l1[$col1]=0; next;} # skip gaps in query and gaps in template
|
|
1084 while ($j2[$col2]<$col1+1) {$col2++;} # in $j2[col2] first index is 1, in $col1 first column is 0
|
|
1085 $l1[$col1] = $l2[$col2];
|
|
1086 if ($v>=4) {printf("l1[%i]=%i l2[%i]=%i\n",$col1,$l1[$col1],$col2,$l2[$col2]);}
|
|
1087 }
|
|
1088
|
|
1089
|
|
1090 if ($pdbcode ne "NONE") {
|
|
1091 if ($outformat eq "TS") {
|
|
1092 for ($col1=0; $col1<@aat; $col1++) {
|
|
1093 if ($i1[$col1]==0) {next;} # skip gaps in query
|
|
1094 if ($j1[$col1]==0) {next;} # skip gaps in template sequence
|
|
1095 if ($l1[$col1]==0) {next;} # skip if corresponding residue was skipped in pdb file
|
|
1096
|
|
1097 $printblock[$k].=sprintf("ATOM %5i CA %3s %4i %-50.50s\n",$i1[$col1],&One2ThreeLetter($aaq[$col1]),$i1[$col1]+$shift,$coord[$l1[$col1]]);
|
|
1098 if ($v>=4) {
|
|
1099 printf("ATOM %5i CA %3s %4i %-50.50s\n",$i1[$col1],&One2ThreeLetter($aaq[$col1]),$i1[$col1]+$shift,$coord[$l1[$col1]]);
|
|
1100 }
|
|
1101 }
|
|
1102 } else {
|
|
1103 for ($col1=0; $col1<@aat; $col1++) {
|
|
1104 if ($i1[$col1]==0) {next;} # skip gaps in query
|
|
1105 if ($j1[$col1]==0) {next;} # skip gaps in template sequence
|
|
1106 if ($l1[$col1]==0) {next;} # skip if corresponding residue was skipped in pdb file
|
|
1107 $printblock[$k].=sprintf("%1s %3i %1s %s\n",$aaq[$col1],$i1[$col1],$aat[$col1],$nres[$l1[$col1]]);
|
|
1108 if ($v>=4) {printf("%1s %3i %1s %s\n",$aaq[$col1],$i1[$col1],$aat[$col1],$nres[$l1[$col1]]);}
|
|
1109 }
|
|
1110 }
|
|
1111 }
|
|
1112 $printblock[$k].=sprintf("TER\n");
|
|
1113 return;
|
|
1114 }
|
|
1115
|
|
1116
|
|
1117
|
|
1118 # Extract pdbcode and construct name of pdbfile and return in global variables $pdbid and $chain
|
|
1119 sub ExtractPdbcodeAndChain()
|
|
1120 {
|
|
1121 my $name=$_[0];
|
|
1122 $name=~/^(\S+)/;
|
|
1123 $name=$1;
|
|
1124
|
|
1125 # SCOP ID? (d3lkfa_,d3grs_3,d3pmga1,g1m26.1)
|
|
1126 if ($name=~/^[defgh](\d[a-z0-9]{3})([a-z0-9_.])[a-z0-9_]$/) {
|
|
1127 $pdbcode=$1;
|
|
1128 if ($2 eq "_") {$chain="[A ]";} else {$chain=uc($2);}
|
|
1129 }
|
|
1130
|
|
1131 # PDB ID? (8fab, 1a0i)
|
|
1132 elsif ($name=~/^(\d[a-z0-9]{3})$/) {
|
|
1133 $pdbcode=$1;
|
|
1134 $chain="[A ]";
|
|
1135 }
|
|
1136
|
|
1137 # PDB ID? (8fab_A)
|
|
1138 elsif ($name=~/^(\d[a-z0-9]{3})_(\S)$/) {
|
|
1139 $pdbcode=$1;
|
|
1140 $chain=$2;
|
|
1141 }
|
|
1142
|
|
1143 # PDB ID? (1u1z_ABC)
|
|
1144 elsif ($name=~/^(\d[a-z0-9]{3})_(\S\S+)$/) {
|
|
1145 $pdbcode=$1;
|
|
1146 $chain="[$2]";
|
|
1147 }
|
|
1148
|
|
1149 # DALI ID? (8fabA_0,1a0i_2)
|
|
1150 elsif ($name=~/^(\d[a-z0-9]{3})([A-Za-z0-9]?)_\d+$/) {
|
|
1151 $pdbcode=$1;
|
|
1152 $chain=$2;
|
|
1153 }
|
|
1154
|
|
1155 else {
|
|
1156 $pdbcode=$name;
|
|
1157 $chain="A";
|
|
1158 # return 1; # no SCOP/DALI/pdb sequence
|
|
1159 }
|
|
1160
|
|
1161 &FindPDBfile($pdbcode);
|
|
1162
|
|
1163 if ($pdbfile eq "") {
|
|
1164 if ($v>=2) {print("Warning: no pdb file found for sequence name '$name'\n");}
|
|
1165 return 1;
|
|
1166 }
|
|
1167 return 0;
|
|
1168 }
|
|
1169
|
|
1170
|
|
1171 # Resort arrays according to sorting array0:
|
|
1172 # Resort(\@array0,\@array1,...,\@arrayN)
|
|
1173 sub Sort()
|
|
1174 {
|
|
1175 my $p_array0 = $_[0];
|
|
1176 my @index=();
|
|
1177 for (my $i=0; $i<@{$p_array0}; $i++) {$index[$i]=$i;}
|
|
1178 @index = sort { ${$p_array0}[$a] <=> ${$p_array0}[$b] } @index;
|
|
1179 foreach my $p_array (@_) {
|
|
1180 my @dummy = @{$p_array};
|
|
1181 @{$p_array}=();
|
|
1182 foreach my $i (@index) {
|
|
1183 push(@{$p_array}, $dummy[$i]);
|
|
1184 }
|
|
1185 }
|
|
1186 }
|
|
1187
|
|
1188 ##################################################################################
|
|
1189 # Convert three-letter amino acid code into one-letter code
|
|
1190 ##################################################################################
|
|
1191 sub Three2OneLetter {
|
|
1192 my $res=uc($_[0]);
|
|
1193 if ($res eq "GLY") {return "G";}
|
|
1194 elsif ($res eq "ALA") {return "A";}
|
|
1195 elsif ($res eq "VAL") {return "V";}
|
|
1196 elsif ($res eq "LEU") {return "L";}
|
|
1197 elsif ($res eq "ILE") {return "I";}
|
|
1198 elsif ($res eq "MET") {return "M";}
|
|
1199 elsif ($res eq "PHE") {return "F";}
|
|
1200 elsif ($res eq "TYR") {return "Y";}
|
|
1201 elsif ($res eq "TRP") {return "W";}
|
|
1202 elsif ($res eq "ASN") {return "N";}
|
|
1203 elsif ($res eq "ASP") {return "D";}
|
|
1204 elsif ($res eq "GLN") {return "Q";}
|
|
1205 elsif ($res eq "GLU") {return "E";}
|
|
1206 elsif ($res eq "CYS") {return "C";}
|
|
1207 elsif ($res eq "PRO") {return "P";}
|
|
1208 elsif ($res eq "SER") {return "S";}
|
|
1209 elsif ($res eq "THR") {return "T";}
|
|
1210 elsif ($res eq "LYS") {return "K";}
|
|
1211 elsif ($res eq "HIS") {return "H";}
|
|
1212 elsif ($res eq "ARG") {return "R";}
|
|
1213
|
|
1214 # The HETATM selenomethionine is read by MODELLER like a normal MET in both its HETATM_IO=off and on mode!
|
|
1215 elsif ($res eq "MSE") {return "M";} # SELENOMETHIONINE
|
|
1216 elsif ($res eq "ASX") {return "B";}
|
|
1217 elsif ($res eq "GLX") {return "Z";}
|
|
1218 else {return "X";}
|
|
1219
|
|
1220 # The following post-translationally modified residues are ignored by MODELLER in its default SET HETATM_IO=off mode
|
|
1221 # elsif ($res eq "SEC") {return "C";} # SELENOCYSTEINE
|
|
1222 # elsif ($res eq "SEP") {return "S";} # PHOSPHOSERINE
|
|
1223 # elsif ($res eq "TPO") {return "T";} # PHOSPHOTHREONINE
|
|
1224 # elsif ($res eq "TYS") {return "Y";} # SULFONATED TYROSINE
|
|
1225 # elsif ($res eq "KCX") {return "K";} # LYSINE NZ-CARBOXYLIC ACID
|
|
1226 }
|
|
1227
|
|
1228 ##################################################################################
|
|
1229 # Convert one-letter amino acid code into three-letter code
|
|
1230 ##################################################################################
|
|
1231 sub One2ThreeLetter {
|
|
1232 my $res=uc($_[0]);
|
|
1233 if ($res eq "G") {return "GLY";}
|
|
1234 elsif ($res eq "A") {return "ALA";}
|
|
1235 elsif ($res eq "V") {return "VAL";}
|
|
1236 elsif ($res eq "L") {return "LEU";}
|
|
1237 elsif ($res eq "I") {return "ILE";}
|
|
1238 elsif ($res eq "M") {return "MET";}
|
|
1239 elsif ($res eq "F") {return "PHE";}
|
|
1240 elsif ($res eq "Y") {return "TYR";}
|
|
1241 elsif ($res eq "W") {return "TRP";}
|
|
1242 elsif ($res eq "N") {return "ASN";}
|
|
1243 elsif ($res eq "D") {return "ASP";}
|
|
1244 elsif ($res eq "Q") {return "GLN";}
|
|
1245 elsif ($res eq "E") {return "GLU";}
|
|
1246 elsif ($res eq "C") {return "CYS";}
|
|
1247 elsif ($res eq "P") {return "PRO";}
|
|
1248 elsif ($res eq "S") {return "SER";}
|
|
1249 elsif ($res eq "T") {return "THR";}
|
|
1250 elsif ($res eq "K") {return "LYS";}
|
|
1251 elsif ($res eq "H") {return "HIS";}
|
|
1252 elsif ($res eq "R") {return "ARG";}
|
|
1253 elsif ($res eq "U") {return "SEC";}
|
|
1254 elsif ($res eq "B") {return "ASX";}
|
|
1255 elsif ($res eq "Z") {return "GLX";}
|
|
1256 else {return "UNK";}
|
|
1257 }
|
|
1258
|
|
1259 # Find the pdb file with $pdbcode in pdb directory
|
|
1260 sub FindPDBfile() {
|
|
1261
|
|
1262 my $pdbcode=lc($_[0]);
|
|
1263 foreach $pdbdir (@pdbdirs) {
|
|
1264 if (! -e "$pdbdir") {warn("Warning in $program: pdb directory '$pdbdir' does not exist!\n"); next;}
|
|
1265 if (-e "$pdbdir/all") {$pdbfile="$pdbdir/all/";}
|
|
1266 elsif (-e "$pdbdir/divided") {$pdbfile="$pdbdir/divided/".substr($pdbcode,1,2)."/";}
|
|
1267 else {$pdbfile="$pdbdir/";}
|
|
1268 if ($pdbdir=~/divided.?$/) {$pdbfile.=substr($pdbcode,1,2)."/";}
|
|
1269 if (-e $pdbfile."pdb$pdbcode.ent") {$pdbfile.="pdb$pdbcode.ent";}
|
|
1270 elsif (-e $pdbfile."pdb$pdbcode.ent.gz") {$pdbfile="gunzip -c $pdbfile"."pdb$pdbcode.ent.gz |";}
|
|
1271 elsif (-e $pdbfile."pdb$pdbcode.ent.Z") {$pdbfile="gunzip -c $pdbfile"."pdb$pdbcode.ent.Z |";}
|
|
1272 elsif (-e $pdbfile."$pdbcode.pdb") {$pdbfile.="$pdbcode.pdb";}
|
|
1273 else {next;}
|
|
1274 return $pdbfile;
|
|
1275 }
|
|
1276 printf(STDERR "Warning in $program: Cannot find pdb file $pdbfile"."pdb$pdbcode.ent!\n");
|
|
1277 return "";
|
|
1278 }
|