Mercurial > repos > hammock > hammock
comparison external_tools/darwin/lib/hh/scripts/hhmakemodel.pl @ 6:2277dd59b9f9 draft
Uploaded
author | hammock |
---|---|
date | Wed, 01 Nov 2017 05:54:28 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
5:b7652b7c97bd | 6:2277dd59b9f9 |
---|---|
1 #! /usr/bin/env perl | |
2 # | |
3 # hhmakemodel.pl | |
4 # Generate a model from an output alignment of hhsearch. | |
5 # Usage: hhmakemodel.pl -i file.out (-ts file.pdb|-al file.al) [-m int|-m name|-m auto] [-pdb pdbdir] | |
6 | |
7 # (C) Johannes Soeding 2012 | |
8 | |
9 # HHsuite version 2.0.16 (January 2013) | |
10 # | |
11 # Reference: | |
12 # Remmert M., Biegert A., Hauser A., and Soding J. | |
13 # HHblits: Lightning-fast iterative protein sequence searching by HMM-HMM alignment. | |
14 # Nat. Methods, epub Dec 25, doi: 10.1038/NMETH.1818 (2011). | |
15 | |
16 # This program is free software: you can redistribute it and/or modify | |
17 # it under the terms of the GNU General Public License as published by | |
18 # the Free Software Foundation, either version 3 of the License, or | |
19 # (at your option) any later version. | |
20 | |
21 # This program is distributed in the hope that it will be useful, | |
22 # but WITHOUT ANY WARRANTY; without even the implied warranty of | |
23 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
24 # GNU General Public License for more details. | |
25 | |
26 # You should have received a copy of the GNU General Public License | |
27 # along with this program. If not, see <http://www.gnu.org/licenses/>. | |
28 | |
29 # We are very grateful for bug reports! Please contact us at soeding@genzentrum.lmu.de | |
30 | |
31 use lib $ENV{"HHLIB"}."/scripts"; | |
32 use HHPaths; # config file with path variables for nr, blast, psipred, pdb, dssp etc. | |
33 use strict; | |
34 use Align; | |
35 | |
36 | |
37 $|=1; # force flush after each print | |
38 | |
39 # Default parameters | |
40 our $d=7; # gap opening penalty for Align.pm; more than 2 mismatches - 2 matches ## previously: 1 | |
41 our $e=0.01; # gap extension penatlty for Align.pm; allow to leave large gaps bridging uncrystallized regions ## previously: 0.1 | |
42 our $g=0.1; # endgap penalty for Align.pm; allow to shift SEQRES residues for uncrystallized aas to ends of alignment ## previously: 0.9 | |
43 my $v=2; # 3: DEBUG | |
44 | |
45 my $formatting="CASP"; # CASP or LIVEBENCH | |
46 my $servername="My server"; # | |
47 my $MINRES=30; # minumum number of new query residues required for a hit to be used as additional parent | |
48 my $infile=""; | |
49 my $outfile=""; | |
50 my $outformat="fas"; | |
51 my $pickhits="1 "; # default: build one model from best hit | |
52 my $Pthr=0; | |
53 my $Ethr=0; | |
54 my $Prob=0; | |
55 my $shift=0; # ATTENTION: set to 0 as default! | |
56 my $NLEN=14; # length of the name field in alignments of hhsearch-output | |
57 my $NUMRES=100; # number of residues per line in FASTA, A2M, PIR format | |
58 my $program=$0; # name of perl script | |
59 my $usage=" | |
60 hhmakemodel.pl from HHsuite $VERSION | |
61 From the top hits in an hhsearch output file (hhr), you can | |
62 * generate a MSA (multiple sequence alignment) containing all representative | |
63 template sequences from all selected alignments (options -fas, -a2m, -a3m, -pir) | |
64 * generate several concatenated pairwise alignments in AL format (option -al) | |
65 * generate several concatenated coarse 3D models in PDB format (option -ts) | |
66 In PIR, PDB and AL format, the pdb files are required in order to read the pdb residue numbers | |
67 and ATOM records. | |
68 The PIR formatted file can be used directly as input to the MODELLER homology modelling package. | |
69 Usage: $program [-i] file.hhr [options] | |
70 | |
71 Options: | |
72 -i <file.hhr> results file from hhsearch with hit list and alignments | |
73 -fas <file.fas> write a FASTA-formatted multiple alignment to file.fas | |
74 -a2m <file.a2m> write an A2M-formatted multiple alignment to file.a2m | |
75 -a3m <file.a3m> write an A3M-formatted multiple alignment to file.a3m | |
76 -m <int> [<int> ...] pick hits with specified indices (default='-m 1') | |
77 -p <probability> minimum probability threshold (default=$Pthr) | |
78 -e <E-value> maximum E-value threshold (default=$Ethr) | |
79 -q <query_ali> use the full-length query sequence in the alignment | |
80 (not only the aligned part); | |
81 the query alignment file must be in HHM, FASTA, A2M, | |
82 or A3M format. | |
83 -N use query name from hhr filename (default: use same | |
84 name as in hhr file) | |
85 -first include only first Q or T sequence of each hit in MSA | |
86 -v verbose mode (default=$v) | |
87 | |
88 Options when database matches in hhr file are PDB or SCOP sequences | |
89 -pir <file.pir> write a PIR-formatted multiple alignment to file.pir | |
90 -ts <file.pdb> write the PDB-formatted models based on *pairwise* | |
91 alignments into file.pdb | |
92 -al <file.al> write the AL-formatted *pairwise* alignments into file.al | |
93 -d <pdbdirs> directories containing the pdb files (for PDB, SCOP, or DALI | |
94 sequences) (default=$pdbdir) | |
95 -s <int> shift the residue indices up/down by an integer (default=$shift); | |
96 -CASP formatting for CASP (for -ts, -al options) (default: LIVEBENCH | |
97 formatting) | |
98 Options when query is compared to itself (for repeat detection) | |
99 -conj include also conjugate alignments in MSA (with query and | |
100 template exchanged) | |
101 -conjs include conjugate alignments and sort by ascending diagonal | |
102 value (i.e. i0-j0) | |
103 \n"; | |
104 | |
105 # Options to help extract repeats from self-alignments: | |
106 # 1. default 2. -conj 3. -conj_diag 4. -conj_compact | |
107 # ABCD ABCD ---A ABCD | |
108 # BCD- BCD- --AB BCDA | |
109 # D--- CD-- -ABC CDAB | |
110 # CD-- D--- ABCD DABC | |
111 # ---A BCD- | |
112 # --AB CD-- | |
113 # -ABC D--- | |
114 | |
115 | |
116 # Variable declarations | |
117 my $line; # input line | |
118 my $score=-1; # score of the best model; at the moment: Probability | |
119 my $qname=""; # name of query from hhsearch output file (infile) | |
120 my $tname=""; # name of template (hit) from hhsearch output file (infile) | |
121 my $qnameline=""; # nameline of query | |
122 my $tnameline; # nameline of template | |
123 my $pdbfile; # name of pdbfile to read | |
124 my $pdbcode; # four-letter pdb code in lower case and _A if chain A (e.g. 1h7w_A) | |
125 my $aaq; # query amino acids from hhsearch output | |
126 my @aaq; # query amino acids from hhsearch output | |
127 my @qname; # query names in present alignment as returned from ReadAlignment() | |
128 my @qfirst; # indices of first residues in present alignmet as returned from ReadAlignment() | |
129 my @qlast; # indices of last residues in present alignmet as returned from ReadAlignment() | |
130 my @qseq; # sequences of querys in present alignment as returned from ReadAlignment() | |
131 my @tname; # template names in present alignment as returned from ReadAlignment() | |
132 my @tfirst; # indices of first residues in present alignmet as returned from ReadAlignment() | |
133 my @tlast; # indices of last residues in present alignmet as returned from ReadAlignment() | |
134 my @tseq; # sequences of templates in present alignment as returned from ReadAlignment() | |
135 my $aat; # template amino acids from hhsearch output | |
136 my @aat; # template amino acids from hhsearch output | |
137 my $aapdb; # template amino acids from pdb file | |
138 my @aapdb; # template amino acids from pdb file | |
139 my $qfirst=0; # first residue of query | |
140 my $qlast=0; # last residue of query | |
141 my $qlength; # length of query sequence | |
142 my $tfirst=0; # first residue of template | |
143 my $tlast=0; # first residue of template | |
144 my $tlength; # length of template sequence | |
145 my $l=1; # counts template residues from pdb file (first=1, like for i[col2] and j[col2] | |
146 my $col1=0; # counts columns from hhsearch alignment | |
147 my $col2=0; # counts columns from alignment (by function &AlignNW) of $aat versus $aapdb | |
148 my @i1; # $i1[$col1] = index of query residue in column $col1 of hhsearch-alignment | |
149 my @j1; # $j1[$col1] = index of template residue in column $col1 of hhsearch-alignment | |
150 my @j2; # $j2[$col2] = index of hhsearch template seq in $col2 of alignment against pdb template sequence | |
151 my @l2; # $l2[$col2] = index of pdb template seq in $col2 of alignment against hhsearch template sequence | |
152 my @l1; # $l1[$col1] = $l2[$col2] | |
153 my $res; # residue name | |
154 my $chain; # pdb chain from template name | |
155 my $qfile; # name of query sequence file (for -q option) | |
156 my $qmatch; # number of match states in alignment | |
157 my $hit; # index of hit in hit list | |
158 my $k; # index of hit sorted by position in alignment with query (k=1,...,k=@first-2) | |
159 my %picked=(); # $picked{$hit} is defined and =$k for hits that will be transformed into model | |
160 my @remarks; | |
161 my @printblock; # block 0: header block k: k'th hit | |
162 my $keyword=""; # either METHOD for CASP format or REMARK for LIVEBENCH format | |
163 my $conj=0; # include conjugate sequences? Sort in which way? | |
164 my $conjugate=0; # when query is compared to itself: do not include conjugate alignments | |
165 my $onlyfirst=0; # include only first representative sequence of each Q/T alignment | |
166 my $dummy; # dummy | |
167 my $addchain=1; # 1: PDB files contain chain-id as in 1abc_A.pdb (not 1abc.pdb or pdb1abc.pdb etc.) | |
168 my $pdbdirs=$pdbdir; # default pdb directory with *.pdb files | |
169 my $options=""; | |
170 | |
171 # Processing command line options | |
172 if (@ARGV<1) {die $usage;} | |
173 for (my $i=0; $i<@ARGV; $i++) {$options.=" $ARGV[$i] ";} | |
174 | |
175 | |
176 # Set options | |
177 if ($options=~s/ -i\s+(\S+) / /g) {$infile=$1;} | |
178 if ($options=~s/ -q\s+(\S+) / /g) {$qfile=$1;} | |
179 if ($options=~s/ -ts\s+(\S+) / /ig) {$outfile=$1; $outformat="TS";} | |
180 if ($options=~s/ -pdb\s+(\S+) / /ig) {$outfile=$1; $outformat="TS";} | |
181 if ($options=~s/ -al\s+(\S+) / /ig) {$outfile=$1; $outformat="AL";} | |
182 if ($options=~s/ -pir\s+(\S+) / /ig) {$outfile=$1; $outformat="PIR";} | |
183 if ($options=~s/ -fas\s+(\S+) / /ig) {$outfile=$1; $outformat="FASTA";} | |
184 if ($options=~s/ -a2m\s+(\S+) / /ig) {$outfile=$1; $outformat="A2M";} | |
185 if ($options=~s/ -a3m\s+(\S+) / /ig) {$outfile=$1; $outformat="A3M";} | |
186 if ($options=~s/ -p\s+(\S+) / /g) {$Pthr=$1;} | |
187 if ($options=~s/ -e\s+(\S+) / /g) {$Ethr=$1;} | |
188 if ($options=~s/ -s\s+(\S+) / /g) {$shift=$1;} | |
189 if ($options=~s/ -d\s+(([^-\s]\S*\s+)*)/ /g) {$pdbdirs=$1;} | |
190 if ($options=~s/ -m\s+((\d+\s+)+)/ /g) {$pickhits=$1; } | |
191 if ($options=~s/ -first\s+/ /ig) {$onlyfirst=1;} | |
192 | |
193 # Self-alignment options | |
194 if ($options=~s/ -conj\s+/ /ig) {$conj=1;} | |
195 if ($options=~s/ -conjs\s+/ /ig) {$conj=2;} | |
196 | |
197 # Switch formatting and method description | |
198 if ($options=~s/ -CASP\s+/ /ig) {$formatting="CASP";} | |
199 if ($options=~s/ -LIVEBENCH\s+/ /ig) {$formatting="LIVEBENCH";} | |
200 if ($options=~s/ -server\s+(\S+)/ /g) {$servername=$1;} | |
201 | |
202 # Set verbose mode? | |
203 if ($options=~s/ -v\s+(\d+) / /g) {$v=$1;} | |
204 elsif ($options=~s/ -v\s+/ /g) {$v=1;} | |
205 | |
206 # Read infile and outfile | |
207 if (!$infile && $options=~s/^\s*([^-]\S+)\s*/ /) {$infile=$1;} | |
208 if (!$outfile && $options=~s/^\s*([^-]\S+)\s*/ /) {$outfile=$1;} | |
209 if ($options=~s/ -N / /ig) { | |
210 $qname=$infile; | |
211 $qname=~s/^.*?([^\/]+)$/$1/; # remove path | |
212 $qname=~s/^(.*)\.[^\.]*$/$1/; # remove extension | |
213 $qnameline=$qname; | |
214 } | |
215 | |
216 # Warn if unknown options found or no infile/outfile | |
217 if ($options!~/^\s*$/) {$options=~s/^\s*(.*?)\s*$/$1/g; die("Error: unknown options '$options'\n");} | |
218 if ($infile eq "") {die("$usage\nError in $program: input file missing: $!\n");} | |
219 if ($outfile eq "") {die("$usage\nError in $program: output file missing: $!\n");} | |
220 | |
221 my @pdbdirs = split(/\s+/,$pdbdirs); | |
222 | |
223 # Find query name in input file | |
224 open (INFILE, "<$infile") || die "Error in $program: Couldn't open $infile: $!\n"; | |
225 while ($line=<INFILE>) { | |
226 if ($v>=3) {print("$line");} | |
227 if ($qname eq "" && $line=~/^Query:?\s*(\S+)(.*)/) {$qname=$1; $qnameline=$1.$2;} | |
228 if ($line=~/^Match_columns:?\s*(\S+)/) {$qmatch=$1; last;} | |
229 } | |
230 if (!($line=<INFILE>)) {die ("Error in $program: wrong format in $infile: $!\n");} | |
231 | |
232 | |
233 # Prepare hash %pick with indices of hits that will be transformed into model | |
234 # No Hit Prob E-value P-value Score SS Cols Query HMM Template HMM | |
235 # 1 153l Lysozyme (E.C.3.2.1.17) 100.0 0 0 381.0 19.4 185 1-185 1-185 (185) | |
236 # 2 1qsa_A Soluble lytic transglyc 100.0 2.1E-39 2.5E-43 225.8 8.3 149 21-182 423-600 (618) | |
237 # 3 1ltm 36 kDa soluble lytic tr 95.9 3.5E-06 4.1E-10 50.3 11.0 95 28-122 76-221 (320) | |
238 | |
239 # option '-m m1 m2 m3': pick models manually | |
240 my @pickhits = split(/\s+/,$pickhits); | |
241 $k=1; | |
242 foreach $hit (@pickhits) { | |
243 if (!defined $picked{$hit}) {$picked{$hit}=$k;} | |
244 $k++; | |
245 } | |
246 | |
247 if ($outformat eq "AL" || $outformat eq "TS") { | |
248 &MakePairwiseAlignments(); | |
249 } else { | |
250 &MakeMultipleAlignment(); | |
251 } | |
252 exit; | |
253 | |
254 | |
255 ################################################################################## | |
256 # Construct AL or TS formatted alignment as a list of pairwise alignments | |
257 ################################################################################## | |
258 sub MakePairwiseAlignments() | |
259 { | |
260 # Scan through query-vs-template-alignments from infile and create first (combination) model | |
261 $hit=0; # counts hits in hit list | |
262 my $models=0; | |
263 while ($line=<INFILE>) { | |
264 if ($line=~/^>(\S+)/) { | |
265 $hit++; | |
266 if ($Pthr || $Ethr || defined $picked{$hit}) { | |
267 # Found right alignment (hit) | |
268 if (defined $picked{$hit}) {$k=$picked{$hit};} else {$k=$hit;} | |
269 | |
270 if ($line=~/^>(.*?)\s+E=.*$/) { | |
271 $line=$1; # remove E=1.3E-30 etc. at the end | |
272 } else { | |
273 $line=~/^>(.*)/; | |
274 $line=$1; | |
275 } | |
276 my $nameline=$line; | |
277 my $evalue; | |
278 $line=<INFILE>; | |
279 if ($line=~/Probab\s*=\s*(\S+).*E-value\s*=\s*(\S+)/) {$score=$1; $evalue=$2} | |
280 else {$score=0; warn("WARNING: could not print score $line");} | |
281 if ($line=~/Aligned_cols=\s*(\S+)/) {;} else {warn("WARNING: could not find aligned_cols\n");} | |
282 if ($Pthr && $score<$Pthr) {last;} # Probability too low -> finished | |
283 if ($Ethr && $evalue>$Ethr) {last;} # Evalue too high > finished | |
284 | |
285 | |
286 # Commented out in CASP format | |
287 if ($formatting eq "LIVEBENCH") { | |
288 $printblock[$k] ="PFRMAT $outformat\n"; | |
289 $printblock[$k].="TARGET $qname\n"; | |
290 } | |
291 | |
292 $remarks[$k]="REMARK $k: $nameline\n"; | |
293 $remarks[$k].="REMARK $line"; | |
294 | |
295 &ReadAlignment(); | |
296 $qfirst = $qfirst[0]; | |
297 $qlast = $qlast[0]; | |
298 $aaq = $qseq[0]; | |
299 $tfirst = $tfirst[0]; | |
300 $aat = $tseq[0]; | |
301 $tname = $tname[0]; | |
302 | |
303 if ($v>=3) { | |
304 for (my $i=0; $i<@qfirst; $i++) { | |
305 printf("Q %-14.14s %s\n",$qname[$i],$qseq[$i]); | |
306 } | |
307 printf("\n"); | |
308 for (my $i=0; $i<@tfirst; $i++) { | |
309 printf("T %-14.14s %s\n",$tname[$i],$tseq[$i]); | |
310 } | |
311 printf("\n"); | |
312 } | |
313 | |
314 # Extract pdbcode and construct name of pdbfile and return in global variables $pdbid and $chain | |
315 if (&ExtractPdbcodeAndChain($tname[0])) {next;} | |
316 if ($chain eq "[A ]") {$pdbcode.="_A";} elsif ($chain eq ".") {;} else {$pdbcode.="_$chain";} | |
317 | |
318 # Read score (=probability) | |
319 $printblock[$k].="REMARK $nameline\n"; | |
320 $printblock[$k].="REMARK $line"; | |
321 $printblock[$k].="SCORE $score\n"; | |
322 $printblock[$k].="PARENT $pdbcode\n"; | |
323 $printblock[$k].="MODEL $k\n"; | |
324 | |
325 &WritePairwiseAlignments(); | |
326 $printblock[$k].="END\n"; | |
327 $models++; | |
328 | |
329 } | |
330 } | |
331 } | |
332 $k=$#printblock; # set $k to last index in @printblock | |
333 if ($k<0) { | |
334 $printblock[1]="PARENT NONE\nTER\n"; | |
335 $printblock[1].="END\n"; | |
336 if ($v>=1) {print("WARNING: no hits found for model!\n");} | |
337 } | |
338 close (INFILE); | |
339 | |
340 if ($v>=2) { | |
341 printf("$models models built\n"); | |
342 } | |
343 | |
344 | |
345 | |
346 # Write model file header | |
347 | |
348 #---+----1----+----2----+----3----+----4----+----5----+----6----+----7----+----8 | |
349 | |
350 # Print header | |
351 my $date = scalar(localtime); | |
352 if ($formatting eq "CASP") { | |
353 $printblock[0]="PFRMAT $outformat\n"; | |
354 $printblock[0].="TARGET $qname\n"; | |
355 } | |
356 $printblock[0].="REMARK AUTHOR $servername\n"; | |
357 $printblock[0].="REMARK $date\n"; | |
358 # $printblock[0].="REMARK J. Soeding \n"; | |
359 | |
360 # Add remarks | |
361 for ($k=0; $k<@remarks; $k++) { | |
362 if (defined $remarks[$k]) { | |
363 $printblock[0].=$remarks[$k]; | |
364 } | |
365 } | |
366 $printblock[0].="REMARK \n"; | |
367 | |
368 | |
369 # Print @printblock into outfile | |
370 | |
371 open (OUTFILE, ">$outfile") || die "Error in $program: Couldn't open $outfile: $!\n"; | |
372 foreach my $printstr (@printblock) { | |
373 my @printarr=split(/\n/,$printstr); | |
374 if ($outformat eq "TS") { | |
375 foreach $printstr (@printarr) { | |
376 printf(OUTFILE "%-80.80s\n",$printstr); | |
377 } | |
378 } else { | |
379 foreach $printstr (@printarr) { | |
380 printf(OUTFILE "%s\n",$printstr); | |
381 } | |
382 } | |
383 } | |
384 close (OUTFILE); | |
385 | |
386 if ($outformat eq "TS") { | |
387 # Call MaxSprout to generate sidechains | |
388 } | |
389 return; | |
390 } | |
391 | |
392 | |
393 ################################################################################## | |
394 # Construct multiple alignment in FASTA, A2M, or PIR format | |
395 ################################################################################## | |
396 sub MakeMultipleAlignment() | |
397 { | |
398 my @hitnames=(); # $hitnames[$k] is the nameline of the ihit'th hit | |
399 my @hitseqs=(); # $hitseqs[$k] contains the residues of the ihit'th hit | |
400 my @hitdiag=(); # $hitdiag[$k] = $qfirst[0]-$tfirst[0] | |
401 my @conjnames=(); # $hitnames[$k] is the nameline of the ihit'th conjugate hit | |
402 my @conjseqs=(); # $hitseqs[$k] contains the residues of the ihit'th conjugate hit | |
403 my @conjdiag=(); # $hitdiag[$k] = $qfirst[0]-$tfirst[0] for conjugate alignments | |
404 | |
405 my $new_hit; # residues of new hit | |
406 my $i; # residue index | |
407 my $j; # residue index | |
408 my $k; # sequence index | |
409 | |
410 $hitnames[0]=""; | |
411 $hitseqs[0]=""; | |
412 $hitdiag[0]=0; | |
413 $conjnames[0]=""; | |
414 $conjseqs[0]=""; | |
415 $conjdiag[0]=0; | |
416 | |
417 open (INFILE, "<$infile") || die "Error in $program: Couldn't open $infile: $!\n"; | |
418 $hit=0; # counts hits in hit list | |
419 | |
420 # Read one alignment after the other | |
421 while ($line=<INFILE>) { | |
422 | |
423 # Found new aligment | |
424 if ($line=~/^>(\S+)/) { | |
425 $hit++; | |
426 | |
427 # Is alignment selected by user? | |
428 if ($Pthr || $Ethr || defined $picked{$hit}) { | |
429 | |
430 if ($line=~/^>(\S+)(.*)/) {$tname=$1; $tnameline=$1.$2;} | |
431 else {die("\nError: bad format in $infile, line $.: code 1\n");} | |
432 | |
433 $line = <INFILE>; | |
434 if ($line=~/Probab\s*=\s*(\S+).*E-value\s*=\s*(\S+)/) { | |
435 if ($Pthr && $1<$Pthr) {last;} # Probability too low -> finished | |
436 if ($Ethr && $2>$Ethr) {last;} # Evalue too high > finished | |
437 } else { die("\nError: bad format in $infile, line $.: code 2\n"); } | |
438 | |
439 # Read next alignment with $aaq, $qfirst, @tseq, @first, and @tname | |
440 &ReadAlignment(); | |
441 chomp($tnameline); | |
442 if ($tnameline=~/\S+\s+(.*)/) {$tname[0].=" $1";} # template seed sequence gets its description | |
443 | |
444 # Format sequences into @hitseqs and @hitnames | |
445 &FormatSequences(\@hitnames,\@hitseqs,\@hitdiag,\@qname,\@qseq,\@qfirst,\@qlast,\$qlength,\@tname,\@tseq,\@tfirst,\@tlast,\$tlength); | |
446 | |
447 # Use conjugate alignments? | |
448 if ($conj>0) { | |
449 &FormatSequences(\@conjnames,\@conjseqs,\@conjdiag,\@tname,\@tseq,\@tfirst,\@tlast,\$tlength,\@qname,\@qseq,\@qfirst,\@qlast,\$qlength); | |
450 } | |
451 | |
452 } # end: if ($Pthr>0 || defined $picked{$hit}) | |
453 | |
454 } # end: if ($line=~/^>(\S+)/) # found new alignment | |
455 | |
456 } # end while | |
457 close (INFILE); | |
458 | |
459 # Insert full-length query sequence? | |
460 if ($qfile) { | |
461 $hitseqs[0]=""; | |
462 open (QFILE, "<$qfile") || die "Error in $program: Couldn't open $qfile: $!\n"; | |
463 while ($line=<QFILE>) { | |
464 if ($line=~/^>/ && $line!~/^>ss_/ && $line!~/^>sa_/ && $line!~/^>aa_/ && $line!~/^>Consensus/) {last;} | |
465 } | |
466 while ($line=<QFILE>) { | |
467 if ($line=~/^>/ || $line=~/^\#/) {last;} | |
468 $line=~tr/\n\.-//d; | |
469 $line=~tr/a-z/A-Z/; | |
470 $hitseqs[0].=$line; | |
471 } | |
472 close(QFILE); | |
473 if ($v>=2) {printf("\nQ(full) %-14.14s %s\n",$qname,$hitseqs[0]);} | |
474 } | |
475 | |
476 | |
477 # DEBUG | |
478 if ($v>=3) { | |
479 printf("\nQuery %-14.14s %s\n",$qname,$hitseqs[0]); | |
480 for ($k=1; $k<@hitnames; $k++) { | |
481 printf("T hit %3i %-14.14s %s\n",$k,$hitnames[$k],$hitseqs[$k]); | |
482 } | |
483 printf("\n"); | |
484 printf("\nQuery %-14.14s %s\n",$qname,$conjseqs[0]); | |
485 for ($k=1; $k<@conjnames; $k++) { | |
486 printf("T conj %3i %-14.14s %s\n",$k,$conjnames[$k],$conjseqs[$k]); | |
487 } | |
488 printf("\n"); | |
489 } | |
490 | |
491 | |
492 # Include conjugate sequences? | |
493 if ($conj>0) { | |
494 shift(@conjseqs); # delete zeroth ("query") sequence of @conjseqs | |
495 shift(@conjnames); # | |
496 shift(@conjdiag); # | |
497 | |
498 # Sort by diagonals $hitdiag[], $conjdiag[] | |
499 &Sort(\@hitdiag,\@hitseqs,\@hitnames); | |
500 &Sort(\@conjdiag,\@conjseqs,\@conjnames); | |
501 | |
502 # Append conjugate sequences to hitseqs | |
503 splice(@hitseqs,scalar(@hitseqs),0,@conjseqs); | |
504 splice(@hitnames,scalar(@hitnames),0,@conjnames); | |
505 | |
506 if ($v>=3) { | |
507 printf("\nQuery %-14.14s %s\n",$qname,$hitseqs[0]); | |
508 for ($k=1; $k<@hitnames; $k++) { | |
509 chomp($hitnames[$k]); | |
510 printf("T tot %3i %-14.14s %s\n",$k,$hitnames[$k],$hitseqs[$k]); | |
511 $hitnames[$k].="\n"; | |
512 } | |
513 } | |
514 } | |
515 | |
516 # Insert gaps: | |
517 | |
518 my @len_ins; # $len_ins[$j] will count the maximum number of inserted residues after match state $j. | |
519 my @inserts; # $inserts[$j] contains the insert (in small case) of sequence $k after the $j'th match state | |
520 my $insert; | |
521 my $ngap; | |
522 | |
523 # For each match state determine length of LONGEST insert after this match state and store in @len_ins | |
524 for ($k=0; $k<@hitnames; $k++) { | |
525 # split into list of single match states and variable-length inserts | |
526 # ([A-Z]|-) is the split pattern. The parenthesis indicate that split patterns are to be included as list elements | |
527 # The '#' symbol is prepended to get rid of a perl bug in split | |
528 $j=0; | |
529 @inserts = split(/([A-Z]|-)/,"#".$hitseqs[$k]."#"); | |
530 # printf("Sequence $k: $hitseqs[$k]\n"); | |
531 # printf("Sequence $k: @inserts\n"); | |
532 foreach $insert (@inserts) { | |
533 if( !defined $len_ins[$j] || length($insert)>$len_ins[$j]) { | |
534 $len_ins[$j]=length($insert); | |
535 } | |
536 $j++; | |
537 # printf("$insert|"); | |
538 } | |
539 # printf("\n"); | |
540 } | |
541 | |
542 | |
543 | |
544 # After each match state insert residues and fill up with gaps to $len_ins[$i] characters | |
545 for ($k=0; $k<@hitnames; $k++) { | |
546 # split into list of single match states and variable-length inserts | |
547 @inserts = split(/([A-Z]|-)/,"#".$hitseqs[$k]."#"); | |
548 $j=0; | |
549 | |
550 # append the missing number of gaps after each match state | |
551 foreach $insert (@inserts) { | |
552 if($outformat eq "FASTA") { | |
553 for ($i=length($insert); $i<$len_ins[$j]; $i++) {$insert.="-";} | |
554 } | |
555 else { | |
556 for ($i=length($insert); $i<$len_ins[$j]; $i++) {$insert.=".";} | |
557 } | |
558 $j++; | |
559 } | |
560 $hitseqs[$k] = join("",@inserts); | |
561 $hitseqs[$k] =~ tr/\#//d; # remove the '#' symbols inserted at the beginning and end | |
562 } | |
563 | |
564 # Remove columns at beginning and end with gaps in all sequences | |
565 my $remove_start; | |
566 my $remove_end; | |
567 my $len; | |
568 $hitseqs[0]=~/^(-*)/; | |
569 $remove_start=length($1); | |
570 $hitseqs[0]=~/(-*)$/; | |
571 $remove_end=length($1); | |
572 for ($k=0; $k<@hitnames; $k++) { | |
573 $hitseqs[$k]=~s/^.{$remove_start}(.*).{$remove_end}/$1/; | |
574 } | |
575 $len=($hitseqs[0]=~tr/a-zA-Z/a-zA-Z/); | |
576 | |
577 # Prepare name line of query | |
578 if ($outformat eq "PIR") { | |
579 my $qnametmp=$qname; | |
580 $qnametmp=~tr/:/;/; | |
581 $qnameline=~/^\S+\s*(.*)/; | |
582 my $qnamelinetmp=$1; | |
583 $qnamelinetmp=~tr/:/;/; | |
584 $hitnames[0] = sprintf(">P1;%s\nsequence:%s:%4i: :%4i: :%s: : 0.00: 0.00\n",$qnametmp,$qnametmp,$remove_start+1,$len+$remove_start,$qnamelinetmp); | |
585 } else { | |
586 # outformat is "FASTA" or "A2M" or "A3M" or ... | |
587 $hitnames[0] = ">$qnameline\n"; | |
588 } | |
589 | |
590 # If pretty diagonally sorted order is wanted... | |
591 if ($conj>0) { | |
592 if ($conj==2) { | |
593 my $center = 0.5*(scalar(@hitseqs)-1); | |
594 @conjseqs = splice(@hitseqs,$center+1,$center); | |
595 splice(@hitseqs,0,0,@conjseqs); | |
596 @hitseqs = reverse(@hitseqs); | |
597 | |
598 @conjnames = splice(@hitnames,$center+1,$center); | |
599 splice(@hitnames,0,0,@conjnames); | |
600 @hitnames = reverse(@hitnames); | |
601 # Shorten namelines of all but first sequence | |
602 my %count; | |
603 for ($k=0; $k<@hitnames; $k++) { | |
604 if ($k==$center) {$k++;} | |
605 $hitnames[$k]=~/(\S{1,14})/; | |
606 if (!defined $count{$1}) {$count{$1}=0;} | |
607 my $count = ++$count{$1}; | |
608 # printf("vorher: %s ",$hitnames[$k]); | |
609 $hitnames[$k]=~s/^(\S{1,14}).*/$1:$count/; | |
610 # printf("nachher: %s\n",$hitnames[$k]); | |
611 } | |
612 } else { | |
613 for ($k=0; $k<@hitnames; $k++) {$hitnames[$k]=">$qname\n";} | |
614 } | |
615 } | |
616 | |
617 # Remove gaps? Captialize? | |
618 if ($outformat eq "PIR") { | |
619 for ($k=0; $k<@hitnames; $k++) { | |
620 $hitseqs[$k].="*";; # Transform to upper case | |
621 $hitseqs[$k]=~tr/a-z./A-Z-/; # Transform to upper case | |
622 $hitseqs[$k]=~s/(.{1,$NUMRES})/$1\n/g; # insert newlines every NUMRES positions | |
623 } | |
624 } elsif ($outformat eq "FASTA") { | |
625 for ($k=0; $k<@hitnames; $k++) { | |
626 $hitseqs[$k]=~tr/a-z./A-Z-/; # Transform to upper case | |
627 $hitseqs[$k]=~s/(.{1,$NUMRES})/$1\n/g; # insert newlines every NUMRES positions | |
628 } | |
629 } elsif ($outformat eq "A2M") { | |
630 for ($k=0; $k<@hitnames; $k++) {$hitseqs[$k]=~s/(.{1,$NUMRES})/$1\n/g;} # insert newlines every NUMRES positions | |
631 } elsif ($outformat eq "A3M") { | |
632 for ($k=0; $k<@hitnames; $k++) {$hitseqs[$k]=~tr/.//d;$hitseqs[$k].="\n";} # Remove gaps aligned to inserts | |
633 } | |
634 | |
635 # Write sequences into output file | |
636 open (OUTFILE, ">$outfile") || die ("cannot open $outfile:$!"); | |
637 for ($k=0; $k<@hitnames; $k++) { | |
638 print(OUTFILE "$hitnames[$k]$hitseqs[$k]"); | |
639 } | |
640 close OUTFILE; | |
641 | |
642 | |
643 if ($v>=2) { | |
644 printf("%i sequences written to $outfile\n",scalar(@hitnames)); | |
645 } | |
646 } | |
647 | |
648 | |
649 # Format sequences into @hitseqs and @hitnames | |
650 # & Call with FormatSequences(\@hitnames,\@hitseqs,\@qname,\@qseq,\@qfirst,\@qlast,\$qlength,\@tname,\@tseq,\@tfirst,\@tlast,\$tlength); | |
651 sub FormatSequences() | |
652 { | |
653 my $p_hitnames = $_[0]; # typeglob to $hitname | |
654 my $p_hitseqs = $_[1]; # ... | |
655 my $p_hitdiag = $_[2]; # ... | |
656 | |
657 my $p_qname = $_[3]; # | |
658 my $p_qseq = $_[4]; # | |
659 my $p_qfirst = $_[5]; # | |
660 my $p_qlast = $_[6]; # | |
661 my $p_qlength = $_[7]; # | |
662 | |
663 my $p_tname = $_[8]; # | |
664 my $p_tseq = $_[9]; # | |
665 my $p_tfirst = $_[10]; # | |
666 my $p_tlast = $_[11]; # | |
667 my $p_tlength = $_[12]; # | |
668 my $i; | |
669 | |
670 if ($v>=2) { | |
671 if (defined $picked{$hit}) { | |
672 print("hit=$hit picked=$picked{$hit} tname=$tname[0]"); | |
673 } else { | |
674 print("hit=$hit picked=evalue<$Ethr tname=$tname[0]"); | |
675 } | |
676 for (my $i=1; $i<@{$p_tname}; $i++) { | |
677 print(", $tname[$i]"); | |
678 } | |
679 print("\n"); | |
680 } | |
681 | |
682 | |
683 my $qfirst = ${$p_qfirst}[0]; | |
684 my $qlast = ${$p_qlast}[0]; | |
685 my $qlength = ${$p_qlength}; | |
686 my $aaq = ${$p_qseq}[0]; | |
687 | |
688 @aaq = unpack("C*",$aaq); # needed for transforming template sequences into a3m based on query residues (NOT HMM match states!) | |
689 $aaq=~tr/.-//d; # remove all gaps from query sequence | |
690 | |
691 # For all template sequences in the present alignment | |
692 for (my $k=0; $k<@{$p_tname}; $k++) { | |
693 | |
694 $tname =${$p_tname}[$k]; | |
695 $tfirst=${$p_tfirst}[$k]; | |
696 $aat =${$p_tseq}[$k]; | |
697 | |
698 # Transform template residues into a3m format: | |
699 # match states are those where query has residue (NOT where HMM has match state!) | |
700 # This makes sense since we want to build a model for the query sequence. | |
701 @aat = unpack("C*",$aat); | |
702 $aat=""; | |
703 | |
704 # Transform all columns with residue in query into match/delete states, all others to inserts | |
705 for ($i=0; $i<scalar(@aaq); $i++) { | |
706 if ($aaq[$i]!=45 && $aaq[$i]!=46) { # no gap in query | |
707 if($aat[$i]==46) { | |
708 $aat.="-"; # transform '.' to '-' if aligned with a query residue | |
709 } else { | |
710 $aat .= uc(chr($aat[$i])); # UPPER case if aligned with a query residue (match state) | |
711 } | |
712 } else { | |
713 if($aat[$i]!=45 && $aat[$i]!=46) { # no gap in template? | |
714 $aat.=lc(chr($aat[$i])); # lower case if aligned with a gap in the query (insert state) | |
715 } | |
716 } | |
717 } | |
718 | |
719 if ($v>=2) { | |
720 printf("\nQ %-14.14s %s\n",$qname,$aaq); | |
721 printf("T %-14.14s %s\n",$tname,$aat); | |
722 } | |
723 | |
724 # Outformat is PIR? => read residues and indices from PDB ATOM records | |
725 if ($outformat eq "PIR") { | |
726 | |
727 # Extract pdbcode and construct name of pdbfile and return in global variables $pdbid and $chain | |
728 if (&ExtractPdbcodeAndChain($tname)) {next;} | |
729 | |
730 # Read sequence from pdb file | |
731 if (!open (PDBFILE, "$pdbfile")) { | |
732 die ("Error in $program: Couldn't open $pdbfile: $!\n"); | |
733 } | |
734 $aapdb=""; | |
735 $l=0; | |
736 my @nres; # $nres[$l] = pdb residue index for residue $aapdb[$l] | |
737 my $nres=-1e6; | |
738 my $resolution=-1.00; | |
739 my $rvalue=-1.00; | |
740 while ($line=<PDBFILE>) { | |
741 if ($line=~/^REMARK.*RESOLUTION\.\s+(\d+\.?\d*)/) {$resolution=$1;} | |
742 if ($line=~/^REMARK.*R VALUE\s+\(WORKING SET\)\s+:\s+(\d+\.?\d*)/) {$rvalue=$1;} | |
743 if ($line=~/^ENDMDL/) {last;} # if file contains NMR models read only first one | |
744 if (($line=~/^ATOM\s+\d+ .. [ A](\w{3}) $chain\s*(-?\d+.)/ || | |
745 ($line=~/^HETATM\s+\d+ .. [ A](\w{3}) $chain\s*(-?\d+.)/ && &Three2OneLetter($1) ne "X") ) && | |
746 $2 ne $nres ) { | |
747 $res=$1; | |
748 $nres=$2; | |
749 $nres[$l]=$2; | |
750 $res=&Three2OneLetter($res); | |
751 $aapdb[$l++]=$res; | |
752 $aapdb.=$res; | |
753 } | |
754 } | |
755 close (PDBFILE); | |
756 if (length($aapdb)<=0) {die("Error: chain $chain not found in pdb file $pdbfile\n");} | |
757 | |
758 # Align template in hh-alignment ($aat) with template sequence in pdb ($aapdb) | |
759 my $xseq=$aat; | |
760 $xseq=~tr/-/~/; # transform Deletes to '~' to distinguish them from gaps '-' inserted by Align.pm | |
761 my $yseq=$aapdb; | |
762 my ($jmin,$jmax,$lmin,$lmax); | |
763 my $Sstr; | |
764 my $score; | |
765 # The aligned characters are returend in $j2[$col2] and $l2[$col2] | |
766 $score=&AlignNW(\$xseq,\$yseq,\@j2,\@l2,\$jmin,\$jmax,\$lmin,\$lmax,\$Sstr); | |
767 | |
768 # DEBUG | |
769 if ($v>=3) { | |
770 printf("Template (hh) $xseq\n"); | |
771 printf("Identities $Sstr\n"); | |
772 printf("Template (pdb) $yseq\n"); | |
773 printf("\n"); | |
774 if ($v>=4) { | |
775 for ($col2=0; $col2<@l2 && $col2<1000; $col2++) { | |
776 printf("%3i %3i:%s %3i:%s -> %i\n",$col2,$j2[$col2],substr($aat,$j2[$col2]-1,1),$l2[$col2],substr($aapdb,$l2[$col2]-1,1),$nres[$l2[$col2]-1]); | |
777 } | |
778 } | |
779 } | |
780 | |
781 # check for reasonable alignment | |
782 my $num_match = 0; | |
783 for ($i=0; $i<@l2; $i++) { | |
784 if ($j2[$i] > 0 && $l2[$i] > 0) { | |
785 $num_match++; | |
786 } | |
787 } | |
788 if (($score/$num_match) < 1) { | |
789 print "WARNING! Match score with PDBfile (score: $score num: $num_match score/num:".($score/$num_match).") to low => $pdbfile not included!\n"; | |
790 next; | |
791 } | |
792 | |
793 # Assign a3m-formatted amino acid sequence from pdb file to $aapdb | |
794 $aapdb=""; | |
795 my @xseq=unpack("C*",$xseq); | |
796 my @yseq=unpack("C*",$yseq); | |
797 for ($i=0; $i<@yseq; $i++) { | |
798 if(($xseq[$i]>=65 && $xseq[$i]<=90) || $xseq[$i]==ord('~')) { # if $aat has upper case residue or Delete state | |
799 # Match state | |
800 $aapdb.=uc(chr($yseq[$i])); | |
801 } else { | |
802 # Insert state | |
803 if ($yseq[$i]!=45) {$aapdb.=lc(chr($yseq[$i]));} # add only if not a gap '-' | |
804 } | |
805 } | |
806 | |
807 # Remove overlapping ends of $aapdb | |
808 $aapdb=~s/^[a-z]*(.*?)[a-z]*$/$1/; | |
809 | |
810 # Glue gaps at beginning and end of aligned pdb sequence and add sequence to alignment | |
811 push (@{$p_hitseqs}, ("-" x ($qfirst-1)).$aapdb.("-" x ($qlength-$qlast)) ); # use ATOM record residues $aapdb! | |
812 | |
813 # Write hitname in PIR format into @hitnames | |
814 my $descr; | |
815 my $organism; | |
816 my $struc=$pdbcode; | |
817 if ($tnameline=~/^(\S+)\s+(.*)/) {$descr=$2; $descr=~tr/://d;} else {$descr=" ";} | |
818 if ($tnameline=~/^(\S+)\s+.*\s+\{(.*)\}/) {$organism=$2;} else {$organism=" ";} | |
819 if (length($chain)>1 || $chain eq ".") { # MODELLER's special symbol for 'chain unspecified' | |
820 $chain="."; | |
821 } elsif ($addchain && $chain ne " ") { | |
822 $struc.="_$chain"; | |
823 } | |
824 # push (@{$p_hitnames}, sprintf(">P1;%s\nstructureX:%4s:%4i:%1s:%4i:%1s:%s:%s:%-.2f:%-.2f\n",$struc,$struc,$nres[$lmin-1],$chain,$nres[$lmax-1],$chain,$descr,$organism,$resolution,$rvalue) ); | |
825 | |
826 my $descrtmp=$descr; | |
827 $descrtmp=~tr/:/;/; | |
828 $organism=~tr/://d; | |
829 push (@{$p_hitnames}, sprintf(">P1;%s\nstructureX:%4s: :%1s: :%1s:%s:%s:%-.2f:%-.2f\n",$struc,$struc,$chain,$chain,$descrtmp,$organism,$resolution,$rvalue) ); | |
830 push (@{$p_hitdiag}, $tfirst-$qfirst); | |
831 } else { | |
832 # outformat is "FASTA" or "A2M" or "A3M" or ... | |
833 # Write hitname in FASTA format into @hitnames | |
834 push (@{$p_hitseqs}, ("-" x ($qfirst-1)).$aat.("-" x ($qlength-$qlast)) ); | |
835 push (@{$p_hitnames}, ">$tname\n" ); | |
836 push (@{$p_hitdiag}, $tfirst-$qfirst); | |
837 } | |
838 | |
839 if ($onlyfirst>0) {last;} # extract only first (seed?) sequence in each alignment | |
840 | |
841 } # end: for (my $k=0; $k<@{$tname}; $k++) | |
842 | |
843 # Paste aligned subsequence of query over $hitseqs[0] | |
844 if (${$p_hitseqs}[0] eq "") {${$p_hitseqs}[0] = "-" x $qlength;} | |
845 if (!$qfile) {substr(${$p_hitseqs}[0],$qfirst-1,length($aaq),$aaq);} | |
846 | |
847 return; | |
848 } | |
849 | |
850 | |
851 ################################################################################## | |
852 # Read Alignment from infile (*.hhr file) | |
853 # Results: | |
854 # $aaq: query residues in present alignment | |
855 # $qfirst: index of first query residue in present alignment | |
856 # @tname: template names in present alignmen | |
857 # @tfirst: indices of first residues in present alignmet | |
858 # @tseq: sequences of templates in present alignment | |
859 ################################################################################## | |
860 sub ReadAlignment() { | |
861 | |
862 @qname=(); # name of $it'th query in this alignment | |
863 @qfirst=(); # index of first residue in $it'th query in this alignment | |
864 @qlast=(); # index of last residue in $it'th query in this alignment | |
865 @qseq=(); # residues of $it'th query in this alignment | |
866 | |
867 @tname=(); # name of $it'th template in this alignment | |
868 @tfirst=(); # index of first residue in $it'th template in this alignment | |
869 @tlast=(); # index of last residue in $it'th template in this alignment | |
870 @tseq=(); # residues of $it'th template in this alignment | |
871 | |
872 if ($v>=3) {printf("Searching for Q $qname vs T $tname\n");} | |
873 $line=<INFILE>; | |
874 | |
875 # Search for first line beginning with Q ot T and not followed by aa_, ss_pred, ss_conf, or Consensus | |
876 while (1) { | |
877 my $i; # index for query sequence in this alignment | |
878 # Scan up to first line starting with Q; stop when line 'No\s+\d+' or 'Done' is found | |
879 while (defined $line && $line!~/^Q\s(\S+)/) { | |
880 if ($line=~/^No\s+\d/ || $line=~/^Done/) {last;} | |
881 $line=<INFILE>; next; | |
882 } | |
883 if (!defined $line || $line=~/^No\s+\d/ || $line=~/^Done/) {last;} | |
884 | |
885 # Scan up to first line that is not secondary structure line or consensus line | |
886 while (defined $line && $line=~/^Q\s+(ss_|sa_|aa_|Consens|Cons-)/) {$line=<INFILE>;} | |
887 | |
888 # Read next block of query sequences | |
889 $i=0; | |
890 while ($line=~/^Q\s+/) { | |
891 if ($line!~/^Q\s+(ss_|sa_|aa_|Consens|Cons-)/ && $line=~/^Q\s*(\S+)\s+(\d+)\s+(\S+)\s+(\d+)\s+\((\d+)/) { | |
892 $qname[$i]=$1; | |
893 if (!$qfirst[$i]) {$qfirst[$i]=$2;} # if $qfirst is undefined then this is the first alignment block -> set $qfirst to $1 | |
894 if (!$qseq[$i]) {$qseq[$i]=$3;} else {$qseq[$i].=$3;} | |
895 $qlast[$i]=$4; | |
896 if ($i==0) {$qlength=$5} | |
897 $i++; | |
898 } | |
899 $line=<INFILE>; | |
900 } | |
901 if ($i==0) { | |
902 die("\nError in $program: bad format in $infile, line $.: query block\n"); | |
903 } | |
904 | |
905 # Scan up to first line starting with T | |
906 while (defined $line && $line!~/^T\s+(\S+)/) {$line=<INFILE>;} | |
907 | |
908 # Scan up to first line that is not secondary structure line or consensus line | |
909 while (defined $line && $line=~/^T\s+(ss_|sa_|aa_|Consens|Cons-)/) {$line=<INFILE>;} | |
910 | |
911 # Read next block of template sequences | |
912 $i=0; | |
913 while ($line=~/^T\s+/) { | |
914 if ($line!~/^T\s+(ss_|sa_|aa_|Consens|Cons-)/ && $line=~/^T\s*(\S+)\s+(\d+)\s+(\S+)\s+(\d+)\s+\((\d+)/){ | |
915 $tname[$i]=$1; | |
916 if (!$tfirst[$i]) {$tfirst[$i]=$2;} # if $tfirst is undefined then this is the first alignment block -> set $tfirst to $1 | |
917 if (!$tseq[$i]) {$tseq[$i]=$3;} else {$tseq[$i].=$3;} | |
918 $tlast[$i]=$4; | |
919 if ($i==0) {$tlength=$5} | |
920 $i++; | |
921 } | |
922 $line=<INFILE>; | |
923 } | |
924 if ($i==0) { | |
925 die("\nError in $program: bad format in $infile, line $.: template block\n"); | |
926 } | |
927 | |
928 } # end while ($line=<INFILE>) | |
929 | |
930 # if (!$qfirst) {$qfirst=1;} # if still $qfirst==0 then set $qfirst to 1 | |
931 # for (my $i=0; $i<@tfirst; $i++) { | |
932 # if (!$tfirst[$i]) {$tfirst[$i]=1;} # if still $tfirst[$i]==0 then set $tfirst to 1 | |
933 # } | |
934 | |
935 # Check lengths | |
936 if (length($qseq[0])!=length($tseq[0])) { | |
937 print("\nError: query and template lines do not have the same length in $infile, line $.\n"); | |
938 for (my $i=0; $i<@qfirst; $i++) { | |
939 printf("Q %-14.14s %s\n",$qname[$i],$qseq[$i]); | |
940 } | |
941 printf("\n"); | |
942 for (my $i=0; $i<@tfirst; $i++) { | |
943 printf("T %-14.14s %s\n",$tname[$i],$tseq[$i]); | |
944 } | |
945 printf("\n"); | |
946 exit 1; | |
947 } | |
948 | |
949 if ($v>=3) { | |
950 for (my $i=0; $i<@qfirst; $i++) { | |
951 printf("Q %-14.14s %s\n",$qname[$i],$qseq[$i]); | |
952 } | |
953 printf("\n"); | |
954 for (my $i=0; $i<@tfirst; $i++) { | |
955 printf("T %-14.14s %s\n",$tname[$i],$tseq[$i]); | |
956 } | |
957 printf("\n"); | |
958 } | |
959 return; | |
960 } | |
961 | |
962 | |
963 ################################################################################## | |
964 # Write Alignment to $printblock[$k] | |
965 ################################################################################## | |
966 sub WritePairwiseAlignments() { | |
967 | |
968 #Delete columns with gaps in both sequences | |
969 $aaq=uc($aaq); | |
970 $aat=uc($aat); | |
971 @aaq=split(//,$aaq); | |
972 @aat=split(//,$aat); | |
973 my $col=0; | |
974 for ($col1=0; $col1<@aaq; $col1++) { | |
975 if ($aaq[$col1]=~tr/a-zA-Z/a-zA-Z/ || $aat[$col1]=~tr/a-zA-Z/a-zA-Z/) { | |
976 $aaq[$col]=$aaq[$col1]; | |
977 $aat[$col]=$aat[$col1]; | |
978 $col++; | |
979 } | |
980 } | |
981 splice(@aaq,$col); # delete end of @aaq; | |
982 splice(@aat,$col); | |
983 $aaq=join("",@aaq); | |
984 $aat=join("",@aat); | |
985 | |
986 | |
987 # Count query and template residues into @i1 and @j1 | |
988 for ($col1=0; $col1<@aaq; $col1++) { | |
989 if ($aaq[$col1]=~tr/a-zA-Z/a-zA-Z/) { | |
990 $i1[$col1]=$qfirst++; #found query residue in $col1 | |
991 } else { | |
992 $i1[$col1]=0; #found gap in $col1 | |
993 } | |
994 if ($aat[$col1]=~tr/a-zA-Z/a-zA-Z/) { | |
995 $j1[$col1]=$tfirst++; #found template residue in $col1 | |
996 } else { | |
997 $j1[$col1]=0; #found gap in $col1 | |
998 } | |
999 } | |
1000 | |
1001 | |
1002 # DEBUG | |
1003 if ($v>=3) { | |
1004 printf ("col Q i1 T j1\n"); | |
1005 for ($col1=0; $col1<@aaq; $col1++) { | |
1006 printf ("%3i %s %3i %s %3i\n",$col1,$aaq[$col1],$i1[$col1],$aat[$col1],$j1[$col1]); | |
1007 } | |
1008 printf ("\n"); | |
1009 } | |
1010 | |
1011 | |
1012 # Read protein chain from pdb file | |
1013 # ----+----1----+----2----+----3----+----4----+----5----+----6----+----7----+----8 | |
1014 # ATOM 1 N SER A 27 38.637 79.034 59.693 1.00 79.70 # ATOM 2083 CD1 LEU A 22S 15.343 -12.020 43.761 1.00 5.00 C | |
1015 | |
1016 # Extract pdbcode and construct name of pdbfile and return in global variables $pdbid and $chain | |
1017 if (&ExtractPdbcodeAndChain($tname)) {next;} | |
1018 | |
1019 # Read sequence from pdb file | |
1020 if (! defined $pdbfile) {die ("Error in $program: Couldn't find pdb code in $tname\n");} | |
1021 open (PDBFILE, "$pdbfile") || die ("Error in $program: Couldn't open $pdbfile: $!\n"); | |
1022 if ($chain eq "[A ]") {$pdbcode.="_A";} elsif ($chain eq ".") {;} else {$pdbcode.="_$chain";} | |
1023 $aapdb=""; $l=1; | |
1024 $line=<PDBFILE>; | |
1025 while ($line) {if ($line=~/^ATOM/) {last;} $line=<PDBFILE>;} # advance to ATOM records | |
1026 my @nres; # $nres[$l] = pdb residue index for residue $aapdb[$l] | |
1027 my @coord; # $coord[$l] = coordinates of CA atom of residue $aapdb[$l] | |
1028 while ($line) { | |
1029 if ($line=~/^ATOM\s+\d+ CA [ AB](\w{3}) $chain\s*(-?\d+.) (\s*\S+\s+\S+\s+\S+)/ || | |
1030 ($line=~/^HETATM\s+\d+ CA [ AB](\w{3}) $chain\s*(-?\d+.) (\s*\S+\s+\S+\s+\S+)/ && &Three2OneLetter($1) ne "X") ) { | |
1031 $res=$1; | |
1032 $nres[$l]=$2; | |
1033 $coord[$l]=$3." 1.00"; | |
1034 $res=&Three2OneLetter($res); | |
1035 $aapdb[$l]=$res; | |
1036 $aapdb.=$res; | |
1037 $l++; | |
1038 } | |
1039 elsif ($l>10 && $line=~/^ATOM\s+\d+ CA/) {last;} | |
1040 elsif ($line=~/^ENDMDL/) {last;} # if file contains NMR models read only first one | |
1041 $line=<PDBFILE>; | |
1042 } | |
1043 close (PDBFILE); | |
1044 | |
1045 # Align template in hh-alignment ($aat) with template sequence in pdb ($aapdb) | |
1046 | |
1047 my $xseq=$aat; | |
1048 my $yseq=$aapdb; | |
1049 my ($jmin,$jmax,$lmin,$lmax); | |
1050 my $Sstr; | |
1051 my $score; | |
1052 $xseq=~tr/-/~/d; # transform Deletes to '~' to distinguish them from gaps inserted by Align.pm | |
1053 #the aligned characters are returend in $j2[$col2] and $l2[$col2] | |
1054 | |
1055 if ($v>=3) { | |
1056 printf("Template (hh) $xseq\n"); | |
1057 printf("Identities $Sstr\n"); | |
1058 printf("Template (pdb) $yseq\n"); | |
1059 printf("\n"); | |
1060 } | |
1061 | |
1062 $score=&AlignNW(\$xseq,\$yseq,\@j2,\@l2,\$jmin,\$jmax,\$lmin,\$lmax,\$Sstr); | |
1063 | |
1064 # DEBUG | |
1065 if ($v>=3) { | |
1066 printf("Template (hh) $xseq\n"); | |
1067 printf("Identities $Sstr\n"); | |
1068 printf("Template (pdb) $yseq\n"); | |
1069 printf("\n"); | |
1070 if ($v>=4) { | |
1071 for ($col2=0; $col2<@l2 && $col2<200; $col2++) { | |
1072 printf("%3i %3i %3i\n",$col2,$j2[$col2],$l2[$col2]); | |
1073 } | |
1074 } | |
1075 } | |
1076 | |
1077 # DEBUG | |
1078 | |
1079 # Construct alignment of $aaq <-> $aapdb via alignments $aaq <-> $aat and $aat <-> $aapdb: | |
1080 # Find $l1[$col1] = line of pdb file corresponding to residue $aat[$col1] and $aaq[$col1] | |
1081 $col2=0; | |
1082 for ($col1=0; $col1<@aaq; $col1++) { | |
1083 if ($j1[$col1]==0 || $i1[$col1]==0) {$l1[$col1]=0; next;} # skip gaps in query and gaps in template | |
1084 while ($j2[$col2]<$col1+1) {$col2++;} # in $j2[col2] first index is 1, in $col1 first column is 0 | |
1085 $l1[$col1] = $l2[$col2]; | |
1086 if ($v>=4) {printf("l1[%i]=%i l2[%i]=%i\n",$col1,$l1[$col1],$col2,$l2[$col2]);} | |
1087 } | |
1088 | |
1089 | |
1090 if ($pdbcode ne "NONE") { | |
1091 if ($outformat eq "TS") { | |
1092 for ($col1=0; $col1<@aat; $col1++) { | |
1093 if ($i1[$col1]==0) {next;} # skip gaps in query | |
1094 if ($j1[$col1]==0) {next;} # skip gaps in template sequence | |
1095 if ($l1[$col1]==0) {next;} # skip if corresponding residue was skipped in pdb file | |
1096 | |
1097 $printblock[$k].=sprintf("ATOM %5i CA %3s %4i %-50.50s\n",$i1[$col1],&One2ThreeLetter($aaq[$col1]),$i1[$col1]+$shift,$coord[$l1[$col1]]); | |
1098 if ($v>=4) { | |
1099 printf("ATOM %5i CA %3s %4i %-50.50s\n",$i1[$col1],&One2ThreeLetter($aaq[$col1]),$i1[$col1]+$shift,$coord[$l1[$col1]]); | |
1100 } | |
1101 } | |
1102 } else { | |
1103 for ($col1=0; $col1<@aat; $col1++) { | |
1104 if ($i1[$col1]==0) {next;} # skip gaps in query | |
1105 if ($j1[$col1]==0) {next;} # skip gaps in template sequence | |
1106 if ($l1[$col1]==0) {next;} # skip if corresponding residue was skipped in pdb file | |
1107 $printblock[$k].=sprintf("%1s %3i %1s %s\n",$aaq[$col1],$i1[$col1],$aat[$col1],$nres[$l1[$col1]]); | |
1108 if ($v>=4) {printf("%1s %3i %1s %s\n",$aaq[$col1],$i1[$col1],$aat[$col1],$nres[$l1[$col1]]);} | |
1109 } | |
1110 } | |
1111 } | |
1112 $printblock[$k].=sprintf("TER\n"); | |
1113 return; | |
1114 } | |
1115 | |
1116 | |
1117 | |
1118 # Extract pdbcode and construct name of pdbfile and return in global variables $pdbid and $chain | |
1119 sub ExtractPdbcodeAndChain() | |
1120 { | |
1121 my $name=$_[0]; | |
1122 $name=~/^(\S+)/; | |
1123 $name=$1; | |
1124 | |
1125 # SCOP ID? (d3lkfa_,d3grs_3,d3pmga1,g1m26.1) | |
1126 if ($name=~/^[defgh](\d[a-z0-9]{3})([a-z0-9_.])[a-z0-9_]$/) { | |
1127 $pdbcode=$1; | |
1128 if ($2 eq "_") {$chain="[A ]";} else {$chain=uc($2);} | |
1129 } | |
1130 | |
1131 # PDB ID? (8fab, 1a0i) | |
1132 elsif ($name=~/^(\d[a-z0-9]{3})$/) { | |
1133 $pdbcode=$1; | |
1134 $chain="[A ]"; | |
1135 } | |
1136 | |
1137 # PDB ID? (8fab_A) | |
1138 elsif ($name=~/^(\d[a-z0-9]{3})_(\S)$/) { | |
1139 $pdbcode=$1; | |
1140 $chain=$2; | |
1141 } | |
1142 | |
1143 # PDB ID? (1u1z_ABC) | |
1144 elsif ($name=~/^(\d[a-z0-9]{3})_(\S\S+)$/) { | |
1145 $pdbcode=$1; | |
1146 $chain="[$2]"; | |
1147 } | |
1148 | |
1149 # DALI ID? (8fabA_0,1a0i_2) | |
1150 elsif ($name=~/^(\d[a-z0-9]{3})([A-Za-z0-9]?)_\d+$/) { | |
1151 $pdbcode=$1; | |
1152 $chain=$2; | |
1153 } | |
1154 | |
1155 else { | |
1156 $pdbcode=$name; | |
1157 $chain="A"; | |
1158 # return 1; # no SCOP/DALI/pdb sequence | |
1159 } | |
1160 | |
1161 &FindPDBfile($pdbcode); | |
1162 | |
1163 if ($pdbfile eq "") { | |
1164 if ($v>=2) {print("Warning: no pdb file found for sequence name '$name'\n");} | |
1165 return 1; | |
1166 } | |
1167 return 0; | |
1168 } | |
1169 | |
1170 | |
1171 # Resort arrays according to sorting array0: | |
1172 # Resort(\@array0,\@array1,...,\@arrayN) | |
1173 sub Sort() | |
1174 { | |
1175 my $p_array0 = $_[0]; | |
1176 my @index=(); | |
1177 for (my $i=0; $i<@{$p_array0}; $i++) {$index[$i]=$i;} | |
1178 @index = sort { ${$p_array0}[$a] <=> ${$p_array0}[$b] } @index; | |
1179 foreach my $p_array (@_) { | |
1180 my @dummy = @{$p_array}; | |
1181 @{$p_array}=(); | |
1182 foreach my $i (@index) { | |
1183 push(@{$p_array}, $dummy[$i]); | |
1184 } | |
1185 } | |
1186 } | |
1187 | |
1188 ################################################################################## | |
1189 # Convert three-letter amino acid code into one-letter code | |
1190 ################################################################################## | |
1191 sub Three2OneLetter { | |
1192 my $res=uc($_[0]); | |
1193 if ($res eq "GLY") {return "G";} | |
1194 elsif ($res eq "ALA") {return "A";} | |
1195 elsif ($res eq "VAL") {return "V";} | |
1196 elsif ($res eq "LEU") {return "L";} | |
1197 elsif ($res eq "ILE") {return "I";} | |
1198 elsif ($res eq "MET") {return "M";} | |
1199 elsif ($res eq "PHE") {return "F";} | |
1200 elsif ($res eq "TYR") {return "Y";} | |
1201 elsif ($res eq "TRP") {return "W";} | |
1202 elsif ($res eq "ASN") {return "N";} | |
1203 elsif ($res eq "ASP") {return "D";} | |
1204 elsif ($res eq "GLN") {return "Q";} | |
1205 elsif ($res eq "GLU") {return "E";} | |
1206 elsif ($res eq "CYS") {return "C";} | |
1207 elsif ($res eq "PRO") {return "P";} | |
1208 elsif ($res eq "SER") {return "S";} | |
1209 elsif ($res eq "THR") {return "T";} | |
1210 elsif ($res eq "LYS") {return "K";} | |
1211 elsif ($res eq "HIS") {return "H";} | |
1212 elsif ($res eq "ARG") {return "R";} | |
1213 | |
1214 # The HETATM selenomethionine is read by MODELLER like a normal MET in both its HETATM_IO=off and on mode! | |
1215 elsif ($res eq "MSE") {return "M";} # SELENOMETHIONINE | |
1216 elsif ($res eq "ASX") {return "B";} | |
1217 elsif ($res eq "GLX") {return "Z";} | |
1218 else {return "X";} | |
1219 | |
1220 # The following post-translationally modified residues are ignored by MODELLER in its default SET HETATM_IO=off mode | |
1221 # elsif ($res eq "SEC") {return "C";} # SELENOCYSTEINE | |
1222 # elsif ($res eq "SEP") {return "S";} # PHOSPHOSERINE | |
1223 # elsif ($res eq "TPO") {return "T";} # PHOSPHOTHREONINE | |
1224 # elsif ($res eq "TYS") {return "Y";} # SULFONATED TYROSINE | |
1225 # elsif ($res eq "KCX") {return "K";} # LYSINE NZ-CARBOXYLIC ACID | |
1226 } | |
1227 | |
1228 ################################################################################## | |
1229 # Convert one-letter amino acid code into three-letter code | |
1230 ################################################################################## | |
1231 sub One2ThreeLetter { | |
1232 my $res=uc($_[0]); | |
1233 if ($res eq "G") {return "GLY";} | |
1234 elsif ($res eq "A") {return "ALA";} | |
1235 elsif ($res eq "V") {return "VAL";} | |
1236 elsif ($res eq "L") {return "LEU";} | |
1237 elsif ($res eq "I") {return "ILE";} | |
1238 elsif ($res eq "M") {return "MET";} | |
1239 elsif ($res eq "F") {return "PHE";} | |
1240 elsif ($res eq "Y") {return "TYR";} | |
1241 elsif ($res eq "W") {return "TRP";} | |
1242 elsif ($res eq "N") {return "ASN";} | |
1243 elsif ($res eq "D") {return "ASP";} | |
1244 elsif ($res eq "Q") {return "GLN";} | |
1245 elsif ($res eq "E") {return "GLU";} | |
1246 elsif ($res eq "C") {return "CYS";} | |
1247 elsif ($res eq "P") {return "PRO";} | |
1248 elsif ($res eq "S") {return "SER";} | |
1249 elsif ($res eq "T") {return "THR";} | |
1250 elsif ($res eq "K") {return "LYS";} | |
1251 elsif ($res eq "H") {return "HIS";} | |
1252 elsif ($res eq "R") {return "ARG";} | |
1253 elsif ($res eq "U") {return "SEC";} | |
1254 elsif ($res eq "B") {return "ASX";} | |
1255 elsif ($res eq "Z") {return "GLX";} | |
1256 else {return "UNK";} | |
1257 } | |
1258 | |
1259 # Find the pdb file with $pdbcode in pdb directory | |
1260 sub FindPDBfile() { | |
1261 | |
1262 my $pdbcode=lc($_[0]); | |
1263 foreach $pdbdir (@pdbdirs) { | |
1264 if (! -e "$pdbdir") {warn("Warning in $program: pdb directory '$pdbdir' does not exist!\n"); next;} | |
1265 if (-e "$pdbdir/all") {$pdbfile="$pdbdir/all/";} | |
1266 elsif (-e "$pdbdir/divided") {$pdbfile="$pdbdir/divided/".substr($pdbcode,1,2)."/";} | |
1267 else {$pdbfile="$pdbdir/";} | |
1268 if ($pdbdir=~/divided.?$/) {$pdbfile.=substr($pdbcode,1,2)."/";} | |
1269 if (-e $pdbfile."pdb$pdbcode.ent") {$pdbfile.="pdb$pdbcode.ent";} | |
1270 elsif (-e $pdbfile."pdb$pdbcode.ent.gz") {$pdbfile="gunzip -c $pdbfile"."pdb$pdbcode.ent.gz |";} | |
1271 elsif (-e $pdbfile."pdb$pdbcode.ent.Z") {$pdbfile="gunzip -c $pdbfile"."pdb$pdbcode.ent.Z |";} | |
1272 elsif (-e $pdbfile."$pdbcode.pdb") {$pdbfile.="$pdbcode.pdb";} | |
1273 else {next;} | |
1274 return $pdbfile; | |
1275 } | |
1276 printf(STDERR "Warning in $program: Cannot find pdb file $pdbfile"."pdb$pdbcode.ent!\n"); | |
1277 return ""; | |
1278 } |