annotate external_tools/darwin/lib/hh/scripts/addss.pl @ 6:2277dd59b9f9 draft

Uploaded
author hammock
date Wed, 01 Nov 2017 05:54:28 -0400
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
6
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
1 #!/usr/bin/env perl
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
2 #
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
3 # addss.pl
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
4 # Add PSIPRED secondary structure prediction (and DSSP annotation) to an MSA or HMMER file.
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
5 # Output format is A3M (for input alignments) or HMMER (see User Guide).
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
6
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
7
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
8 # HHsuite version 2.0.16 (January 2013)
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
9 #
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
10 # Reference:
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
11 # Remmert M., Biegert A., Hauser A., and Soding J.
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
12 # HHblits: Lightning-fast iterative protein sequence searching by HMM-HMM alignment.
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
13 # Nat. Methods, epub Dec 25, doi: 10.1038/NMETH.1818 (2011).
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
14
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
15 # (C) Johannes Soeding and Michael Remmert, 2012
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
16
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
17 # This program is free software: you can redistribute it and/or modify
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
18 # it under the terms of the GNU General Public License as published by
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
19 # the Free Software Foundation, either version 3 of the License, or
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
20 # (at your option) any later version.
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
21
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
22 # This program is distributed in the hope that it will be useful,
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
23 # but WITHOUT ANY WARRANTY; without even the implied warranty of
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
24 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
25 # GNU General Public License for more details.
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
26
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
27 # You should have received a copy of the GNU General Public License
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
28 # along with this program. If not, see <http://www.gnu.org/licenses/>.
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
29
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
30 # We are very grateful for bug reports! Please contact us at soeding@genzentrum.lmu.de
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
31
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
32 use lib $ENV{"HHLIB"}."/scripts";
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
33 use HHPaths; # config file with path variables for nr, blast, psipred, pdb, dssp etc.
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
34 use Align; # Needleman-Wunsch and Smith-Waterman alignment functions
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
35 use File::Temp qw/ tempfile tempdir /;
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
36 use strict;
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
37
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
38 my $ss_cit="PSIPRED: Jones DT. (1999) Protein secondary structure prediction based on position-specific scoring matrices. JMB 292:195-202.";
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
39
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
40
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
41 # Module needed for aligning DSSP-sequence
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
42
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
43 $|= 1; # Activate autoflushing on STDOUT
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
44
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
45 # Default values:
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
46 our $v=2; # verbose mode
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
47
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
48 my $numres=0; # number of residues per line for secondary structure
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
49 my $informat="a3m"; # input format
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
50 my $neff = 7; # use alignment with this diversity for PSIPRED prediction
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
51 my $program=$0; # name of perl script
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
52 my $pdbfile;
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
53
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
54 my $help="
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
55 addss.pl from HHsuite $VERSION
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
56 Add PSIPRED secondary structure prediction (and DSSP annotation) to a multiple sequence alignment (MSA)
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
57 or HMMER (multi-)model file.
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
58
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
59 If the input file is an MSA, the predicted secondary structure and confidence values are added as
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
60 special annotation sequences with names >ss_pred, >ss_conf, and >ss_dssp to the top of the output
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
61 A3M alignment. If no output file is given, the output file will have the same name as the input file,
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
62 except for the extension being replaced by '.a3m'. Allowed input formats are A3M (default),
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
63 A2M/FASTA (-fas, -a2m), CLUSTAL (-clu), STOCKHOLM (-sto), HMMER (-hmm).
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
64
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
65 If the input file contains HMMER models, records SSPRD and SSCON containing predicted secondary
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
66 structure and confidence values are added to each model. In this case the output file name is
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
67 obligatory and must be different from the input file name.
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
68
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
69 Usage: perl addss.pl <ali_file> [<outfile>] [-fas|-a3m|-clu|-sto]
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
70 or perl addss.pl <hhm_file> <outfile> -hmm
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
71 \n";
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
72
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
73 # Variable declarations
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
74 my $line;
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
75 my @seqs; # sequences from infile (except >aa_ and >ss_pred sequences)
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
76 my $query_length;
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
77 my $header; # header of MSA: everything before first '>'
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
78 my $name; # query in fasta format: '>$name [^\n]*\n$qseq\n'
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
79 my $qseq; # residues of query sequence
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
80 my $infile;
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
81 my $outfile;
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
82 my $ss_pred=""; # psipred ss states
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
83 my $ss_conf=""; # psipred confidence values
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
84 my $ss_dssp; # dssp states as string
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
85 my $sa_dssp; # relative solvent accessibility from dssp as string {A,B,C,D,E} A:absolutely buried, B:buried, E:exposed
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
86 my $aa_dssp; # residues from dssp file as string
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
87 my $aa_astr; # residues from infile as string
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
88 my $q_match; # number of match states in query sequence
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
89 my $xseq; # sequence x returned from Align.pm
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
90 my $yseq; # sequence y returned from Align.pm
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
91 my $Sstr; # match sequence returned from Align.pm
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
92
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
93 ###############################################################################################
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
94 # Processing command line input
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
95 ###############################################################################################
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
96
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
97 if (@ARGV<1) {die ($help);}
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
98
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
99 my $options="";
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
100 for (my $i=0; $i<@ARGV; $i++) {$options.=" $ARGV[$i] ";}
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
101
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
102 #Input format fasta?
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
103 if ($options=~s/ -fas\s/ /g) {$informat="fas";}
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
104 elsif ($options=~s/ -a2m\s/ /g) {$informat="a2m";}
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
105 elsif ($options=~s/ -a3m\s/ /g) {$informat="a3m";}
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
106 elsif ($options=~s/ -clu\s/ /g) {$informat="clu";}
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
107 elsif ($options=~s/ -sto\s/ /g) {$informat="sto";}
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
108 elsif ($options=~s/ -hmm\s/ /g) {$informat="hmm";}
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
109
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
110 if ($options=~s/ -v\s+(\d+) / /g) {$v=$1;}
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
111
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
112 # Set input and output file
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
113 if ($options=~s/ -i\s+(\S+) //) {$infile=$1;}
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
114 if ($options=~s/ -o\s+(\S+) //) {$outfile=$1;}
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
115 if ($options=~s/^\s*([^-]\S*) //) {$infile=$1;}
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
116 if ($options=~s/^\s*([^-]\S*) //) {$outfile=$1;}
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
117
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
118 # Warn if unknown options found or no infile/outfile
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
119 if ($options!~/^\s*$/) {$options=~s/^\s*(.*?)\s*$/$1/g; die("Error: unknown options '$options'\n");}
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
120 if (!$infile) {print($help); exit(1);}
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
121
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
122 my $v2 = $v-1;
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
123 if ($v2>2) {$v2--;}
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
124 if ($v2<0) {$v2=0;}
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
125
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
126 if ($informat eq "hmm" && !$outfile) {
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
127 print("Error: no output file given. With the -hmm option an output file is obligatory\n"); exit(1);
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
128 }
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
129
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
130 ###############################################################################################
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
131 # Reformat input alignment to a3m and psiblast-readable format and generate file with query sequence
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
132 ###############################################################################################
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
133
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
134 my $inbase; # $inbasename of infile: remove extension
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
135 my $inroot; # $inbasename of infile: remove path and extension
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
136 if ($infile=~/(.*)\..*/) {$inbase=$1;} else {$inbase=$infile;} # remove extension
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
137 if ($inbase=~/.*\/(.*)/) {$inroot=$1;} else {$inroot=$inbase;} # remove path
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
138
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
139 # Create tmpfile
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
140 my $tmpdir;
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
141 if ($v<=3) {$tmpdir = tempdir( CLEANUP => 1);} else {$tmpdir = tempdir( CLEANUP => 0);}
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
142 my ($tmpf, $tmpfile) = tempfile( DIR => $tmpdir );
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
143 my $tmpfile_no_dir;
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
144 if ($tmpfile=~/.*\/(.*)/) {$tmpfile_no_dir=$1;} else {$tmpfile_no_dir=$tmpfile;} # remove path
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
145
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
146
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
147
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
148 ############################################################################################
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
149
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
150 if ($informat ne "hmm") {
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
151 if (!$outfile) {$outfile="$inbase.a3m";}
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
152
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
153 # Use first sequence to define match states and reformat input file to a3m and psi
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
154 if ($informat ne "a3m") {
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
155 &HHPaths::System("$hhscripts/reformat.pl -v $v2 -M first $informat a3m $infile $tmpfile.in.a3m");
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
156 } else {
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
157 &HHPaths::System("cp $infile $tmpfile.in.a3m");
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
158 }
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
159
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
160 # Read query sequence
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
161 open (INFILE, "<$tmpfile.in.a3m") or die ("ERROR: cannot open $tmpfile.in.a3m!\n");
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
162 $/=">"; # set input field separator
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
163 my $i=0;
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
164 $qseq="";
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
165 $header = <INFILE>;
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
166 $header =~s />$//;
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
167 while ($line=<INFILE>) {
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
168 $line=~s/>$//;
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
169 if ($line=~/^ss_/ || $line=~/^aa_/) {next;}
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
170 $seqs[$i++]=">$line";
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
171 if(!$qseq) {
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
172 $line=~s/^(.*)[^\n]*//;
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
173 $name=$1;
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
174 $qseq=$line;
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
175 $qseq=~s/\n//g;
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
176 }
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
177 }
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
178 close(INFILE);
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
179 $/="\n"; # set input field separator
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
180
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
181 if ($qseq =~ /\-/) {
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
182
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
183 # First sequence contains gaps => calculate consensus sequence
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
184 &HHPaths::System("hhconsensus -i $tmpfile.in.a3m -s $tmpfile.sq -o $tmpfile.in.a3m > /dev/null");
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
185
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
186 } else {
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
187
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
188 $query_length = ($qseq=~tr/A-Z/A-Z/);
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
189 $qseq=~tr/A-Z//cd; # remove everything except capital letters
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
190
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
191 # Write query sequence file in FASTA format
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
192 open (QFILE, ">$tmpfile.sq") or die("ERROR: can't open $tmpfile.sq: $!\n");
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
193 printf(QFILE ">%s\n%s\n",$name,$qseq);
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
194 close (QFILE);
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
195 }
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
196
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
197 # Filter alignment to diversity $neff
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
198 if ($v>=1) {printf ("Filtering alignment to diversity $neff ...\n");}
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
199 &HHPaths::System("hhfilter -v $v2 -neff $neff -i $tmpfile.in.a3m -o $tmpfile.in.a3m");
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
200
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
201 # Reformat into PSI-BLAST readable file for jumpstarting
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
202 &HHPaths::System("$hhscripts/reformat.pl -v $v2 -r -noss a3m psi $tmpfile.in.a3m $tmpfile.in.psi");
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
203
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
204 open (ALIFILE, ">$outfile") || die("ERROR: cannot open $outfile: $!\n");
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
205 printf (ALIFILE "%s",$header);
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
206
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
207 # Add DSSP sequence (if available)
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
208 if ($dssp ne "") {
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
209 if (!&AppendDsspSequences("$tmpfile.sq")) {
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
210 if ($numres) {
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
211 $ss_dssp=~s/(\S{$numres})/$1\n/g; # insert a line break every $numres residues
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
212 }
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
213 printf (ALIFILE ">ss_dssp\n%s\n",$ss_dssp);
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
214 if ($v>=1) {print("\nAdding DSSP state sequence ...\n");}
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
215 }
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
216 }
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
217
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
218 # Secondary structure prediction with psipred
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
219 if ($v>=2) {print("Predicting secondary structure with PSIPRED ... ");}
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
220 &RunPsipred("$tmpfile.sq");
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
221
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
222 if (open (PSIPREDFILE, "<$tmpfile.horiz")) {
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
223 $ss_conf="";
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
224 $ss_pred="";
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
225 # Read Psipred file
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
226 while ($line=<PSIPREDFILE>) {
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
227 if ($line=~/^Conf:\s+(\S+)/) {$ss_conf.=$1;}
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
228 elsif ($line=~/^Pred:\s+(\S+)/) {$ss_pred.=$1;}
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
229 }
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
230 close(PSIPREDFILE);
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
231 $ss_conf=~tr/0-9/0/c; # replace all non-numerical symbols with a 0
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
232 if ($numres) {
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
233 $ss_pred=~s/(\S{$numres})/$1\n/g; # insert a line break every $numres residues
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
234 $ss_conf=~s/(\S{$numres})/$1\n/g; # insert a line break every $numres residues
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
235 }
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
236 printf(ALIFILE ">ss_pred PSIPRED predicted secondary structure\n%s\n",$ss_pred);
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
237 printf(ALIFILE ">ss_conf PSIPRED confidence values\n%s\n",$ss_conf);
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
238 }
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
239
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
240 # Append alignment sequences to psipred sequences
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
241 for ($i=0; $i<@seqs; $i++) {
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
242 printf(ALIFILE "%s",$seqs[$i]);
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
243 }
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
244 close(ALIFILE);
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
245 if ($v>=2) {print("done \n");}
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
246 }
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
247 ##############################################################
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
248 # HMMER format
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
249 else
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
250 {
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
251 if (!$outfile) {$outfile="$inbase.hmm";}
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
252
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
253 my $log2 = log(2);
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
254 my @logoddsmat;
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
255 my @lines;
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
256 my $length;
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
257 my $query;
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
258 my $scale=0.13; # empirically determined scale factor between HMMER bit score and PSI-BLAST score, 0.3 for HMMER3
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
259 my $acc;
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
260 my $name;
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
261 my $desc;
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
262 my $nmodels=0;
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
263
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
264 open (INFILE, "<$infile") || die("ERROR: cannot open $infile: $!\n");
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
265 open (OUTFILE, ">$outfile") || die("ERROR: cannot open $outfile: $!\n");
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
266
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
267 # Read HMMER file model by model
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
268 while ($line=<INFILE>) {
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
269 # Search for start of next model
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
270 while ($line && $line!~/^HMMER/ && $line!~/^NAME /) {
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
271 $line=<INFILE>;
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
272 }
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
273 if ($line=~/^HMMER3/) {
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
274
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
275 $scale = 0.3;
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
276 @logoddsmat=();
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
277 @lines=($line);
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
278
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
279 while ($line=<INFILE>) {push(@lines,$line); if ($line=~/^LENG/) {last;}}
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
280 $line=~/^LENG\s+(\d+)/;
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
281 $length=$1; # number of match states in HMM
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
282 $query=""; # query residues from NULL emission lines
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
283 while ($line=<INFILE>) {push(@lines,$line); if ($line=~/^\s*m->m/) {last;}}
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
284 push(@lines,$line=<INFILE>);
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
285 if ($line !~ /^\s*COMPO/) {
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
286 die("Error: need null-model probablities (Parameter COMPO)!\n");
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
287 }
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
288 $line=~s/^\s*COMPO\s+(\S.*\S)\s*$/$1/;
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
289 my @nullmodel = split(/\s+/,$line);
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
290 @nullmodel = map {$_ = exp(-1 * $_)} @nullmodel; # Transform to probabilities
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
291 push(@lines,$line=<INFILE>); # state 0 insert emission
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
292 push(@lines,$line=<INFILE>); # transisitions from begin state
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
293
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
294 while ($line=<INFILE>) {
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
295 push(@lines,$line);
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
296 if ($line=~/^\/\//) {last;}
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
297 $line=~s/^\s*\d+\s+(\S.*\S)\s+\d+\s+(\S)\s+\S\s*$/$1/;
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
298 $query .= $2;
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
299 my @probs = split(/\s+/,$line);
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
300 @probs = map {$_ = exp(-1 * $_)} @probs; # Transform to probabilities
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
301 # calculate log-odds
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
302 my @logodds = ();
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
303 for (my $a = 0; $a < scalar(@probs); $a++) {
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
304 my $logodd = (log($probs[$a] / $nullmodel[$a]) / $log2) * 1000;
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
305 push(@logodds, $logodd);
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
306 }
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
307
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
308 push(@logoddsmat,\@logodds);
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
309 push(@lines,$line=<INFILE>);
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
310 push(@lines,$line=<INFILE>);
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
311 }
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
312
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
313 } else {
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
314
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
315 $scale=0.13;
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
316 if ($line!~/^HMMER/ && $line!~/^NAME /) {last;} # first line in each model must begin with 'HMMER...'
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
317 @logoddsmat=();
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
318 @lines=($line);
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
319
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
320 while ($line=<INFILE>) {push(@lines,$line); if ($line=~/^LENG/) {last;}}
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
321 $line=~/^LENG\s+(\d+)/;
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
322 $length=$1; # number of match states in HMM
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
323 $query=""; # query residues from NULL emission lines
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
324 while ($line=<INFILE>) {push(@lines,$line); if ($line=~/^\s*m->m/) {last;}}
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
325 push(@lines,$line=<INFILE>);
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
326 while ($line=<INFILE>) {
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
327 push(@lines,$line);
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
328 if ($line=~/^\/\//) {last;}
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
329 $line=~s/^\s*\d+\s+(\S.*\S)\s*$/$1/;
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
330 my @logodds = split(/\s+/,$line);
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
331 push(@logoddsmat,\@logodds);
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
332 push(@lines,$line=<INFILE>);
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
333 $line=~/^\s*(\S)/;
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
334 $query .= $1;
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
335 push(@lines,$line=<INFILE>);
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
336 }
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
337 }
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
338
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
339 # Write mtx matrix
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
340 open (MTXFILE, ">$tmpfile.mtx") || die("ERROR: cannot open $tmpfile.mtx: $!\n");
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
341 printf(MTXFILE "%i\n",$length);
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
342 printf(MTXFILE "%s\n",$query);
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
343 printf(MTXFILE "2.670000e-03\n4.100000e-02\n-3.194183e+00\n1.400000e-01\n2.670000e-03\n4.420198e-02\n-3.118986e+00\n1.400000e-01\n3.176060e-03\n1.339561e-01\n-2.010243e+00\n4.012145e-01\n");
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
344 while (@logoddsmat) {
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
345 my @logodds = @{shift(@logoddsmat)};
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
346 print(MTXFILE "-32768 ");
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
347 splice(@logodds, 1,0,-32768/$scale); # insert logodds value for B
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
348 splice(@logodds,20,0, -100/$scale); # insert logodds value for X
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
349 splice(@logodds,22,0,-32768/$scale); # insert logodds value for Z
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
350 for (my $i=0; $i<23; $i++) {
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
351 printf(MTXFILE "%4.0f ",$scale*$logodds[$i]);
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
352 }
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
353 print(MTXFILE "-32768 -400\n");
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
354 }
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
355 close(MTXFILE);
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
356
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
357 # Start Psiblast from checkpoint file tmp.chk that was generated to build the profile
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
358 if (-e "$datadir/weights.dat4") { # Psipred version < 3.0
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
359 &HHPaths::System("$execdir/psipred $tmpfile.mtx $datadir/weights.dat $datadir/weights.dat2 $datadir/weights.dat3 $datadir/weights.dat4 > $tmpfile.ss");
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
360 } else {
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
361 &HHPaths::System("$execdir/psipred $tmpfile.mtx $datadir/weights.dat $datadir/weights.dat2 $datadir/weights.dat3 > $tmpfile.ss");
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
362 }
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
363
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
364 # READ PSIPRED file
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
365 if (open (PSIPRED, "$execdir/psipass2 $datadir/weights_p2.dat 1 0.98 1.09 $tmpfile.ss2 $tmpfile.ss |")) {
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
366 $ss_conf="";
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
367 $ss_pred="";
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
368 # Read Psipred file
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
369 while ($line=<PSIPRED>) {
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
370 if ($line=~/^Conf:\s+(\d+)/) {$ss_conf.=$1;}
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
371 elsif ($line=~/^Pred:\s+(\S+)/) {$ss_pred.=$1;}
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
372 }
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
373 close(PSIPREDFILE);
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
374 }
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
375
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
376 # Add secondary structure to HMMER output file and print
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
377 foreach $line (@lines) {
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
378 if ($line=~/^SSPRD/ || $line=~/^SSCON/|| $line=~/^SSCIT/) {next;}
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
379 if ($line=~/^HMM /) {
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
380 $ss_pred=~s/(\S{80})/$1\nSSPRD /g; # insert a line break every 80 residues
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
381 $ss_conf=~s/(\S{80})/$1\nSSCON /g; # insert a line break every 80 residues
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
382 printf(OUTFILE "SSCIT HHsearch-readable PSIPRED secondary structure prediction:\n");
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
383 printf(OUTFILE "SSPRD %s\n",$ss_pred);
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
384 printf(OUTFILE "SSCON %s\n",$ss_conf);
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
385 printf(OUTFILE "SSCIT %s\n",$ss_cit);
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
386 }
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
387 printf(OUTFILE $line);
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
388 }
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
389 $nmodels++;
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
390 }
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
391
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
392 close(OUTFILE);
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
393 close(INFILE);
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
394 &HHPaths::System("rm $tmpfile.mtx $tmpfile.ss $tmpfile.ss2");
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
395 if ($v>=2) {printf("Added PSIPRED secondary structure to %i models\n",$nmodels);}
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
396 }
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
397
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
398 if ($v<=3) {
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
399 unlink("$tmpfile.in.a3m");
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
400 unlink("$tmpfile.in.psi");
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
401 unlink("$tmpfile.horiz");
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
402 unlink("$tmpfile.dssp");
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
403 }
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
404
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
405 exit;
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
406
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
407 ##############################################################################################
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
408 # Run SS prediction starting from alignment in $tmpfile.in.psi (called by BuildAlignment)
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
409 ##############################################################################################
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
410 sub RunPsipred() {
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
411 # This is a simple script which will carry out all of the basic steps
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
412 # required to make a PSIPRED V2 prediction. Note that it assumes that the
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
413 # following programs are in the appropriate directories:
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
414 # blastpgp - PSIBLAST executable (from NCBI toolkit)
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
415 # makemat - IMPALA utility (from NCBI toolkit)
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
416 # psipred - PSIPRED V2 program
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
417 # psipass2 - PSIPRED V2 program
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
418
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
419 my $infile=$_[0];
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
420 my $basename; #file name without extension
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
421 my $rootname; #basename without directory path
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
422 if ($infile =~/^(.*)\..*?$/) {$basename=$1;} else {$basename=$infile;}
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
423 if ($basename=~/^.*\/(.*?)$/) {$rootname=$1;} else {$rootname=$basename;}
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
424
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
425 # Does dummy database exist?
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
426 if (!-e "$dummydb.phr") {
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
427 if (!-e "$dummydb") {die "Error in addss.pl: Could not find $dummydb\n";}
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
428
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
429 &HHPaths::System("cp $infile $dummydb");
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
430 &HHPaths::System("$ncbidir/formatdb -i $dummydb");
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
431 if (!-e "$dummydb.phr") {die "Error in addss.pl: Could not find nor create index files for $dummydb\n";}
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
432 }
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
433
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
434 # Start Psiblast from checkpoint file tmp.chk that was generated to build the profile
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
435 &HHPaths::System("$ncbidir/blastpgp -b 1 -j 1 -h 0.001 -d $dummydb -i $infile -B $tmpfile.in.psi -C $tmpfile.chk 1> $tmpfile.blalog 2> $tmpfile.blalog");
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
436
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
437 #print("Predicting secondary structure...\n");
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
438
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
439 &HHPaths::System("echo "."$tmpfile_no_dir".".chk > $tmpfile.pn\n");
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
440 &HHPaths::System("echo "."$tmpfile_no_dir".".sq > $tmpfile.sn\n");
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
441 &HHPaths::System("$ncbidir/makemat -P $tmpfile");
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
442
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
443 # Start Psiblast from checkpoint file tmp.chk that was generated to build the profile
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
444 if (-e "$datadir/weights.dat4") { # Psipred version < 3.0
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
445 &HHPaths::System("$execdir/psipred $tmpfile.mtx $datadir/weights.dat $datadir/weights.dat2 $datadir/weights.dat3 $datadir/weights.dat4 > $tmpfile.ss");
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
446 } else {
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
447 &HHPaths::System("$execdir/psipred $tmpfile.mtx $datadir/weights.dat $datadir/weights.dat2 $datadir/weights.dat3 > $tmpfile.ss");
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
448 }
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
449
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
450 &HHPaths::System("$execdir/psipass2 $datadir/weights_p2.dat 1 0.98 1.09 $tmpfile.ss2 $tmpfile.ss > $tmpfile.horiz");
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
451
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
452 # Remove temporary files
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
453 if ($v<=3) { unlink(split ' ', "$tmpfile.pn $tmpfile.sn $tmpfile.mn $tmpfile.chk $tmpfile.blalog $tmpfile.mtx $tmpfile.aux $tmpfile.ss $tmpfile.ss2 $tmpfile.sq");}
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
454 return;
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
455 }
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
456
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
457 ##############################################################################################
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
458 # Read query sequence and extract dssp sequence
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
459 ##############################################################################################
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
460 sub AppendDsspSequences() {
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
461 my $qfile=$_[0];
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
462
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
463 my $line; #input line
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
464 my $name; #name of sequence in in file, e.g. d1g8ma1
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
465 my $qrange; #chain and residue range of query sequence
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
466 my $aas=""; #amino acids from in file for each $name
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
467
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
468 my $dsspfile;
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
469 my $pdbfile;
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
470 my $pdbcode; #pdb code for accessing dssp file; shortened from in code, e.g. 1g8m
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
471 my @ss_dssp=(); #dssp states for residues (H,E,L)
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
472 my @sa_dssp=(); #dssp states for residues (H,E,L)
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
473 my @aa_dssp=(); #residues in dssp file
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
474 my @aa_astr=(); #residues from infile
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
475 my $length; #length of sequence
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
476
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
477 # Default parameters for Align.pm
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
478 our $d=3; # gap opening penatlty for Align.pm
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
479 our $e=0.1; # gap extension penatlty for Align.pm
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
480 our $g=0.09; # endgap penatlty for Align.pm
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
481 our $matrix="identity";
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
482
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
483 # Read query sequence -> $name, $nameline, $range, $aas
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
484 open (QFILE, "<$qfile") || die ("cannot open $qfile: $!");
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
485 while ($line=<QFILE>) {
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
486 if ($line=~/>(\S+)/) {
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
487 $name=$1;
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
488
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
489 # SCOP ID? (d3lkfa_,d3grs_3,d3pmga1,g1m26.1)
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
490 if ($line=~/^>[defgh](\d[a-z0-9]{3})[a-z0-9_.][a-z0-9_]\s+[a-z]\.\d+\.\d+\.\d+\s+\((\S+)\)/) {
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
491 $pdbcode=$1;
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
492 $qrange=$2;
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
493 }
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
494
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
495 # PDB ID? (8fab_A, 1a0i)
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
496 elsif ($line=~/^>(\d[a-z0-9]{3})_?(\S?)\s/) {
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
497 $pdbcode=$1;
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
498 if ($2 ne "") {$qrange="$2:";} else {$qrange="-";}
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
499 }
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
500
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
501 # DALI ID? (8fabA_0,1a0i_2)
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
502 elsif ($line=~/^>(\d[a-z0-9]{3})[A-Za-z0-9]?_\d+\s+\d+\.\d+.\d+.\d+.\d+.\d+\s+\((\S+)\)/) {
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
503 $pdbcode=$1;
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
504 $qrange=$2;
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
505 }
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
506
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
507 else {
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
508 if ($v>=3) {print("Warning: no pdb code found in sequence name '$name'\n");}
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
509 close(QFILE);
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
510 return 1; # no astral/DALI/pdb sequence => no dssp states available
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
511 }
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
512 $aas="";
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
513
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
514 }
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
515 else
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
516 {
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
517 chomp($line);
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
518 $line=~tr/a-z \t/A-Z/d;
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
519 $aas.=$line;
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
520 }
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
521 }
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
522 close(QFILE);
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
523 if ($v>=3) {printf("Searching DSSP state assignments: name=%s range=%s\n",$name,$qrange);}
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
524
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
525 # Try to open dssp file
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
526 $dsspfile="$dsspdir/$pdbcode.dssp";
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
527 if (! open (DSSPFILE, "<$dsspfile")) {
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
528 if ($v>=3) {printf(STDERR "Warning in $program: Cannot open $dsspfile!\n");}
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
529 $pdbfile = &OpenPDBfile($pdbcode);
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
530 if ($pdbfile eq "") {return;}
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
531
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
532 system("$dssp $pdbfile $tmpfile.dssp 2> /dev/null");
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
533 system("cp $tmpfile.dssp $dsspfile 2> /dev/null");
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
534 $dsspfile="$tmpfile.dssp";
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
535 if (! open (DSSPFILE, "<$dsspfile")) {
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
536 if ($v>=3) {printf(STDERR "Warning in $program: dssp couldn't generate file from $pdbfile. Skipping $name\n");}
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
537 return 1;
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
538 }
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
539 }
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
540
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
541 #....+....1....+....2....+....3....+....4
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
542 # # RESIDUE AA STRUCTURE BP1 BP2 ACC etc.
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
543 # 623 630 A R < 0 0 280 etc.
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
544 # 624 !* 0 0 0 etc.
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
545 # 625 8 B A 0 0 105 etc.
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
546 # 626 9 B P >> - 0 0 71 etc.
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
547 # 292 28SA K H 4 S+ 0 0 71 etc. (1qdm.dssp)
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
548 # 293 29SA K H > S+ 0 0 28 etc.
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
549
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
550 # Read in whole DSSP file
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
551 for (my $try = 1; $try<=2; $try++) {
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
552 $aa_dssp="";
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
553 $sa_dssp="";
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
554 $ss_dssp="";
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
555 while ($line=<DSSPFILE>) {if ($line=~/^\s*\#\s*RESIDUE\s+AA/) {last;}}
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
556 while ($line=<DSSPFILE>)
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
557 {
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
558 if ($line=~/^.{5}(.{5})(.)(.)\s(.).\s(.).{18}(...)/)
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
559 {
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
560 my $thisres=$1;
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
561 my $icode=$2;
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
562 my $chain=$3;
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
563 my $aa=$4;
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
564 my $ss=$5;
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
565 my $sa=$6;
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
566 my $contained=0;
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
567 my $range=$qrange;
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
568 if ($aa eq "!") {next;} # missing residues!
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
569 $thisres=~tr/ //d;
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
570 $chain=~tr/ //d;
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
571 $icode=~tr/ //d;
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
572 $sa=~tr/ //d;
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
573 if ($try==1) {
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
574 do{
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
575 if ($range=~s/^(\S):(-?\d+)[A-Z]-(\d+)([A-Z])// && $chain eq $1 && $icode eq $4 && $2<=$thisres && $thisres<=$3) {
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
576 $contained=1; #syntax (A:56S-135S)
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
577 }
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
578 elsif ($range=~s/^(\S):(-?\d+)[A-Z]?-(\d+)[A-Z]?// && $chain eq $1 && $2<=$thisres && $thisres<=$3) {
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
579 $contained=1; #syntax (R:56-135)
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
580 }
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
581 elsif ($range=~s/^(-?\d+)[A-Z]-(\d+)([A-Z])// && $chain eq "" && $icode eq $3 && $1<=$thisres && $thisres<=$2) {
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
582 $contained=1; #syntax (56-135)
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
583 }
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
584 elsif ($range=~s/^(-?\d+)[A-Z]?-(\d+)[A-Z]?// && $chain eq "" && $1<=$thisres && $thisres<=$2) {
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
585 $contained=1; #syntax (56-135)
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
586 }
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
587 elsif ($range=~s/^(\S):// && $chain eq $1) {
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
588 $contained=1; #syntax (A:) or (A:,2:)
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
589 }
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
590 elsif ($range=~s/^-$// && $chain eq "") {
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
591 $contained=1; #syntax (-)
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
592 }
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
593 $range=~s/^,//;
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
594 # print("qrange=$qrange range='$range' ires=$thisres chain=$chain contained=$contained\n");
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
595 } while($contained==0 && $range ne "");
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
596 if ($contained==0) {next;}
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
597 } # end if try==1
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
598 $aa_dssp.=$aa;
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
599 $ss_dssp.=$ss;
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
600 $sa_dssp.=&sa2c($sa,$aa);
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
601 }
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
602 }
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
603 # if not enough residues were found: chain id is wrong => repeat extraction without checking chain id
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
604 if (length($aa_dssp)>=10) {last;}
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
605 close(DSSPFILE);
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
606 open (DSSPFILE, "<$dsspfile");
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
607 }
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
608 close(DSSPFILE);
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
609
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
610 if (length($aa_dssp)==0) {print("WARNING: no residues found in $dsspdir/$pdbcode.dssp\n"); return 1;}
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
611 if (length($aa_dssp)<=20) {printf("WARNING: only %i residues found in $dsspdir/$pdbcode.dssp\n",length($aa_dssp)); return 1;}
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
612
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
613 # Postprocess $aa_dssp etc
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
614 $aa_dssp =~ tr/a-z/CCCCCCCCCCCCCCCCCCCCCCCCCC/;
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
615 $ss_dssp =~ tr/ I/CC/;
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
616 $ss_dssp =~ s/ \S / /g;
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
617 $ss_dssp =~ s/ \S\S / /g;
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
618
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
619 # Align query with dssp sequence
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
620 $aa_astr = $aas;
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
621 $xseq=$aas;
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
622 $yseq=$aa_dssp;
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
623 my ($imax,$imin,$jmax,$jmin);
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
624 my (@i,@j);
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
625 my $score=&AlignNW(\$xseq,\$yseq,\@i,\@j,\$imin,\$imax,\$jmin,\$jmax,\$Sstr);
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
626
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
627 # Initialize strings (=arrays) for dssp states with "----...-"
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
628 my @ss_dssp_ali=(); # $ss_dssp_ali[$i] is dssp state aligned to $aa_astr[$i]
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
629 my @sa_dssp_ali=(); # $sa_dssp_ali[$i] is solvent accessibility string
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
630 my @aa_dssp_ali=(); # $aa_dssp_ali[$i] is dssp residue aligned to $aa_astr[$i]
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
631 for (my $i=0; $i<=length($aa_astr); $i++) { # sum up to len+1
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
632 # because 0'th element in @ss_dssp and @aa_dssp is dummy "-"
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
633 $ss_dssp_ali[$i]="-";
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
634 $sa_dssp_ali[$i]="-";
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
635 $aa_dssp_ali[$i]="-";
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
636 }
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
637
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
638 # To each residue (from i=0 to len-1) of input sequence $aa_astr assign aligned dssp state
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
639 @ss_dssp = split(//,$ss_dssp);
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
640 @sa_dssp = split(//,$sa_dssp);
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
641 @aa_dssp = split(//,$aa_dssp);
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
642 @aa_astr = split(//,$aa_astr);
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
643 my $len = 0;
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
644 unshift(@aa_dssp,"-"); #add a gap symbol at beginning -> first residue is at 1!
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
645 unshift(@ss_dssp,"-"); #add a gap symbol at beginning -> first residue is at 1!
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
646 unshift(@sa_dssp,"-"); #add a gap symbol at beginning -> first residue is at 1!
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
647 unshift(@aa_astr,"-"); #add a gap symbol at beginning -> first residue is at 1!
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
648 for (my $col=0; $col<@i; $col++) {
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
649 if ($i[$col]>0) {
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
650 if ($j[$col]>0) {$len++;} # count match states (for score/len calculation)
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
651 $ss_dssp_ali[$i[$col]]=$ss_dssp[$j[$col]];
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
652 $sa_dssp_ali[$i[$col]]=$sa_dssp[$j[$col]];
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
653 $aa_dssp_ali[$i[$col]]=$aa_dssp[$j[$col]];
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
654 }
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
655 if ($v>=4) {
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
656 printf ("%s %3i %s %3i\n",$aa_astr[$i[$col]],$i[$col],$aa_dssp[$j[$col]],$j[$col]);
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
657 }
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
658 }
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
659 shift (@ss_dssp_ali); # throw out first "-"
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
660 shift (@sa_dssp_ali); # throw out first "-"
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
661 shift (@aa_dssp_ali); # throw out first "-"
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
662 $aa_dssp=join("",@aa_dssp_ali);
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
663 $ss_dssp=join("",@ss_dssp_ali);
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
664 $sa_dssp=join("",@sa_dssp_ali);
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
665
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
666 # Debugging output
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
667 if ($v>=4) {printf(STDOUT "DSSP: %s: length=%-3i score/len:%-5.3f\n",$name,$len,$score/$len);}
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
668 if ($v>=4) {
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
669 printf("IN: %s\n",$xseq);
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
670 printf("MATCH: %s\n",$Sstr);
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
671 printf("DSSP: %s\n",$yseq);
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
672 printf("\n");
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
673 printf(">ss_dssp $name\n$ss_dssp\n");
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
674 printf(">sa_dssp $name\n$sa_dssp\n");
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
675 printf(">aa_dssp $name\n$aa_dssp\n");
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
676 printf(">aa_astra $name\n$aa_astr\n\n");
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
677 }
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
678 if ($score/$len<0.5) {
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
679 printf (STDOUT "\nWARNING: in $name: alignment score with dssp residues too low: Score/len=%f.\n\n",$score/$len);
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
680 printf("IN: %s\n",$xseq);
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
681 printf("MATCH: %s\n",$Sstr);
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
682 printf("DSSP: %s\n",$yseq);
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
683 return 1;
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
684 }
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
685
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
686 return 0;
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
687 }
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
688
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
689 ################################################################################################
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
690 ### Return solvent accessibility code
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
691 ################################################################################################
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
692 sub sa2c ()
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
693 {
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
694 my %maxsa = (A=>106, B=>160, C=>135, D=>163, E=>194, F=>197, G=>84, H=>184, I=>169, K=>205, L=>164, M=>188,
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
695 N=>157, P=>136, Q=>198, R=>248, S=>130, T=>142, V=>142, W=>227, X=>180, Y=>222, Z=>196); # maximum solvent accessiblity
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
696 if ($_[1]=~/[a-z]/) {return "F";} # disulphide bridge
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
697 if (!defined $maxsa{$_[1]}) {return "-";} # no amino acid
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
698 my $rsa=$_[0]/$maxsa{$_[1]};
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
699 # printf("aa=%s sa=%5.1f max_sa=%5.1f rsa=%5.3f\n",$_[1],$_[0],$maxsa{$_[1]},$rsa);
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
700 if ($rsa<=0.02) {return "A";}
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
701 elsif ($rsa<=0.14) {return "B";}
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
702 elsif ($rsa<=0.33) {return "C";}
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
703 elsif ($rsa<=0.55) {return "D";}
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
704 else {return "E";}
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
705 }
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
706
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
707 # Find the pdb file with $pdbcode in pdb directory
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
708 sub OpenPDBfile() {
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
709
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
710 my $pdbcode=lc($_[0]);
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
711 if (! -e "$pdbdir") {
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
712 if ($v>=3) {print(STDERR "Warning in $program: pdb directory '$pdbdir' does not exist!\n");}
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
713 return 1;
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
714 }
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
715 if (-e "$pdbdir/all") {$pdbfile="$pdbdir/all/";}
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
716 elsif (-e "$pdbdir/divided") {$pdbfile="$pdbdir/divided/".substr($pdbcode,1,2)."/";}
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
717 else {$pdbfile="$pdbdir/";}
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
718 if ($pdbdir=~/divided.?$/) {$pdbfile.=substr($pdbcode,1,2)."/";}
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
719 if (-e $pdbfile."pdb$pdbcode.ent") {$pdbfile.="pdb$pdbcode.ent";}
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
720 elsif (-e $pdbfile."pdb$pdbcode.ent.gz") {$pdbfile="gunzip -c $pdbfile"."pdb$pdbcode.ent.gz |";}
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
721 elsif (-e $pdbfile."pdb$pdbcode.ent.Z") {$pdbfile="gunzip -c $pdbfile"."pdb$pdbcode.ent.Z |";}
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
722 elsif (-e $pdbfile."$pdbcode.pdb") {$pdbfile."$pdbcode.pdb";}
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
723 else {
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
724 if ($v>=3) {printf(STDERR "Warning in $program: Cannot find pdb file $pdbfile"."pdb$pdbcode.ent!\n");}
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
725 return "";
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
726 }
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
727 if (!open (PDBFILE, "$pdbfile")) {
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
728 if ($v>=3) {printf(STDERR "Error in $program: Cannot open pdb file: $!\n");}
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
729 return "";
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
730 }
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
731 return $pdbfile;
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
732 }