comparison external_tools/darwin/lib/hh/scripts/create_profile_from_hmmer.pl @ 6:2277dd59b9f9 draft

Uploaded
author hammock
date Wed, 01 Nov 2017 05:54:28 -0400
parents
children
comparison
equal deleted inserted replaced
5:b7652b7c97bd 6:2277dd59b9f9
1 #!/usr/bin/env perl
2 #
3 # create_profile_from_hmmer.pl
4 # Create a profile (.prf) from a given HMMER/HMMER3 file
5
6 # HHsuite version 2.0.16 (January 2013)
7 #
8 # Reference:
9 # Remmert M., Biegert A., Hauser A., and Soding J.
10 # HHblits: Lightning-fast iterative protein sequence searching by HMM-HMM alignment.
11 # Nat. Methods, epub Dec 25, doi: 10.1038/NMETH.1818 (2011).
12
13 # (C) Michael Remmert and Johannes Soeding, 2012
14
15 # This program is free software: you can redistribute it and/or modify
16 # it under the terms of the GNU General Public License as published by
17 # the Free Software Foundation, either version 3 of the License, or
18 # (at your option) any later version.
19
20 # This program is distributed in the hope that it will be useful,
21 # but WITHOUT ANY WARRANTY; without even the implied warranty of
22 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
23 # GNU General Public License for more details.
24
25 # You should have received a copy of the GNU General Public License
26 # along with this program. If not, see <http://www.gnu.org/licenses/>.
27
28 # We are very grateful for bug reports! Please contact us at soeding@genzentrum.lmu.de
29
30 use lib $ENV{"HHLIB"}."/scripts";
31 use HHPaths; # config file with path variables for nr, blast, psipred, pdb, dssp etc.
32 use strict;
33
34 $|= 1; # Activate autoflushing on STDOUT
35
36 # Default values:
37 our $v=2; # verbose mode
38 my $factor = 1; # mix orig sequence and HMMER profile with this factor (1 = 50/50, 2 = 33/66, 0.5 = 66/33 (orig/HMMER))
39 my $scale = 1000;
40 my $log2 = log(2);
41
42 my $help="
43 create_profile_from_hmmer.pl from HHsuite $VERSION
44 Create a profile (.prf) from a given HMMER/HMMER3 file
45
46 Usage: perl create_profile_from_hmmer.pl -i <infile> [-o <outfile>]
47
48 Options:
49 -i <infile> Input file in HMMER/HMMER3 format
50 -o <outfile> Output file in prf-format (default: infile.prf)
51
52 -v [0-5] verbose mode (default: $v)
53 \n";
54
55 # Variable declarations
56 my $line;
57 my $infile;
58 my $outfile;
59 my $i;
60 my $a;
61
62 my @counts; # count profile (normalised to 1)
63 my $name; # name of HMMER profile
64 my $len; # length of HMMER profile
65 my @hmmer_prof; # HMMER profile
66
67 # A C D E F G H I K L M N P Q R S T V W Y
68 my @hmmeraa2csaa = ( 0, 4, 3, 6, 13, 7, 8, 9, 11, 10, 12, 2, 14, 5, 1, 15, 16, 19, 17, 18);
69 my @aminoacids = ('A', 'R', 'N', 'D', 'C', 'Q', 'E', 'G', 'H', 'I', 'L', 'K', 'M', 'F', 'P', 'S', 'T', 'W', 'Y', 'V');
70 my %aa2i;
71 for ($a = 0; $a < 20; $a++) {
72 $aa2i{$aminoacids[$a]} = $a;
73 }
74
75 ###############################################################################################
76 # Processing command line input
77 ###############################################################################################
78
79 if (@ARGV<1) {die ($help);}
80
81 my $options="";
82 for (my $i=0; $i<@ARGV; $i++) {$options.=" $ARGV[$i] ";}
83
84 if ($options=~s/ -i\s+(\S+) //) {$infile=$1;}
85 if ($options=~s/ -o\s+(\S+) //) {$outfile=$1;}
86
87 if ($options=~s/ -factor\s+(\S+) //) {$factor=$1;}
88
89 if ($options=~s/ -v\s+(\S+) //) {$v=$1;}
90
91 if (!$infile) {print($help); print "ERROR! No input file!\n"; exit(1);}
92
93 if (!$outfile) {
94 $infile =~ /^(\S+)\.\S+?$/;
95 $outfile = "$1.prf";
96 }
97
98 ##############################################################################################
99 # Main part
100 ##############################################################################################
101
102 ######################################
103 # Read HMMER sequence and profile
104 ######################################
105 open (IN, $infile);
106 $line = <IN>;
107
108 if ($line =~ /^HMMER3/) {
109 while ($line = <IN>) {
110 if ($line =~ /^NAME\s+(\S+)/) {
111 $name = $1;
112 } elsif ($line =~ /^LENG\s+(\d+)/) {
113 $len = $1;
114 } elsif ($line =~ /^HMM/) {
115 last;
116 }
117 }
118 $line = <IN>; $line = <IN>; # Skip header lines
119 if ($line =~ /^\s*COMPO/) {
120 $line = <IN>; $line = <IN>; # Skip null model lines
121 }
122 # Read profiles and query seq
123 $i = 0;
124 while ($line = <IN>) {
125 if ($line =~ /^\/\//) { last; }
126
127 $line =~ s/^\s*\d+//; # sequence position
128 for ($a = 0; $a < 20; $a++) {
129 $line =~ s/^\s*(\S+)\s/ /;
130 $hmmer_prof[$i][$hmmeraa2csaa[$a]] = exp(-1.0*$1);
131 }
132
133 # Read query char in count profile
134 for ($a = 0; $a < 20; $a++) {
135 $counts[$i][$a] = 0;
136 }
137 $line =~ /^\s*\d+\s+(\S)/;
138 $counts[$i][$aa2i{$1}] = 1;
139
140 $line = <IN>; $line = <IN>;
141 $i++;
142
143 }
144 } elsif ($line =~ /^HMMER/) {
145 my @pb;
146 while ($line = <IN>) {
147 if ($line =~ /^NAME\s+(\S+)/) {
148 $name = $1;
149 } elsif ($line =~ /^LENG\s+(\d+)/) {
150 $len = $1;
151 } elsif ($line =~ /^NULE/) {
152 $line =~ s/^NULE//; # sequence position
153 for ($a = 0; $a < 20; $a++) {
154 $line =~ s/^\s*(\S+)\s/ /;
155 # pb[a] = (float) 0.05 * fpow2(float(ptr)/HMMSCALE);
156 $pb[$a] = 0.05 * (2**($1/1000));
157 }
158 } elsif ($line =~ /^HMM/) {
159 last;
160 }
161 }
162
163
164 $line = <IN>; $line = <IN>; # Skip header lines
165 # Read profiles and query seq
166 $i = 0;
167 while ($line = <IN>) {
168 if ($line =~ /^\/\//) { last; }
169
170 $line =~ s/^\s*\d+//; # sequence position
171 for ($a = 0; $a < 20; $a++) {
172 $line =~ s/^\s*(\S+)\s/ /;
173 # prob = pb[a]*fpow2(float(ptr)/HMMSCALE);
174 $hmmer_prof[$i][$hmmeraa2csaa[$a]] = $pb[$a] * (2**($1/1000));
175 }
176
177 # Read query char in count profile
178 $line = <IN>;
179 for ($a = 0; $a < 20; $a++) {
180 $counts[$i][$a] = 0;
181 }
182 $line =~ /^\s*(\S)/;
183 $counts[$i][$aa2i{$1}] = 1;
184
185 $line = <IN>;
186 $i++;
187 }
188
189 } else {
190 print($help);
191 print "ERROR! Unknown input format!\n";
192 exit(1);
193 }
194 ######################################
195 # build count_profile (mix orig sequence and HMMER-profile)
196 ######################################
197 for ($i = 0; $i < $len; $i++) {
198 my $sum = 0;
199 for ($a = 0; $a < 20; $a++) {
200 $counts[$i][$a] += $factor * $hmmer_prof[$i][$a];
201 $sum += $counts[$i][$a];
202 }
203 # Normalize to one
204 my $fac = 1 / $sum;
205 for ($a = 0; $a < 20; $a++) {
206 $counts[$i][$a] *= $fac;
207 }
208 }
209
210 ######################################
211 # write count_profile
212 ######################################
213
214 open (OUT, ">$outfile");
215 # Write header
216 printf(OUT "CountProfile\n");
217 printf(OUT "NAME\t%s\n", $name);
218 printf(OUT "LENG\t%i\n", $len);
219 printf(OUT "ALPH\t20\n"); # 20 amino acid alphabet
220 printf(OUT "COUNTS");
221 for ($a = 0; $a < 20; $a++) {
222 printf(OUT "\t%s", $aminoacids[$a]);
223 }
224 printf(OUT "\tNEFF\n");
225
226 # Write profile
227 for ($i = 0; $i < $len; $i++) {
228 printf(OUT "%i", $i+1);
229 for ($a = 0; $a < 20; $a++) {
230 if ($counts[$i][$a] == 0) {
231 printf(OUT "\t*");
232 } else {
233 printf(OUT "\t%i", int(-(log2($counts[$i][$a]) * $scale)+0.5));
234 }
235 }
236 printf(OUT "\t%i\n", 1 * $scale); # set Neff to 1
237 }
238
239 printf(OUT "//\n");
240 close OUT;
241
242 exit;
243
244 sub log2()
245 {
246 my $n = shift;
247 return (log($n)/$log2);
248 }