Mercurial > repos > hammock > hammock
comparison external_tools/darwin/lib/hh/scripts/create_profile_from_hmmer.pl @ 6:2277dd59b9f9 draft
Uploaded
author | hammock |
---|---|
date | Wed, 01 Nov 2017 05:54:28 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
5:b7652b7c97bd | 6:2277dd59b9f9 |
---|---|
1 #!/usr/bin/env perl | |
2 # | |
3 # create_profile_from_hmmer.pl | |
4 # Create a profile (.prf) from a given HMMER/HMMER3 file | |
5 | |
6 # HHsuite version 2.0.16 (January 2013) | |
7 # | |
8 # Reference: | |
9 # Remmert M., Biegert A., Hauser A., and Soding J. | |
10 # HHblits: Lightning-fast iterative protein sequence searching by HMM-HMM alignment. | |
11 # Nat. Methods, epub Dec 25, doi: 10.1038/NMETH.1818 (2011). | |
12 | |
13 # (C) Michael Remmert and Johannes Soeding, 2012 | |
14 | |
15 # This program is free software: you can redistribute it and/or modify | |
16 # it under the terms of the GNU General Public License as published by | |
17 # the Free Software Foundation, either version 3 of the License, or | |
18 # (at your option) any later version. | |
19 | |
20 # This program is distributed in the hope that it will be useful, | |
21 # but WITHOUT ANY WARRANTY; without even the implied warranty of | |
22 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
23 # GNU General Public License for more details. | |
24 | |
25 # You should have received a copy of the GNU General Public License | |
26 # along with this program. If not, see <http://www.gnu.org/licenses/>. | |
27 | |
28 # We are very grateful for bug reports! Please contact us at soeding@genzentrum.lmu.de | |
29 | |
30 use lib $ENV{"HHLIB"}."/scripts"; | |
31 use HHPaths; # config file with path variables for nr, blast, psipred, pdb, dssp etc. | |
32 use strict; | |
33 | |
34 $|= 1; # Activate autoflushing on STDOUT | |
35 | |
36 # Default values: | |
37 our $v=2; # verbose mode | |
38 my $factor = 1; # mix orig sequence and HMMER profile with this factor (1 = 50/50, 2 = 33/66, 0.5 = 66/33 (orig/HMMER)) | |
39 my $scale = 1000; | |
40 my $log2 = log(2); | |
41 | |
42 my $help=" | |
43 create_profile_from_hmmer.pl from HHsuite $VERSION | |
44 Create a profile (.prf) from a given HMMER/HMMER3 file | |
45 | |
46 Usage: perl create_profile_from_hmmer.pl -i <infile> [-o <outfile>] | |
47 | |
48 Options: | |
49 -i <infile> Input file in HMMER/HMMER3 format | |
50 -o <outfile> Output file in prf-format (default: infile.prf) | |
51 | |
52 -v [0-5] verbose mode (default: $v) | |
53 \n"; | |
54 | |
55 # Variable declarations | |
56 my $line; | |
57 my $infile; | |
58 my $outfile; | |
59 my $i; | |
60 my $a; | |
61 | |
62 my @counts; # count profile (normalised to 1) | |
63 my $name; # name of HMMER profile | |
64 my $len; # length of HMMER profile | |
65 my @hmmer_prof; # HMMER profile | |
66 | |
67 # A C D E F G H I K L M N P Q R S T V W Y | |
68 my @hmmeraa2csaa = ( 0, 4, 3, 6, 13, 7, 8, 9, 11, 10, 12, 2, 14, 5, 1, 15, 16, 19, 17, 18); | |
69 my @aminoacids = ('A', 'R', 'N', 'D', 'C', 'Q', 'E', 'G', 'H', 'I', 'L', 'K', 'M', 'F', 'P', 'S', 'T', 'W', 'Y', 'V'); | |
70 my %aa2i; | |
71 for ($a = 0; $a < 20; $a++) { | |
72 $aa2i{$aminoacids[$a]} = $a; | |
73 } | |
74 | |
75 ############################################################################################### | |
76 # Processing command line input | |
77 ############################################################################################### | |
78 | |
79 if (@ARGV<1) {die ($help);} | |
80 | |
81 my $options=""; | |
82 for (my $i=0; $i<@ARGV; $i++) {$options.=" $ARGV[$i] ";} | |
83 | |
84 if ($options=~s/ -i\s+(\S+) //) {$infile=$1;} | |
85 if ($options=~s/ -o\s+(\S+) //) {$outfile=$1;} | |
86 | |
87 if ($options=~s/ -factor\s+(\S+) //) {$factor=$1;} | |
88 | |
89 if ($options=~s/ -v\s+(\S+) //) {$v=$1;} | |
90 | |
91 if (!$infile) {print($help); print "ERROR! No input file!\n"; exit(1);} | |
92 | |
93 if (!$outfile) { | |
94 $infile =~ /^(\S+)\.\S+?$/; | |
95 $outfile = "$1.prf"; | |
96 } | |
97 | |
98 ############################################################################################## | |
99 # Main part | |
100 ############################################################################################## | |
101 | |
102 ###################################### | |
103 # Read HMMER sequence and profile | |
104 ###################################### | |
105 open (IN, $infile); | |
106 $line = <IN>; | |
107 | |
108 if ($line =~ /^HMMER3/) { | |
109 while ($line = <IN>) { | |
110 if ($line =~ /^NAME\s+(\S+)/) { | |
111 $name = $1; | |
112 } elsif ($line =~ /^LENG\s+(\d+)/) { | |
113 $len = $1; | |
114 } elsif ($line =~ /^HMM/) { | |
115 last; | |
116 } | |
117 } | |
118 $line = <IN>; $line = <IN>; # Skip header lines | |
119 if ($line =~ /^\s*COMPO/) { | |
120 $line = <IN>; $line = <IN>; # Skip null model lines | |
121 } | |
122 # Read profiles and query seq | |
123 $i = 0; | |
124 while ($line = <IN>) { | |
125 if ($line =~ /^\/\//) { last; } | |
126 | |
127 $line =~ s/^\s*\d+//; # sequence position | |
128 for ($a = 0; $a < 20; $a++) { | |
129 $line =~ s/^\s*(\S+)\s/ /; | |
130 $hmmer_prof[$i][$hmmeraa2csaa[$a]] = exp(-1.0*$1); | |
131 } | |
132 | |
133 # Read query char in count profile | |
134 for ($a = 0; $a < 20; $a++) { | |
135 $counts[$i][$a] = 0; | |
136 } | |
137 $line =~ /^\s*\d+\s+(\S)/; | |
138 $counts[$i][$aa2i{$1}] = 1; | |
139 | |
140 $line = <IN>; $line = <IN>; | |
141 $i++; | |
142 | |
143 } | |
144 } elsif ($line =~ /^HMMER/) { | |
145 my @pb; | |
146 while ($line = <IN>) { | |
147 if ($line =~ /^NAME\s+(\S+)/) { | |
148 $name = $1; | |
149 } elsif ($line =~ /^LENG\s+(\d+)/) { | |
150 $len = $1; | |
151 } elsif ($line =~ /^NULE/) { | |
152 $line =~ s/^NULE//; # sequence position | |
153 for ($a = 0; $a < 20; $a++) { | |
154 $line =~ s/^\s*(\S+)\s/ /; | |
155 # pb[a] = (float) 0.05 * fpow2(float(ptr)/HMMSCALE); | |
156 $pb[$a] = 0.05 * (2**($1/1000)); | |
157 } | |
158 } elsif ($line =~ /^HMM/) { | |
159 last; | |
160 } | |
161 } | |
162 | |
163 | |
164 $line = <IN>; $line = <IN>; # Skip header lines | |
165 # Read profiles and query seq | |
166 $i = 0; | |
167 while ($line = <IN>) { | |
168 if ($line =~ /^\/\//) { last; } | |
169 | |
170 $line =~ s/^\s*\d+//; # sequence position | |
171 for ($a = 0; $a < 20; $a++) { | |
172 $line =~ s/^\s*(\S+)\s/ /; | |
173 # prob = pb[a]*fpow2(float(ptr)/HMMSCALE); | |
174 $hmmer_prof[$i][$hmmeraa2csaa[$a]] = $pb[$a] * (2**($1/1000)); | |
175 } | |
176 | |
177 # Read query char in count profile | |
178 $line = <IN>; | |
179 for ($a = 0; $a < 20; $a++) { | |
180 $counts[$i][$a] = 0; | |
181 } | |
182 $line =~ /^\s*(\S)/; | |
183 $counts[$i][$aa2i{$1}] = 1; | |
184 | |
185 $line = <IN>; | |
186 $i++; | |
187 } | |
188 | |
189 } else { | |
190 print($help); | |
191 print "ERROR! Unknown input format!\n"; | |
192 exit(1); | |
193 } | |
194 ###################################### | |
195 # build count_profile (mix orig sequence and HMMER-profile) | |
196 ###################################### | |
197 for ($i = 0; $i < $len; $i++) { | |
198 my $sum = 0; | |
199 for ($a = 0; $a < 20; $a++) { | |
200 $counts[$i][$a] += $factor * $hmmer_prof[$i][$a]; | |
201 $sum += $counts[$i][$a]; | |
202 } | |
203 # Normalize to one | |
204 my $fac = 1 / $sum; | |
205 for ($a = 0; $a < 20; $a++) { | |
206 $counts[$i][$a] *= $fac; | |
207 } | |
208 } | |
209 | |
210 ###################################### | |
211 # write count_profile | |
212 ###################################### | |
213 | |
214 open (OUT, ">$outfile"); | |
215 # Write header | |
216 printf(OUT "CountProfile\n"); | |
217 printf(OUT "NAME\t%s\n", $name); | |
218 printf(OUT "LENG\t%i\n", $len); | |
219 printf(OUT "ALPH\t20\n"); # 20 amino acid alphabet | |
220 printf(OUT "COUNTS"); | |
221 for ($a = 0; $a < 20; $a++) { | |
222 printf(OUT "\t%s", $aminoacids[$a]); | |
223 } | |
224 printf(OUT "\tNEFF\n"); | |
225 | |
226 # Write profile | |
227 for ($i = 0; $i < $len; $i++) { | |
228 printf(OUT "%i", $i+1); | |
229 for ($a = 0; $a < 20; $a++) { | |
230 if ($counts[$i][$a] == 0) { | |
231 printf(OUT "\t*"); | |
232 } else { | |
233 printf(OUT "\t%i", int(-(log2($counts[$i][$a]) * $scale)+0.5)); | |
234 } | |
235 } | |
236 printf(OUT "\t%i\n", 1 * $scale); # set Neff to 1 | |
237 } | |
238 | |
239 printf(OUT "//\n"); | |
240 close OUT; | |
241 | |
242 exit; | |
243 | |
244 sub log2() | |
245 { | |
246 my $n = shift; | |
247 return (log($n)/$log2); | |
248 } |