comparison external_tools/linux/lib/hh/scripts/create_profile_from_hhm.pl @ 6:2277dd59b9f9 draft

Uploaded
author hammock
date Wed, 01 Nov 2017 05:54:28 -0400
parents
children
comparison
equal deleted inserted replaced
5:b7652b7c97bd 6:2277dd59b9f9
1 #!/usr/bin/env perl
2 #
3 # create_profile_from_hhm.pl
4 # Create a profile (.prf) from a given HHM file
5
6 # HHsuite version 2.0.16 (January 2013)
7 #
8 # Reference:
9 # Remmert M., Biegert A., Hauser A., and Soding J.
10 # HHblits: Lightning-fast iterative protein sequence searching by HMM-HMM alignment.
11 # Nat. Methods, epub Dec 25, doi: 10.1038/NMETH.1818 (2011).
12
13 # (C) Michael Remmert and Johannes Soeding, 2012
14
15 # This program is free software: you can redistribute it and/or modify
16 # it under the terms of the GNU General Public License as published by
17 # the Free Software Foundation, either version 3 of the License, or
18 # (at your option) any later version.
19
20 # This program is distributed in the hope that it will be useful,
21 # but WITHOUT ANY WARRANTY; without even the implied warranty of
22 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
23 # GNU General Public License for more details.
24
25 # You should have received a copy of the GNU General Public License
26 # along with this program. If not, see <http://www.gnu.org/licenses/>.
27
28 # We are very grateful for bug reports! Please contact us at soeding@genzentrum.lmu.de
29
30 use lib $ENV{"HHLIB"}."/scripts";
31 use HHPaths; # config file with path variables for nr, blast, psipred, pdb, dssp etc.
32 use strict;
33
34 $|= 1; # Activate autoflushing on STDOUT
35
36 # Default values:
37 our $v=2; # verbose mode
38
39 my $help="
40 create_profile_from_hhm.pl from HHsuite $VERSION
41 Create a profile (.prf) from a given HHM file
42
43 Usage: perl create_profile_from_hhm.pl -i <infile> [-o <outfile>]
44
45 Options:
46 -i <infile> Input file in HHM format
47 -o <outfile> Output file in prf-format (default: infile.prf)
48
49 -v [0-5] verbose mode (default: $v)
50 \n";
51
52 # Variable declarations
53 my $line;
54 my $infile;
55 my $outfile;
56 my $i;
57 my $a;
58
59 my @counts; # count profile
60 my @neffs;
61 my $name; # name of HHM profile
62 my $len; # length of HHM profile
63
64 # A C D E F G H I K L M N P Q R S T V W Y
65 my @hhmaa2csaa = ( 0, 4, 3, 6, 13, 7, 8, 9, 11, 10, 12, 2, 14, 5, 1, 15, 16, 19, 17, 18);
66 my @aminoacids = ('A', 'R', 'N', 'D', 'C', 'Q', 'E', 'G', 'H', 'I', 'L', 'K', 'M', 'F', 'P', 'S', 'T', 'W', 'Y', 'V');
67 my %aa2i;
68 for ($a = 0; $a < 20; $a++) {
69 $aa2i{$aminoacids[$a]} = $a;
70 }
71
72 ###############################################################################################
73 # Processing command line input
74 ###############################################################################################
75
76 if (@ARGV<1) {die ($help);}
77
78 my $options="";
79 for (my $i=0; $i<@ARGV; $i++) {$options.=" $ARGV[$i] ";}
80
81 if ($options=~s/ -i\s+(\S+) //) {$infile=$1;}
82 if ($options=~s/ -o\s+(\S+) //) {$outfile=$1;}
83
84 if ($options=~s/ -v\s+(\S+) //) {$v=$1;}
85
86 if (!$infile) {print($help); print "ERROR! No input file!\n"; exit(1);}
87
88 if (!$outfile) {
89 $infile =~ /^(\S+)\.\S+?$/;
90 $outfile = "$1.prf";
91 }
92
93 ##############################################################################################
94 # Main part
95 ##############################################################################################
96
97 ######################################
98 # Read HHM profile
99 ######################################
100 open (IN, $infile);
101 while ($line = <IN>) {
102 if ($line =~ /^NAME\s+(\S+)/) {
103 $name = $1;
104 } elsif ($line =~ /^LENG\s+(\d+)/) {
105 $len = $1;
106 } elsif ($line =~ /^HMM/) {
107 last;
108 }
109 }
110
111 $i = 0;
112 while ($line = <IN>) {
113 if ($line =~ /^\/\//) { last; }
114
115 if ($line =~ s/^\S \d+ //) {
116
117 for ($a = 0; $a < 20; $a++) {
118 $line =~ s/^\s*(\S+)\s/ /;
119 $counts[$i][$hhmaa2csaa[$a]] = $1;
120 if ($counts[$i][$hhmaa2csaa[$a]] !~ /\*/ && $counts[$i][$hhmaa2csaa[$a]] == 0) {
121 $counts[$i][$hhmaa2csaa[$a]] = 1;
122 }
123 }
124
125 $line = <IN>;
126 $line =~ /^\s*\S+\s+\S+\s+\S+\s+\S+\s+\S+\s+\S+\s+\S+\s+(\S+)\s+/;
127 $neffs[$i] = $1;
128
129 $i++;
130 }
131 }
132
133 ######################################
134 # write count_profile
135 ######################################
136
137 open (OUT, ">$outfile");
138 # Write header
139 printf(OUT "CountProfile\n");
140 printf(OUT "NAME\t%s\n", $name);
141 printf(OUT "LENG\t%i\n", $len);
142 printf(OUT "ALPH\t20\n"); # 20 amino acid alphabet
143 printf(OUT "COUNTS");
144 for ($a = 0; $a < 20; $a++) {
145 printf(OUT "\t%s", $aminoacids[$a]);
146 }
147 printf(OUT "\tNEFF\n");
148
149 # Write profile
150 for ($i = 0; $i < $len; $i++) {
151 printf(OUT "%i", $i+1);
152 for ($a = 0; $a < 20; $a++) {
153 if ($counts[$i][$a] == '*') {
154 printf(OUT "\t*");
155 } else {
156 printf(OUT "\t%i", $counts[$i][$a]);
157 }
158 }
159 printf(OUT "\t%i\n", $neffs[$i]);
160 }
161
162 printf(OUT "//\n");
163 close OUT;
164
165 exit;
166
167