Mercurial > repos > hammock > hammock
diff external_tools/linux/lib/hh/scripts/create_profile_from_hmmer.pl @ 6:2277dd59b9f9 draft
Uploaded
author | hammock |
---|---|
date | Wed, 01 Nov 2017 05:54:28 -0400 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/external_tools/linux/lib/hh/scripts/create_profile_from_hmmer.pl Wed Nov 01 05:54:28 2017 -0400 @@ -0,0 +1,248 @@ +#!/usr/bin/env perl +# +# create_profile_from_hmmer.pl +# Create a profile (.prf) from a given HMMER/HMMER3 file + +# HHsuite version 2.0.16 (January 2013) +# +# Reference: +# Remmert M., Biegert A., Hauser A., and Soding J. +# HHblits: Lightning-fast iterative protein sequence searching by HMM-HMM alignment. +# Nat. Methods, epub Dec 25, doi: 10.1038/NMETH.1818 (2011). + +# (C) Michael Remmert and Johannes Soeding, 2012 + +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. + +# You should have received a copy of the GNU General Public License +# along with this program. If not, see <http://www.gnu.org/licenses/>. + +# We are very grateful for bug reports! Please contact us at soeding@genzentrum.lmu.de + +use lib $ENV{"HHLIB"}."/scripts"; +use HHPaths; # config file with path variables for nr, blast, psipred, pdb, dssp etc. +use strict; + +$|= 1; # Activate autoflushing on STDOUT + +# Default values: +our $v=2; # verbose mode +my $factor = 1; # mix orig sequence and HMMER profile with this factor (1 = 50/50, 2 = 33/66, 0.5 = 66/33 (orig/HMMER)) +my $scale = 1000; +my $log2 = log(2); + +my $help=" +create_profile_from_hmmer.pl from HHsuite $VERSION +Create a profile (.prf) from a given HMMER/HMMER3 file + +Usage: perl create_profile_from_hmmer.pl -i <infile> [-o <outfile>] + +Options: + -i <infile> Input file in HMMER/HMMER3 format + -o <outfile> Output file in prf-format (default: infile.prf) + + -v [0-5] verbose mode (default: $v) +\n"; + +# Variable declarations +my $line; +my $infile; +my $outfile; +my $i; +my $a; + +my @counts; # count profile (normalised to 1) +my $name; # name of HMMER profile +my $len; # length of HMMER profile +my @hmmer_prof; # HMMER profile + + # A C D E F G H I K L M N P Q R S T V W Y +my @hmmeraa2csaa = ( 0, 4, 3, 6, 13, 7, 8, 9, 11, 10, 12, 2, 14, 5, 1, 15, 16, 19, 17, 18); +my @aminoacids = ('A', 'R', 'N', 'D', 'C', 'Q', 'E', 'G', 'H', 'I', 'L', 'K', 'M', 'F', 'P', 'S', 'T', 'W', 'Y', 'V'); +my %aa2i; +for ($a = 0; $a < 20; $a++) { + $aa2i{$aminoacids[$a]} = $a; +} + +############################################################################################### +# Processing command line input +############################################################################################### + +if (@ARGV<1) {die ($help);} + +my $options=""; +for (my $i=0; $i<@ARGV; $i++) {$options.=" $ARGV[$i] ";} + +if ($options=~s/ -i\s+(\S+) //) {$infile=$1;} +if ($options=~s/ -o\s+(\S+) //) {$outfile=$1;} + +if ($options=~s/ -factor\s+(\S+) //) {$factor=$1;} + +if ($options=~s/ -v\s+(\S+) //) {$v=$1;} + +if (!$infile) {print($help); print "ERROR! No input file!\n"; exit(1);} + +if (!$outfile) { + $infile =~ /^(\S+)\.\S+?$/; + $outfile = "$1.prf"; +} + +############################################################################################## +# Main part +############################################################################################## + +###################################### +# Read HMMER sequence and profile +###################################### +open (IN, $infile); +$line = <IN>; + +if ($line =~ /^HMMER3/) { + while ($line = <IN>) { + if ($line =~ /^NAME\s+(\S+)/) { + $name = $1; + } elsif ($line =~ /^LENG\s+(\d+)/) { + $len = $1; + } elsif ($line =~ /^HMM/) { + last; + } + } + $line = <IN>; $line = <IN>; # Skip header lines + if ($line =~ /^\s*COMPO/) { + $line = <IN>; $line = <IN>; # Skip null model lines + } + # Read profiles and query seq + $i = 0; + while ($line = <IN>) { + if ($line =~ /^\/\//) { last; } + + $line =~ s/^\s*\d+//; # sequence position + for ($a = 0; $a < 20; $a++) { + $line =~ s/^\s*(\S+)\s/ /; + $hmmer_prof[$i][$hmmeraa2csaa[$a]] = exp(-1.0*$1); + } + + # Read query char in count profile + for ($a = 0; $a < 20; $a++) { + $counts[$i][$a] = 0; + } + $line =~ /^\s*\d+\s+(\S)/; + $counts[$i][$aa2i{$1}] = 1; + + $line = <IN>; $line = <IN>; + $i++; + + } +} elsif ($line =~ /^HMMER/) { + my @pb; + while ($line = <IN>) { + if ($line =~ /^NAME\s+(\S+)/) { + $name = $1; + } elsif ($line =~ /^LENG\s+(\d+)/) { + $len = $1; + } elsif ($line =~ /^NULE/) { + $line =~ s/^NULE//; # sequence position + for ($a = 0; $a < 20; $a++) { + $line =~ s/^\s*(\S+)\s/ /; + # pb[a] = (float) 0.05 * fpow2(float(ptr)/HMMSCALE); + $pb[$a] = 0.05 * (2**($1/1000)); + } + } elsif ($line =~ /^HMM/) { + last; + } + } + + + $line = <IN>; $line = <IN>; # Skip header lines + # Read profiles and query seq + $i = 0; + while ($line = <IN>) { + if ($line =~ /^\/\//) { last; } + + $line =~ s/^\s*\d+//; # sequence position + for ($a = 0; $a < 20; $a++) { + $line =~ s/^\s*(\S+)\s/ /; + # prob = pb[a]*fpow2(float(ptr)/HMMSCALE); + $hmmer_prof[$i][$hmmeraa2csaa[$a]] = $pb[$a] * (2**($1/1000)); + } + + # Read query char in count profile + $line = <IN>; + for ($a = 0; $a < 20; $a++) { + $counts[$i][$a] = 0; + } + $line =~ /^\s*(\S)/; + $counts[$i][$aa2i{$1}] = 1; + + $line = <IN>; + $i++; + } + +} else { + print($help); + print "ERROR! Unknown input format!\n"; + exit(1); +} +###################################### +# build count_profile (mix orig sequence and HMMER-profile) +###################################### +for ($i = 0; $i < $len; $i++) { + my $sum = 0; + for ($a = 0; $a < 20; $a++) { + $counts[$i][$a] += $factor * $hmmer_prof[$i][$a]; + $sum += $counts[$i][$a]; + } + # Normalize to one + my $fac = 1 / $sum; + for ($a = 0; $a < 20; $a++) { + $counts[$i][$a] *= $fac; + } +} + +###################################### +# write count_profile +###################################### + +open (OUT, ">$outfile"); +# Write header +printf(OUT "CountProfile\n"); +printf(OUT "NAME\t%s\n", $name); +printf(OUT "LENG\t%i\n", $len); +printf(OUT "ALPH\t20\n"); # 20 amino acid alphabet +printf(OUT "COUNTS"); +for ($a = 0; $a < 20; $a++) { + printf(OUT "\t%s", $aminoacids[$a]); +} +printf(OUT "\tNEFF\n"); + +# Write profile +for ($i = 0; $i < $len; $i++) { + printf(OUT "%i", $i+1); + for ($a = 0; $a < 20; $a++) { + if ($counts[$i][$a] == 0) { + printf(OUT "\t*"); + } else { + printf(OUT "\t%i", int(-(log2($counts[$i][$a]) * $scale)+0.5)); + } + } + printf(OUT "\t%i\n", 1 * $scale); # set Neff to 1 +} + +printf(OUT "//\n"); +close OUT; + +exit; + +sub log2() +{ + my $n = shift; + return (log($n)/$log2); +}