Mercurial > repos > hammock > hammock
view external_tools/darwin/lib/hh/scripts/hhblitsdb.pl @ 6:2277dd59b9f9 draft
Uploaded
author | hammock |
---|---|
date | Wed, 01 Nov 2017 05:54:28 -0400 |
parents | |
children |
line wrap: on
line source
#!/usr/bin/env perl # # hhblits.pl # Creates HH-suite database files from A3M and HHM/HMMER-formatted files # Usage: Usage: perl hhblitsdb.pl -o <db_name> [-ia3m <a3m_dir>] [-ihhm <hhm_dir>] [-ics <cs_dir>] [more_options] # # HHsuite version 2.0.16 (Sept 2012) # # Reference: # Remmert M., Biegert A., Hauser A., and Soding J. # HHblits: Lightning-fast iterative protein sequence searching by HMM-HMM alignment. # Nat. Methods, epub Dec 25, doi: 10.1038/NMETH.1818 (2011). # (C) Johannes Soeding, 2012 # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation, either version 3 of the License, or # (at your option) any later version. # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # You should have received a copy of the GNU General Public License # along with this program. If not, see <http://www.gnu.org/licenses/>. # We are very grateful for bug reports! Please contact us at soeding@genzentrum.lmu.de use lib $ENV{"HHLIB"}."/scripts"; use HHPaths; # config file with path variables for nr, blast, psipred, pdb, dssp etc. use strict; #use File::Glob 'bsd_glob'; # splits patterns delimited by spaces into multiple patterns and applies them using OR $|= 1; # Activate autoflushing on STDOUT # Default values: our $v=2; # verbose mode my $a_if_append = ""; # do not append by default (default: create new db) my $remove = 0; # do not remove by default (default: create new db) my $hhmext = "hhm"; # default HHM-file extension my $csext = "seq219"; # default HHM-file extension my $cpu = 8; # Variable declarations my $line; my $command; my $a3mdir = ""; # name of input A3M directory my $hhmdir = ""; # name of input HHM/HMM directory my $csdir = ""; # name of input cs directory my $a3mfile = ""; # name of packed ouput A3M file my $hhmfile = ""; # name of packed ouput HHM file my $csfile = ""; # name of cs sequence db file my $dbname = ""; # output db name my $logfile = "/dev/null"; # log file my $file; my $numcsfiles= 0; my $num_chars = 0; my $numa3mfiles=0; my $numhhmfiles=0; my $fileglob=""; my $help=" hhblitsdb.pl from HHsuite $VERSION Builds HH-suite database from a3m formatted MSAs and/or from HMMs (-o). MSAs and HMMs can also be added (-a) to or removed (-r) from an existing database. Usage: hhblitsdb.pl -o|-a|-r <db_name> [-ia3m <a3m_dir>] [-ihhm <hhm_dir>] [-ics <cs_dir>]... With option -o, the following HH-suite database files can be generated: <db_name>.cs219 column-state sequences, one for each MSA/HMM (for prefilter) <db_name>.cs219.sizes number of sequences and characters in <db_name>.cs219 <db_name>_a3m_db packed file containing A3M alignments read from <a3m_dir> <db_name>_a3m_db.index index file for packed A3M file <db_name>_a3m.db.index.sizes number of lines in <db_name>_a3m_db.index <db_name>_hhm_db packed file containing HHM-formatted HMMs read from <hhm_dir> <db_name>_hhm_db.index index file for packed HHM file <db_name>_hhm_db.index.sizes number of lines in <db_name>_hhm_db.index Options: -o <db_name> create database with this name -a <db_name> append files to database with this name -r <db_name> remove files from database with this name -ia3m <a3m_dir> input directory (or glob of directories) with A3M-formatted files These files MUST have extension 'a3m'. -ihhm <hhm_dir> input directory (or glob of directories) with HHM (or HMMER) files These files MUST have extension 'hhm' (HHsuite) or 'hmm' (HMMER3). -ics <cs_dir> input directory (or glob of directories) with column state sequences -log <logfile> log file recording stderr stream of cstranslate and hhmake commands -csext <ext> extension of column state sequences (default: $csext) -hmm use HMMER-formatted files. These MUST have extension hmm (WARNING! HMMER format results in decreased performance over HHM format) -v [1-3] verbose mode (default: $v) -cpu <int> number of threads to generate cs219 and hhm files (default = $cpu) -f 'file_glob' string with list of glob expression of files to remove Example 1: only -ia3m given; cs sequences and hhm files are generated from a3m files perl hhblitsdb.pl -o databases/mydb -ia3m mydb/a3ms/ Example 2: only -ihhm given; cs sequences are generated from hhm files, but no a3m db file perl hhblitsdb.pl -o databases/mydb -ihhm mydb/hhms/ Example 3: -ia3m and -ihhm given; cs sequences are generated from a3m files perl hhblitsdb.pl -o databases/mydb -ia3m mydb/a3ms/ -ihhm mydb/hhms/ Example 4: -ics, -ia3m, and -ihhm given; all db files are created perl hhblitsdb.pl -o databases/mydb -ia3m mydb/a3ms/ -ihhm mydb/hhms/ -ics mydb/cs/ Example 5: using glob expression to specify files (note the singe quotes) perl hhblitsdb.pl -o databases/mydb -ihhm 'mydbs*/hhms/*.hhm' Example 6: add files to database; cs sequences and hhm files are generated from a3m files perl hhblitsdb.pl -a databases/mydb -ia3m 'mydbs/a3ms/g1a*.a3m' Example 7: remove files from database perl hhblitsdb.pl -r databases/mydb -f 'mydbs/a3ms/g1a*.* mydbs2/' \n"; ############################################################################################### # Processing command line input ############################################################################################### if (@ARGV<1) {die ($help);} for (my $i=0; $i<@ARGV; $i++) { if ($ARGV[$i] eq "-ics") { if (++$i<@ARGV) { $csdir=$ARGV[$i]; } else { die ("$help\n\nERROR! Missing directory after -ics option!\n"); } } elsif ($ARGV[$i] eq "-ia3m") { if (++$i<@ARGV) { $a3mdir=$ARGV[$i]; } else { die ("$help\n\nERROR! Missing directory after -ia3m option!\n"); } } elsif ($ARGV[$i] eq "-ihhm") { if (++$i<@ARGV) { $hhmdir=$ARGV[$i]; } else { die ("$help\n\nERROR! Missing directory after -ihhm option!\n"); } } elsif ($ARGV[$i] eq "-log") { if (++$i<@ARGV) { $logfile=$ARGV[$i]; unlink $logfile; } else { die ("$help\n\nERROR! Missing filename after -log option!\n"); } } elsif ($ARGV[$i] eq "-csext") { if (++$i<@ARGV) { $csext=$ARGV[$i]; } else { die ("$help\n\nERROR! Missing extension after -csext option!\n"); } } elsif ($ARGV[$i] eq "-hmm") { $hhmext="hmm"; print("\nWARNING! HMMER format results in decreased performance over HHM format. We recommend to generate hhm files directly from multiple sequence alignments using hmake.\n"); } elsif ($ARGV[$i] eq "-v") { if (++$i<@ARGV) { $v=$ARGV[$i]; } else { $v = 2; } } elsif ($ARGV[$i] eq "-cpu") { if (++$i<@ARGV) { $cpu=$ARGV[$i]; } } elsif ($ARGV[$i] eq "-f") { if (++$i<@ARGV) { $fileglob=$ARGV[$i]; } else { die ("$help\n\nERROR! Missing expression after -f option!\n"); } } elsif ($ARGV[$i] eq "-r") { if (++$i<@ARGV) { if ($dbname!="") {die("$help\n\nERROR! options -o and -r not compatible!\n");} $dbname=$ARGV[$i]; $remove=1; } else { die ("$help\n\nERROR! Missing filename after -o option!\n"); } } elsif ($ARGV[$i] eq "-a") { if (++$i<@ARGV) { if ($remove==1) {die("$help\n\nERROR! options -r and -a not compatible!\n");} if ($dbname!="") {die("$help\n\nERROR! options -o and -a not compatible!\n");} $dbname=$ARGV[$i]; $a_if_append="a"; } else { die ("$help\n\nERROR! Missing filename after -o option!\n"); } } elsif ($ARGV[$i] eq "-o") { if (++$i<@ARGV) { if ($remove==1) {die("$help\n\nERROR! options -r and -o not compatible!\n");} if ($a_if_append) {die("$help\n\nERROR! options -a and -o not compatible!\n");} $dbname=$ARGV[$i]; } else { die ("$help\n\nERROR! Missing filename after -o option!\n"); } } else { if ($dbname="") { $dbname=$ARGV[$i]; } else { print "WARNING! Unknown option $ARGV[$i]!\n"; } } } # Check input if (!$dbname) {print($help); die("ERROR! Name of database is missing! Use -o <db_name>\n");} $a3mdir=~s/\/$//; $a3mfile = $dbname."_a3m_db"; $hhmfile = $dbname."_hhm_db"; $csfile = $dbname.".cs219"; if ($hhmdir) {$hhmdir=~s/\/$//;} if ($a_if_append eq "" && $remove==0) { unlink $csfile, $a3mfile, $a3mfile.".index", $hhmfile, $hhmfile.".index"; } if ($a3mdir eq "" && $hhmdir eq "" && $csdir eq "" && $remove==0) { print($help); print "ERROR! At least one input directory must be given!\n"; exit(1); } # If $csdir is simple directory instead of glob expression, turn it into glob expression if ($csdir) { if ($csdir !~ /\*/ && $csdir !~ /\?/ && $csdir !~ / /) { $csdir .= "/*.".$csext; } } if ($a3mdir) { if ($a3mdir !~ /\*/ && $a3mdir !~ /\?/ && $a3mdir !~ / /) { $a3mdir .= "/*.a3m"; } } if ($hhmdir) { if ($hhmdir !~ /\*/ && $hhmdir !~ /\?/ && $hhmdir !~ / /) { $hhmdir .= "/*.".$hhmext; } } # If in append mode, initialize size counters with present sizes if ($a_if_append || $remove==1) { open (IN, "<$a3mfile.index.sizes") || die("Error: can't open $a3mfile.index.sizes: $!"); $line = <IN>; close IN; $line =~ /^(\S*)/; $numa3mfiles = $1; open (IN, "<$hhmfile.index.sizes") || die("Error: can't open $hhmfile.index.sizes: $!"); $line = <IN>; close IN; $line =~ /^(\S*)/; $numhhmfiles = $1; open (IN, "<$csfile.sizes") || die("Error: can't open $csfile.sizes: $!"); $line = <IN>; close IN; $line =~ /(\S*)\s+(\S*)/; $numcsfiles = $1; $num_chars = $2; printf("Current number of a3m files in db: %i\n",$numa3mfiles); printf("Current number of $hhmext files in db: %i\n",$numhhmfiles); printf("Current number of $csext files in db: %i\n\n",$numcsfiles); } else { $numa3mfiles = 0; $numhhmfiles = 0; $numcsfiles = 0; $num_chars = 0; } # Create tmp directory (plus path, if necessary) my $tmpdir="/tmp/$ENV{USER}/$$"; # directory where all temporary files are written: /tmp/UID/PID my $suffix=$tmpdir; while ($suffix=~s/^\/[^\/]+//) { $tmpdir=~/(.*)$suffix/; if (!-d $1) {mkdir($1,0777);} } unlink glob("$tmpdir/*"); # clean up directory if it already exists unlink $logfile; ############################################################################################## # Remove files? ############################################################################################## if ($remove==1) { printf("Removing files from indices...\n"); # Read numbers of sequences and characters in csfile open (IN, "<$csfile.sizes"); $line = <IN>; close IN; $line =~ /(\S*)\s+(\S*)/; $numcsfiles = $1; $num_chars = $2; # Remove names from a3m and hhm index files my $files = " ".join(" ", glob($fileglob)); $files =~ s/\S*\///g; &HHPaths::System("ffindex_modify -su $dbname"."_a3m_db.index ".$files); &HHPaths::System("ffindex_modify -su $dbname"."_hhm_db.index ".$files); # Adjust number of files in $a3mfile.index.sizes $numa3mfiles = 0; open (IN, "<$a3mfile.index") || die("Error: can't open $a3mfile.index: $!"); while(<IN>) {$numa3mfiles++;} close IN; open (OUT, ">$a3mfile.index.sizes") || die("Error: can't open $a3mfile.index.sizes: $!"); printf(OUT "%i\n",$numa3mfiles); close(OUT); # Adjust number of files in $hhmfile.index.sizes $numhhmfiles = 0; open (IN, "<$hhmfile.index") || die("Error: can't open $hhmfile.index: $!"); while(<IN>) {$numhhmfiles++;} close IN; open (OUT, ">$hhmfile.index.sizes") || die("Error: can't open $hhmfile.index.sizes: $!"); printf(OUT "%i\n",$numa3mfiles); close(OUT); # Remove sequences of globbed files from cs file my $skipseq=0; $numcsfiles = 0; $num_chars = 0; open (IN, "<$csfile"); open (OUT, ">$csfile".".tmp"); foreach my $line (<IN>) { if ($line =~ /^>(\w*)/) { my $name = $1; if ($files =~ / $name\./) { # found name in list of globbed file names? $skipseq=1; } else { $skipseq=0; printf(OUT "%s",$line); $numcsfiles++; } } else { if (!$skipseq) { printf(OUT "%s",$line); $num_chars += length($line); } } } close(OUT); close(IN); unlink($csfile); &HHPaths::System("mv $csfile".".tmp ".$csfile); # Adjust $csfile.sizes open (OUT, ">$csfile.sizes"); print OUT "$numcsfiles $num_chars\n"; close OUT; } else { ############################################################################################## # Generate new db or append to old ############################################################################################## ############################################################################################## # Generate column-state database file ############################################################################################## # Generate column-state sequences in $tmpdir if no -ics directory given if (!$csdir) { my $x = 0.3; # parameters for cstranslate my $c = 4; # parameters for cstranslate if ($a3mdir) { print("Generating seq219 files in $tmpdir/ from a3m files $a3mdir\n\n"); $command = "$hhbin/cstranslate -i \$file -o $tmpdir/\$base.seq219 -D $context_lib -A $cs_lib -x $x -c $c 1>>$logfile 2>>$logfile"; &HHPaths::System("$hhscripts/multithread.pl '".$a3mdir."' '$command' -cpu $cpu"); } elsif ($hhmdir) { if ($hhmext eq "hmm") { print("\nGenerating prf profile files in $tmpdir/ from hmm files $hhmdir/\n\n"); $command = "$hhscripts/create_profile_from_hmmer.pl -i \$file -o $tmpdir/\$base.prf 1>/dev/null 2>>$logfile"; &HHPaths::System("$hhscripts/multithread.pl '".$hhmdir."' '$command' -cpu $cpu"); } else { # $hhmext eq "hhm" print("\nGenerating prf profile files in $tmpdir/ from hhm files $hhmdir/\n\n"); $command = "$hhscripts/create_profile_from_hhm.pl -i \$file -o $tmpdir/\$base.prf 1>/dev/null 2>>$logfile"; &HHPaths::System("$hhscripts/multithread.pl '".$hhmdir."' '$command' -cpu $cpu"); } print("\nGenerating seq219 files in $tmpdir/ from prf files in $tmpdir/\n\n"); if ($hhmext eq "hmm") { $command = "$hhbin/cstranslate -i \$file -o \$name.seq219 -A $cs_lib 1>>$logfile 2>>$logfile"; } else { # $hhmext eq "hhm" $command = "$hhbin/cstranslate -i \$file -o \$name.seq219 -A $cs_lib -D $context_lib -x $x -c $c 1>>$logfile 2>>$logfile"; } &HHPaths::System("$hhscripts/multithread.pl '".$tmpdir."/*.prf' '$command' -cpu $cpu"); } $csdir = $tmpdir."/*.$csext"; } # Write columns state sequences into cs database file, # replace names in cs sequences with filenames: ">name+description" => ">filename" if ($a_if_append) { open (OUT, ">>$csfile"); } else { open (OUT, ">$csfile"); } foreach my $seq219file (glob($csdir)) { open (IN, "<$seq219file"); my @lines = <IN>; close(IN); $seq219file =~ s/.*?([^\/]*)\.$csext\s*/$1/ or die ("Error: $seq219file does not have the extension $csext!?\n"); foreach my $line (@lines) { if ($line =~ /^>/) { $line = ">".$seq219file."\n"; $numcsfiles++; } else { $num_chars += length($line); } printf(OUT "%s",$line); } } close(OUT); open (OUT, ">$csfile.sizes"); print OUT "$numcsfiles $num_chars\n"; close OUT; ############################################################################################## # Generate hhm files with hhmake from a3m files if no -ihhm directory given ############################################################################################## if (!$hhmdir) { if ($a3mdir) { print("\nGenerating hhm files in $tmpdir/ from a3m files $a3mdir/\n\n"); $command = "hhmake -i \$file -o $tmpdir/\$base.hhm 1>/dev/null 2>>$logfile"; &HHPaths::System("$hhscripts/multithread.pl '".$a3mdir."' '$command' -cpu $cpu"); $hhmdir = $tmpdir."/*.$hhmext";; $numhhmfiles += scalar(glob("$hhmdir")); } } ############################################################################################## # Generate packed A3M and HMM files and index files ############################################################################################## # Generate packed A3M file and index file? if ($a3mdir ne "") { print "Creating packed A3M database file $a3mfile ...\n"; open (OUT, ">$tmpdir/a3m.filelist"); my @files = glob("$a3mdir"); $numa3mfiles += scalar(@files); foreach $file (@files) { print OUT "$file\n"; } close OUT; # Build packed file (concatenated with '\0' as delimiters) and index file from files in file list # The ffindex binaries are contained in <install_dir>/lib/ffindex/bin/ $command = "ffindex_build -".$a_if_append."s -f $tmpdir/a3m.filelist $a3mfile $a3mfile.index"; &HHPaths::System($command); open (OUT, ">$a3mfile.index.sizes"); print OUT "$numa3mfiles\n"; close OUT; } # Generate packed HHMM file and index file? if ($hhmdir ne "") { print "Creating packed HHM database file $hhmfile ...\n"; open (OUT, ">$tmpdir/hhm.filelist"); my @files = glob("$hhmdir"); $numhhmfiles += scalar(@files); foreach $file (@files) { print OUT "$file\n"; } close OUT; # Build packed file (concatenated with '\0' as delimiters) and index file from files in file list # The ffindex binaries are contained in <install_dir>/lib/ffindex/bin/ $command = "ffindex_build -".$a_if_append."s -f $tmpdir/hhm.filelist $hhmfile $hhmfile.index"; &HHPaths::System($command); open (OUT, ">$hhmfile.index.sizes"); print OUT "$numhhmfiles\n"; close OUT; } } # end if $remove==0 print("\n"); printf("New number of a3m files in db: %i\n",$numa3mfiles); printf("New number of $hhmext files in db: %i\n",$numhhmfiles); printf("New number of $csext files in db: %i\n\n",$numcsfiles); my $err=0; if ($numa3mfiles && $numhhmfiles && $numa3mfiles != $numhhmfiles) { print("************************************************************************** WARNING: Number of a3m files not equal to number of $hhmext files\n"); $err=1; } if ($numcsfiles && $numhhmfiles && $numcsfiles != $numhhmfiles) { print("************************************************************************** WARNING: Number of $csext files not equal to number of $hhmext files\n"); $err=1; } if ($numcsfiles && $numa3mfiles && $numcsfiles != $numa3mfiles) { print("************************************************************************** WARNING: Number of $csext files not equal to number of a3m files\n"); $err=1; } if ($err==1) {print("************************************************************************** $tmpdir will not be removed to check for missing files **************************************************************************\n");} elsif ($v<3) { $command = "rm -rf $tmpdir"; # &System($command); } wait; exit;