comparison external_tools/linux/lib/hh/scripts/hhblitsdb.pl @ 6:2277dd59b9f9 draft

Uploaded
author hammock
date Wed, 01 Nov 2017 05:54:28 -0400
parents
children
comparison
equal deleted inserted replaced
5:b7652b7c97bd 6:2277dd59b9f9
1 #!/usr/bin/env perl
2 #
3 # hhblits.pl
4 # Creates HH-suite database files from A3M and HHM/HMMER-formatted files
5 # Usage: Usage: perl hhblitsdb.pl -o <db_name> [-ia3m <a3m_dir>] [-ihhm <hhm_dir>] [-ics <cs_dir>] [more_options]
6 #
7 # HHsuite version 2.0.16 (Sept 2012)
8 #
9 # Reference:
10 # Remmert M., Biegert A., Hauser A., and Soding J.
11 # HHblits: Lightning-fast iterative protein sequence searching by HMM-HMM alignment.
12 # Nat. Methods, epub Dec 25, doi: 10.1038/NMETH.1818 (2011).
13
14 # (C) Johannes Soeding, 2012
15
16 # This program is free software: you can redistribute it and/or modify
17 # it under the terms of the GNU General Public License as published by
18 # the Free Software Foundation, either version 3 of the License, or
19 # (at your option) any later version.
20
21 # This program is distributed in the hope that it will be useful,
22 # but WITHOUT ANY WARRANTY; without even the implied warranty of
23 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
24 # GNU General Public License for more details.
25
26 # You should have received a copy of the GNU General Public License
27 # along with this program. If not, see <http://www.gnu.org/licenses/>.
28
29 # We are very grateful for bug reports! Please contact us at soeding@genzentrum.lmu.de
30
31 use lib $ENV{"HHLIB"}."/scripts";
32 use HHPaths; # config file with path variables for nr, blast, psipred, pdb, dssp etc.
33 use strict;
34 #use File::Glob 'bsd_glob'; # splits patterns delimited by spaces into multiple patterns and applies them using OR
35
36 $|= 1; # Activate autoflushing on STDOUT
37
38 # Default values:
39 our $v=2; # verbose mode
40 my $a_if_append = ""; # do not append by default (default: create new db)
41 my $remove = 0; # do not remove by default (default: create new db)
42 my $hhmext = "hhm"; # default HHM-file extension
43 my $csext = "seq219"; # default HHM-file extension
44 my $cpu = 8;
45
46 # Variable declarations
47 my $line;
48 my $command;
49 my $a3mdir = ""; # name of input A3M directory
50 my $hhmdir = ""; # name of input HHM/HMM directory
51 my $csdir = ""; # name of input cs directory
52 my $a3mfile = ""; # name of packed ouput A3M file
53 my $hhmfile = ""; # name of packed ouput HHM file
54 my $csfile = ""; # name of cs sequence db file
55 my $dbname = ""; # output db name
56 my $logfile = "/dev/null"; # log file
57 my $file;
58 my $numcsfiles= 0;
59 my $num_chars = 0;
60 my $numa3mfiles=0;
61 my $numhhmfiles=0;
62 my $fileglob="";
63 my $help="
64 hhblitsdb.pl from HHsuite $VERSION
65 Builds HH-suite database from a3m formatted MSAs and/or from HMMs (-o).
66 MSAs and HMMs can also be added (-a) to or removed (-r) from an existing database.
67
68 Usage: hhblitsdb.pl -o|-a|-r <db_name> [-ia3m <a3m_dir>] [-ihhm <hhm_dir>] [-ics <cs_dir>]...
69
70 With option -o, the following HH-suite database files can be generated:
71 <db_name>.cs219 column-state sequences, one for each MSA/HMM (for prefilter)
72 <db_name>.cs219.sizes number of sequences and characters in <db_name>.cs219
73 <db_name>_a3m_db packed file containing A3M alignments read from <a3m_dir>
74 <db_name>_a3m_db.index index file for packed A3M file
75 <db_name>_a3m.db.index.sizes number of lines in <db_name>_a3m_db.index
76 <db_name>_hhm_db packed file containing HHM-formatted HMMs read from <hhm_dir>
77 <db_name>_hhm_db.index index file for packed HHM file
78 <db_name>_hhm_db.index.sizes number of lines in <db_name>_hhm_db.index
79
80 Options:
81 -o <db_name> create database with this name
82 -a <db_name> append files to database with this name
83 -r <db_name> remove files from database with this name
84 -ia3m <a3m_dir> input directory (or glob of directories) with A3M-formatted files
85 These files MUST have extension 'a3m'.
86 -ihhm <hhm_dir> input directory (or glob of directories) with HHM (or HMMER) files
87 These files MUST have extension 'hhm' (HHsuite) or 'hmm' (HMMER3).
88 -ics <cs_dir> input directory (or glob of directories) with column state sequences
89 -log <logfile> log file recording stderr stream of cstranslate and hhmake commands
90 -csext <ext> extension of column state sequences (default: $csext)
91 -hmm use HMMER-formatted files. These MUST have extension hmm
92 (WARNING! HMMER format results in decreased performance over HHM format)
93 -v [1-3] verbose mode (default: $v)
94 -cpu <int> number of threads to generate cs219 and hhm files (default = $cpu)
95 -f 'file_glob' string with list of glob expression of files to remove
96
97 Example 1: only -ia3m given; cs sequences and hhm files are generated from a3m files
98 perl hhblitsdb.pl -o databases/mydb -ia3m mydb/a3ms/
99
100 Example 2: only -ihhm given; cs sequences are generated from hhm files, but no a3m db file
101 perl hhblitsdb.pl -o databases/mydb -ihhm mydb/hhms/
102
103 Example 3: -ia3m and -ihhm given; cs sequences are generated from a3m files
104 perl hhblitsdb.pl -o databases/mydb -ia3m mydb/a3ms/ -ihhm mydb/hhms/
105
106 Example 4: -ics, -ia3m, and -ihhm given; all db files are created
107 perl hhblitsdb.pl -o databases/mydb -ia3m mydb/a3ms/ -ihhm mydb/hhms/ -ics mydb/cs/
108
109 Example 5: using glob expression to specify files (note the singe quotes)
110 perl hhblitsdb.pl -o databases/mydb -ihhm 'mydbs*/hhms/*.hhm'
111
112 Example 6: add files to database; cs sequences and hhm files are generated from a3m files
113 perl hhblitsdb.pl -a databases/mydb -ia3m 'mydbs/a3ms/g1a*.a3m'
114
115 Example 7: remove files from database
116 perl hhblitsdb.pl -r databases/mydb -f 'mydbs/a3ms/g1a*.* mydbs2/'
117 \n";
118
119
120 ###############################################################################################
121 # Processing command line input
122 ###############################################################################################
123
124 if (@ARGV<1) {die ($help);}
125
126 for (my $i=0; $i<@ARGV; $i++) {
127 if ($ARGV[$i] eq "-ics") {
128 if (++$i<@ARGV) {
129 $csdir=$ARGV[$i];
130 } else {
131 die ("$help\n\nERROR! Missing directory after -ics option!\n");
132 }
133 } elsif ($ARGV[$i] eq "-ia3m") {
134 if (++$i<@ARGV) {
135 $a3mdir=$ARGV[$i];
136 } else {
137 die ("$help\n\nERROR! Missing directory after -ia3m option!\n");
138 }
139 } elsif ($ARGV[$i] eq "-ihhm") {
140 if (++$i<@ARGV) {
141 $hhmdir=$ARGV[$i];
142 } else {
143 die ("$help\n\nERROR! Missing directory after -ihhm option!\n");
144 }
145 } elsif ($ARGV[$i] eq "-log") {
146 if (++$i<@ARGV) {
147 $logfile=$ARGV[$i];
148 unlink $logfile;
149 } else {
150 die ("$help\n\nERROR! Missing filename after -log option!\n");
151 }
152 } elsif ($ARGV[$i] eq "-csext") {
153 if (++$i<@ARGV) {
154 $csext=$ARGV[$i];
155 } else {
156 die ("$help\n\nERROR! Missing extension after -csext option!\n");
157 }
158 } elsif ($ARGV[$i] eq "-hmm") {
159 $hhmext="hmm";
160 print("\nWARNING! HMMER format results in decreased performance over HHM format. We recommend to generate hhm files directly from multiple sequence alignments using hmake.\n");
161 } elsif ($ARGV[$i] eq "-v") {
162 if (++$i<@ARGV) {
163 $v=$ARGV[$i];
164 } else {
165 $v = 2;
166 }
167 } elsif ($ARGV[$i] eq "-cpu") {
168 if (++$i<@ARGV) {
169 $cpu=$ARGV[$i];
170 }
171 } elsif ($ARGV[$i] eq "-f") {
172 if (++$i<@ARGV) {
173 $fileglob=$ARGV[$i];
174 } else {
175 die ("$help\n\nERROR! Missing expression after -f option!\n");
176 }
177 } elsif ($ARGV[$i] eq "-r") {
178 if (++$i<@ARGV) {
179 if ($dbname!="") {die("$help\n\nERROR! options -o and -r not compatible!\n");}
180 $dbname=$ARGV[$i];
181 $remove=1;
182 } else {
183 die ("$help\n\nERROR! Missing filename after -o option!\n");
184 }
185 } elsif ($ARGV[$i] eq "-a") {
186 if (++$i<@ARGV) {
187 if ($remove==1) {die("$help\n\nERROR! options -r and -a not compatible!\n");}
188 if ($dbname!="") {die("$help\n\nERROR! options -o and -a not compatible!\n");}
189 $dbname=$ARGV[$i];
190 $a_if_append="a";
191 } else {
192 die ("$help\n\nERROR! Missing filename after -o option!\n");
193 }
194 } elsif ($ARGV[$i] eq "-o") {
195 if (++$i<@ARGV) {
196 if ($remove==1) {die("$help\n\nERROR! options -r and -o not compatible!\n");}
197 if ($a_if_append) {die("$help\n\nERROR! options -a and -o not compatible!\n");}
198 $dbname=$ARGV[$i];
199 } else {
200 die ("$help\n\nERROR! Missing filename after -o option!\n");
201 }
202 } else {
203 if ($dbname="") {
204 $dbname=$ARGV[$i];
205 } else {
206 print "WARNING! Unknown option $ARGV[$i]!\n";
207 }
208 }
209 }
210
211 # Check input
212 if (!$dbname) {print($help); die("ERROR! Name of database is missing! Use -o <db_name>\n");}
213 $a3mdir=~s/\/$//;
214 $a3mfile = $dbname."_a3m_db";
215 $hhmfile = $dbname."_hhm_db";
216 $csfile = $dbname.".cs219";
217 if ($hhmdir) {$hhmdir=~s/\/$//;}
218
219 if ($a_if_append eq "" && $remove==0) {
220 unlink $csfile, $a3mfile, $a3mfile.".index", $hhmfile, $hhmfile.".index";
221 }
222
223 if ($a3mdir eq "" && $hhmdir eq "" && $csdir eq "" && $remove==0) {
224 print($help); print "ERROR! At least one input directory must be given!\n"; exit(1);
225 }
226
227 # If $csdir is simple directory instead of glob expression, turn it into glob expression
228 if ($csdir) {
229 if ($csdir !~ /\*/ && $csdir !~ /\?/ && $csdir !~ / /) {
230 $csdir .= "/*.".$csext;
231 }
232 }
233 if ($a3mdir) {
234 if ($a3mdir !~ /\*/ && $a3mdir !~ /\?/ && $a3mdir !~ / /) {
235 $a3mdir .= "/*.a3m";
236 }
237 }
238 if ($hhmdir) {
239 if ($hhmdir !~ /\*/ && $hhmdir !~ /\?/ && $hhmdir !~ / /) {
240 $hhmdir .= "/*.".$hhmext;
241 }
242 }
243
244
245 # If in append mode, initialize size counters with present sizes
246 if ($a_if_append || $remove==1) {
247 open (IN, "<$a3mfile.index.sizes") || die("Error: can't open $a3mfile.index.sizes: $!");
248 $line = <IN>;
249 close IN;
250 $line =~ /^(\S*)/;
251 $numa3mfiles = $1;
252
253 open (IN, "<$hhmfile.index.sizes") || die("Error: can't open $hhmfile.index.sizes: $!");
254 $line = <IN>;
255 close IN;
256 $line =~ /^(\S*)/;
257 $numhhmfiles = $1;
258
259 open (IN, "<$csfile.sizes") || die("Error: can't open $csfile.sizes: $!");
260 $line = <IN>;
261 close IN;
262 $line =~ /(\S*)\s+(\S*)/;
263 $numcsfiles = $1;
264 $num_chars = $2;
265
266 printf("Current number of a3m files in db: %i\n",$numa3mfiles);
267 printf("Current number of $hhmext files in db: %i\n",$numhhmfiles);
268 printf("Current number of $csext files in db: %i\n\n",$numcsfiles);
269
270 } else {
271 $numa3mfiles = 0;
272 $numhhmfiles = 0;
273 $numcsfiles = 0;
274 $num_chars = 0;
275 }
276
277 # Create tmp directory (plus path, if necessary)
278 my $tmpdir="/tmp/$ENV{USER}/$$"; # directory where all temporary files are written: /tmp/UID/PID
279 my $suffix=$tmpdir;
280 while ($suffix=~s/^\/[^\/]+//) {
281 $tmpdir=~/(.*)$suffix/;
282 if (!-d $1) {mkdir($1,0777);}
283 }
284 unlink glob("$tmpdir/*"); # clean up directory if it already exists
285 unlink $logfile;
286
287
288
289 ##############################################################################################
290 # Remove files?
291 ##############################################################################################
292 if ($remove==1) {
293
294 printf("Removing files from indices...\n");
295
296 # Read numbers of sequences and characters in csfile
297 open (IN, "<$csfile.sizes");
298 $line = <IN>;
299 close IN;
300 $line =~ /(\S*)\s+(\S*)/;
301 $numcsfiles = $1;
302 $num_chars = $2;
303
304 # Remove names from a3m and hhm index files
305 my $files = " ".join(" ", glob($fileglob));
306 $files =~ s/\S*\///g;
307 &HHPaths::System("ffindex_modify -su $dbname"."_a3m_db.index ".$files);
308 &HHPaths::System("ffindex_modify -su $dbname"."_hhm_db.index ".$files);
309
310 # Adjust number of files in $a3mfile.index.sizes
311 $numa3mfiles = 0;
312 open (IN, "<$a3mfile.index") || die("Error: can't open $a3mfile.index: $!");
313 while(<IN>) {$numa3mfiles++;}
314 close IN;
315 open (OUT, ">$a3mfile.index.sizes") || die("Error: can't open $a3mfile.index.sizes: $!");
316 printf(OUT "%i\n",$numa3mfiles);
317 close(OUT);
318
319 # Adjust number of files in $hhmfile.index.sizes
320 $numhhmfiles = 0;
321 open (IN, "<$hhmfile.index") || die("Error: can't open $hhmfile.index: $!");
322 while(<IN>) {$numhhmfiles++;}
323 close IN;
324 open (OUT, ">$hhmfile.index.sizes") || die("Error: can't open $hhmfile.index.sizes: $!");
325 printf(OUT "%i\n",$numa3mfiles);
326 close(OUT);
327
328
329 # Remove sequences of globbed files from cs file
330 my $skipseq=0;
331 $numcsfiles = 0;
332 $num_chars = 0;
333 open (IN, "<$csfile");
334 open (OUT, ">$csfile".".tmp");
335 foreach my $line (<IN>) {
336 if ($line =~ /^>(\w*)/) {
337 my $name = $1;
338 if ($files =~ / $name\./) { # found name in list of globbed file names?
339 $skipseq=1;
340 } else {
341 $skipseq=0;
342 printf(OUT "%s",$line);
343 $numcsfiles++;
344 }
345 } else {
346 if (!$skipseq) {
347 printf(OUT "%s",$line);
348 $num_chars += length($line);
349 }
350 }
351 }
352 close(OUT);
353 close(IN);
354 unlink($csfile);
355 &HHPaths::System("mv $csfile".".tmp ".$csfile);
356
357 # Adjust $csfile.sizes
358 open (OUT, ">$csfile.sizes");
359 print OUT "$numcsfiles $num_chars\n";
360 close OUT;
361
362 } else {
363
364 ##############################################################################################
365 # Generate new db or append to old
366 ##############################################################################################
367
368
369 ##############################################################################################
370 # Generate column-state database file
371 ##############################################################################################
372
373 # Generate column-state sequences in $tmpdir if no -ics directory given
374 if (!$csdir)
375 {
376 my $x = 0.3; # parameters for cstranslate
377 my $c = 4; # parameters for cstranslate
378
379 if ($a3mdir) {
380 print("Generating seq219 files in $tmpdir/ from a3m files $a3mdir\n\n");
381 $command = "$hhbin/cstranslate -i \$file -o $tmpdir/\$base.seq219 -D $context_lib -A $cs_lib -x $x -c $c 1>>$logfile 2>>$logfile";
382 &HHPaths::System("$hhscripts/multithread.pl '".$a3mdir."' '$command' -cpu $cpu");
383
384 } elsif ($hhmdir) {
385
386 if ($hhmext eq "hmm") {
387 print("\nGenerating prf profile files in $tmpdir/ from hmm files $hhmdir/\n\n");
388 $command = "$hhscripts/create_profile_from_hmmer.pl -i \$file -o $tmpdir/\$base.prf 1>/dev/null 2>>$logfile";
389 &HHPaths::System("$hhscripts/multithread.pl '".$hhmdir."' '$command' -cpu $cpu");
390 } else { # $hhmext eq "hhm"
391 print("\nGenerating prf profile files in $tmpdir/ from hhm files $hhmdir/\n\n");
392 $command = "$hhscripts/create_profile_from_hhm.pl -i \$file -o $tmpdir/\$base.prf 1>/dev/null 2>>$logfile";
393 &HHPaths::System("$hhscripts/multithread.pl '".$hhmdir."' '$command' -cpu $cpu");
394 }
395
396 print("\nGenerating seq219 files in $tmpdir/ from prf files in $tmpdir/\n\n");
397 if ($hhmext eq "hmm") {
398 $command = "$hhbin/cstranslate -i \$file -o \$name.seq219 -A $cs_lib 1>>$logfile 2>>$logfile";
399 } else { # $hhmext eq "hhm"
400 $command = "$hhbin/cstranslate -i \$file -o \$name.seq219 -A $cs_lib -D $context_lib -x $x -c $c 1>>$logfile 2>>$logfile";
401 }
402 &HHPaths::System("$hhscripts/multithread.pl '".$tmpdir."/*.prf' '$command' -cpu $cpu");
403 }
404
405 $csdir = $tmpdir."/*.$csext";
406 }
407
408
409 # Write columns state sequences into cs database file,
410 # replace names in cs sequences with filenames: ">name+description" => ">filename"
411 if ($a_if_append) {
412 open (OUT, ">>$csfile");
413 } else {
414 open (OUT, ">$csfile");
415 }
416 foreach my $seq219file (glob($csdir)) {
417 open (IN, "<$seq219file");
418 my @lines = <IN>;
419 close(IN);
420 $seq219file =~ s/.*?([^\/]*)\.$csext\s*/$1/ or die ("Error: $seq219file does not have the extension $csext!?\n");
421 foreach my $line (@lines) {
422 if ($line =~ /^>/) {
423 $line = ">".$seq219file."\n";
424 $numcsfiles++;
425 } else {
426 $num_chars += length($line);
427 }
428 printf(OUT "%s",$line);
429 }
430 }
431 close(OUT);
432
433 open (OUT, ">$csfile.sizes");
434 print OUT "$numcsfiles $num_chars\n";
435 close OUT;
436
437
438 ##############################################################################################
439 # Generate hhm files with hhmake from a3m files if no -ihhm directory given
440 ##############################################################################################
441
442 if (!$hhmdir)
443 {
444 if ($a3mdir) {
445 print("\nGenerating hhm files in $tmpdir/ from a3m files $a3mdir/\n\n");
446 $command = "hhmake -i \$file -o $tmpdir/\$base.hhm 1>/dev/null 2>>$logfile";
447 &HHPaths::System("$hhscripts/multithread.pl '".$a3mdir."' '$command' -cpu $cpu");
448 $hhmdir = $tmpdir."/*.$hhmext";;
449 $numhhmfiles += scalar(glob("$hhmdir"));
450 }
451 }
452
453
454 ##############################################################################################
455 # Generate packed A3M and HMM files and index files
456 ##############################################################################################
457
458 # Generate packed A3M file and index file?
459 if ($a3mdir ne "") {
460 print "Creating packed A3M database file $a3mfile ...\n";
461
462 open (OUT, ">$tmpdir/a3m.filelist");
463 my @files = glob("$a3mdir");
464 $numa3mfiles += scalar(@files);
465 foreach $file (@files) {
466 print OUT "$file\n";
467 }
468 close OUT;
469
470 # Build packed file (concatenated with '\0' as delimiters) and index file from files in file list
471 # The ffindex binaries are contained in <install_dir>/lib/ffindex/bin/
472 $command = "ffindex_build -".$a_if_append."s -f $tmpdir/a3m.filelist $a3mfile $a3mfile.index";
473 &HHPaths::System($command);
474
475 open (OUT, ">$a3mfile.index.sizes");
476 print OUT "$numa3mfiles\n";
477 close OUT;
478 }
479
480 # Generate packed HHMM file and index file?
481 if ($hhmdir ne "") {
482 print "Creating packed HHM database file $hhmfile ...\n";
483
484 open (OUT, ">$tmpdir/hhm.filelist");
485 my @files = glob("$hhmdir");
486 $numhhmfiles += scalar(@files);
487 foreach $file (@files) {
488 print OUT "$file\n";
489 }
490 close OUT;
491
492 # Build packed file (concatenated with '\0' as delimiters) and index file from files in file list
493 # The ffindex binaries are contained in <install_dir>/lib/ffindex/bin/
494 $command = "ffindex_build -".$a_if_append."s -f $tmpdir/hhm.filelist $hhmfile $hhmfile.index";
495 &HHPaths::System($command);
496
497 open (OUT, ">$hhmfile.index.sizes");
498 print OUT "$numhhmfiles\n";
499 close OUT;
500 }
501 } # end if $remove==0
502
503 print("\n");
504 printf("New number of a3m files in db: %i\n",$numa3mfiles);
505 printf("New number of $hhmext files in db: %i\n",$numhhmfiles);
506 printf("New number of $csext files in db: %i\n\n",$numcsfiles);
507
508 my $err=0;
509 if ($numa3mfiles && $numhhmfiles && $numa3mfiles != $numhhmfiles) {
510 print("**************************************************************************
511 WARNING: Number of a3m files not equal to number of $hhmext files\n"); $err=1;
512 }
513 if ($numcsfiles && $numhhmfiles && $numcsfiles != $numhhmfiles) {
514 print("**************************************************************************
515 WARNING: Number of $csext files not equal to number of $hhmext files\n"); $err=1;
516 }
517 if ($numcsfiles && $numa3mfiles && $numcsfiles != $numa3mfiles) {
518 print("**************************************************************************
519 WARNING: Number of $csext files not equal to number of a3m files\n"); $err=1;
520 }
521
522 if ($err==1) {print("**************************************************************************
523 $tmpdir will not be removed to check for missing files
524 **************************************************************************\n");}
525 elsif ($v<3) {
526 $command = "rm -rf $tmpdir";
527 # &System($command);
528 }
529 wait;
530 exit;
531