annotate external_tools/linux/lib/hh/scripts/hhblitsdb.pl @ 9:afddbcbc8ee8 draft default tip

Uploaded
author hammock
date Mon, 11 Dec 2017 08:10:26 -0500
parents 2277dd59b9f9
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
6
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
1 #!/usr/bin/env perl
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
2 #
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
3 # hhblitsdb.pl
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
4 # Creates HH-suite database files from A3M and HHM/HMMER-formatted files
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
5 # Usage: perl hhblitsdb.pl -o <db_name> [-ia3m <a3m_dir>] [-ihhm <hhm_dir>] [-ics <cs_dir>] [more_options]
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
6 #
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
7 # HHsuite version 2.0.16 (Sept 2012)
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
8 #
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
9 # Reference:
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
10 # Remmert M., Biegert A., Hauser A., and Soding J.
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
11 # HHblits: Lightning-fast iterative protein sequence searching by HMM-HMM alignment.
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
12 # Nat. Methods, epub Dec 25, doi: 10.1038/NMETH.1818 (2011).
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
13
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
14 # (C) Johannes Soeding, 2012
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
15
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
16 # This program is free software: you can redistribute it and/or modify
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
17 # it under the terms of the GNU General Public License as published by
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
18 # the Free Software Foundation, either version 3 of the License, or
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
19 # (at your option) any later version.
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
20
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
21 # This program is distributed in the hope that it will be useful,
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
22 # but WITHOUT ANY WARRANTY; without even the implied warranty of
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
23 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
24 # GNU General Public License for more details.
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
25
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
26 # You should have received a copy of the GNU General Public License
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
27 # along with this program. If not, see <http://www.gnu.org/licenses/>.
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
28
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
29 # We are very grateful for bug reports! Please contact us at soeding@genzentrum.lmu.de
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
30
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
31 use lib $ENV{"HHLIB"}."/scripts";
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
32 use HHPaths; # config file with path variables for nr, blast, psipred, pdb, dssp etc.
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
33 use strict;
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
34 #use File::Glob 'bsd_glob'; # splits patterns delimited by spaces into multiple patterns and applies them using OR
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
35
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
36 $|= 1; # Activate autoflushing on STDOUT
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
37
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
# Built-in defaults; each of them can be overridden from the command line.
our $v = 2;                                   # verbosity level (option -v)
my ($a_if_append, $remove) = ("", 0);         # "" and 0: neither append (-a) nor remove (-r), i.e. create a new db
my ($hhmext, $csext) = ("hhm", "seq219");     # extensions of HMM files (-hmm switches to "hmm") and of cs files (-csext)
my $cpu = 8;                                  # value of option -cpu
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
45
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
# Working variables used throughout the script
my ($line, $command, $file);                          # scratch scalars, undefined until first use
my ($a3mdir, $hhmdir, $csdir)    = ("", "", "");      # input directories/globs given with -ia3m, -ihhm, -ics
my ($a3mfile, $hhmfile, $csfile) = ("", "", "");      # names of the packed A3M file, packed HHM file and cs sequence db file
my $dbname   = "";                                    # output db name
my $logfile  = "/dev/null";                           # log file (option -log)
my $fileglob = "";                                    # glob expression given with -f
my ($numcsfiles, $num_chars, $numa3mfiles, $numhhmfiles) = (0, 0, 0, 0);   # size counters
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
# Usage text. It is printed when the script is called without arguments and is
# prepended to the command-line error messages. The text interpolates $VERSION
# (not declared in this file -- presumably exported by HHPaths, TODO confirm)
# and the defaults $csext, $v and $cpu as they are at this point.
# Fixed: the A3M sizes file is <db_name>_a3m_db.index.sizes (was "_a3m.db"),
# and "singe quotes" -> "single quotes".
my $help="
hhblitsdb.pl from HHsuite $VERSION
Builds HH-suite database from a3m formatted MSAs and/or from HMMs (-o).
MSAs and HMMs can also be added (-a) to or removed (-r) from an existing database.

Usage: hhblitsdb.pl -o|-a|-r <db_name> [-ia3m <a3m_dir>] [-ihhm <hhm_dir>] [-ics <cs_dir>]...

With option -o, the following HH-suite database files can be generated:
<db_name>.cs219 column-state sequences, one for each MSA/HMM (for prefilter)
<db_name>.cs219.sizes number of sequences and characters in <db_name>.cs219
<db_name>_a3m_db packed file containing A3M alignments read from <a3m_dir>
<db_name>_a3m_db.index index file for packed A3M file
<db_name>_a3m_db.index.sizes number of lines in <db_name>_a3m_db.index
<db_name>_hhm_db packed file containing HHM-formatted HMMs read from <hhm_dir>
<db_name>_hhm_db.index index file for packed HHM file
<db_name>_hhm_db.index.sizes number of lines in <db_name>_hhm_db.index

Options:
-o <db_name> create database with this name
-a <db_name> append files to database with this name
-r <db_name> remove files from database with this name
-ia3m <a3m_dir> input directory (or glob of directories) with A3M-formatted files
These files MUST have extension 'a3m'.
-ihhm <hhm_dir> input directory (or glob of directories) with HHM (or HMMER) files
These files MUST have extension 'hhm' (HHsuite) or 'hmm' (HMMER3).
-ics <cs_dir> input directory (or glob of directories) with column state sequences
-log <logfile> log file recording stderr stream of cstranslate and hhmake commands
-csext <ext> extension of column state sequences (default: $csext)
-hmm use HMMER-formatted files. These MUST have extension hmm
(WARNING! HMMER format results in decreased performance over HHM format)
-v [1-3] verbose mode (default: $v)
-cpu <int> number of threads to generate cs219 and hhm files (default = $cpu)
-f 'file_glob' string with list of glob expression of files to remove

Example 1: only -ia3m given; cs sequences and hhm files are generated from a3m files
perl hhblitsdb.pl -o databases/mydb -ia3m mydb/a3ms/

Example 2: only -ihhm given; cs sequences are generated from hhm files, but no a3m db file
perl hhblitsdb.pl -o databases/mydb -ihhm mydb/hhms/

Example 3: -ia3m and -ihhm given; cs sequences are generated from a3m files
perl hhblitsdb.pl -o databases/mydb -ia3m mydb/a3ms/ -ihhm mydb/hhms/

Example 4: -ics, -ia3m, and -ihhm given; all db files are created
perl hhblitsdb.pl -o databases/mydb -ia3m mydb/a3ms/ -ihhm mydb/hhms/ -ics mydb/cs/

Example 5: using glob expression to specify files (note the single quotes)
perl hhblitsdb.pl -o databases/mydb -ihhm 'mydbs*/hhms/*.hhm'

Example 6: add files to database; cs sequences and hhm files are generated from a3m files
perl hhblitsdb.pl -a databases/mydb -ia3m 'mydbs/a3ms/g1a*.a3m'

Example 7: remove files from database
perl hhblitsdb.pl -r databases/mydb -f 'mydbs/a3ms/g1a*.* mydbs2/'
\n";
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
118
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
119
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
120 ###############################################################################################
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
121 # Processing command line input
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
122 ###############################################################################################
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
123
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
# Parse the command line into the option variables declared above.
#
# Options that take a value consume the following argument. If that argument
# is missing the script dies with the help text and an error message; the two
# exceptions are -v (falls back to 2) and -cpu (keeps its current value).
# -o, -a and -r all set $dbname and are mutually exclusive. An argument that
# is not a known option is taken as the database name if none has been given
# yet; otherwise a warning is printed.
#
# Fixes compared to the previous version:
#  * $dbname was compared with the numeric operator != and, in the fallback
#    branch, "tested" with the assignment =, which erased the database name.
#    String operators ne / eq are used now.
#  * the "missing filename" errors of -r and -a named the -o option.
#  * the -hmm warning named the tool "hmake" instead of "hhmake".
if (@ARGV<1) {die ($help);}

{
    my @args = @ARGV;   # work on a copy, @ARGV itself stays untouched

    # Return the value that follows option $opt; die if the command line ends there.
    # $what names the kind of value in the error message.
    my $value_for = sub {
        my ($opt, $what) = @_;
        if (!@args) {
            die ("$help\n\nERROR! Missing $what after $opt option!\n");
        }
        return shift @args;
    };

    while (@args) {
        my $opt = shift @args;

        if ($opt eq "-ics") {
            $csdir = $value_for->("-ics", "directory");
        } elsif ($opt eq "-ia3m") {
            $a3mdir = $value_for->("-ia3m", "directory");
        } elsif ($opt eq "-ihhm") {
            $hhmdir = $value_for->("-ihhm", "directory");
        } elsif ($opt eq "-log") {
            $logfile = $value_for->("-log", "filename");
            unlink $logfile;   # start with a fresh log file
        } elsif ($opt eq "-csext") {
            $csext = $value_for->("-csext", "extension");
        } elsif ($opt eq "-hmm") {
            $hhmext="hmm";
            print("\nWARNING! HMMER format results in decreased performance over HHM format. We recommend to generate hhm files directly from multiple sequence alignments using hhmake.\n");
        } elsif ($opt eq "-v") {
            # a trailing -v without a level selects level 2
            $v = @args ? shift @args : 2;
        } elsif ($opt eq "-cpu") {
            # a trailing -cpu without a number is ignored
            $cpu = shift @args if @args;
        } elsif ($opt eq "-f") {
            $fileglob = $value_for->("-f", "expression");
        } elsif ($opt eq "-r") {
            my $name = $value_for->("-r", "filename");
            if ($a_if_append) {die("$help\n\nERROR! options -a and -r not compatible!\n");}
            if ($dbname ne "") {die("$help\n\nERROR! options -o and -r not compatible!\n");}
            $dbname = $name;
            $remove = 1;
        } elsif ($opt eq "-a") {
            my $name = $value_for->("-a", "filename");
            if ($remove==1)    {die("$help\n\nERROR! options -r and -a not compatible!\n");}
            if ($dbname ne "") {die("$help\n\nERROR! options -o and -a not compatible!\n");}
            $dbname = $name;
            $a_if_append = "a";
        } elsif ($opt eq "-o") {
            my $name = $value_for->("-o", "filename");
            if ($remove==1)    {die("$help\n\nERROR! options -r and -o not compatible!\n");}
            if ($a_if_append)  {die("$help\n\nERROR! options -a and -o not compatible!\n");}
            $dbname = $name;
        } elsif ($dbname eq "") {
            # bare argument: use it as database name
            $dbname = $opt;
        } else {
            print "WARNING! Unknown option $opt!\n";
        }
    }
}
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
210
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
# Validate the parsed options and derive the names of the database files
unless ($dbname) {
    print($help);
    die("ERROR! Name of database is missing! Use -o <db_name>\n");
}

# Strip one trailing slash from the A3M and HHM input paths
$a3mdir =~ s!/$!!;
$hhmdir =~ s!/$!! if $hhmdir;

$a3mfile = "${dbname}_a3m_db";
$hhmfile = "${dbname}_hhm_db";
$csfile  = "${dbname}.cs219";

# Neither appending nor removing: delete the files of an older database with the same name
unless ($a_if_append ne "" || $remove != 0) {
    unlink($csfile, $a3mfile, "$a3mfile.index", $hhmfile, "$hhmfile.index");
}

# Unless files are to be removed, at least one kind of input must have been specified
if ($remove == 0 && "$a3mdir$hhmdir$csdir" eq "") {
    print($help);
    print "ERROR! At least one input directory must be given!\n";
    exit(1);
}
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
226
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
# An input given as a plain directory (no '*', no '?', no blank) is turned into
# the glob expression "<dir>/*.<ext>"; anything else is taken to be a glob
# expression already and is left as it is. Empty inputs are skipped.
foreach my $input ([\$csdir, $csext], [\$a3mdir, "a3m"], [\$hhmdir, $hhmext]) {
    my ($dir_ref, $ext) = @$input;
    next unless $$dir_ref;
    next if $$dir_ref =~ /[*? ]/;
    $$dir_ref .= "/*." . $ext;
}
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
243
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
244
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
# If in append or remove mode, initialize the size counters with the sizes
# recorded for the existing database; otherwise start all counters at 0.
#
# Reads the first line of <a3mfile>.index.sizes, <hhmfile>.index.sizes and
# <csfile>.sizes. Dies if one of them cannot be opened, or if the cs sizes
# line does not consist of two whitespace-separated fields (previously stale
# regex captures were used silently in that case).
if ($a_if_append || $remove==1) {

    # Return the first line of file $path ("" if the file is empty); die if it can't be opened
    my $first_line_of = sub {
        my ($path) = @_;
        open(my $in, "<", $path) || die("Error: can't open $path: $!");
        my $first = <$in>;
        close($in);
        return defined($first) ? $first : "";
    };

    # The first non-blank token of the index sizes files is taken as the number of files
    ($numa3mfiles) = $first_line_of->("$a3mfile.index.sizes") =~ /^(\S*)/;
    ($numhhmfiles) = $first_line_of->("$hhmfile.index.sizes") =~ /^(\S*)/;

    # The cs sizes line holds two fields: number of sequences and number of characters
    $line = $first_line_of->("$csfile.sizes");
    if ($line =~ /(\S*)\s+(\S*)/) {
        ($numcsfiles, $num_chars) = ($1, $2);
    } else {
        die("Error: can't parse $csfile.sizes\n");
    }

    printf("Current number of a3m files in db: %i\n",$numa3mfiles);
    printf("Current number of $hhmext files in db: %i\n",$numhhmfiles);
    printf("Current number of $csext files in db: %i\n\n",$numcsfiles);

} else {
    $numa3mfiles = 0;
    $numhhmfiles = 0;
    $numcsfiles = 0;
    $num_chars = 0;
}
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
276
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
# Create tmp directory (plus path, if necessary): /tmp/<user>/<PID>
#
# <user> is $ENV{USER}; if that is not set, the numeric real user id ($<) is
# used instead. Previously an unset USER gave the path "/tmp//<PID>", for which
# the creation loop stopped after the first component, so the directory was
# never made.
my $tmpdir;        # directory where all temporary files are written
my $suffix = "";   # no longer needed here; kept declared in case code further down
                   # refers to it (NOTE(review): drop it if nothing does)
{
    my $user = (defined($ENV{USER}) && $ENV{USER} ne "") ? $ENV{USER} : $<;
    $tmpdir = "/tmp/$user/$$";

    # Create every missing directory on the way down to $tmpdir
    my $path = "";
    foreach my $component (grep { $_ ne "" } split(/\//, $tmpdir)) {
        $path .= "/$component";
        if (!-d $path) {mkdir($path,0777);}
    }
    if (!-d $tmpdir) {warn("WARNING! Could not create directory $tmpdir: $!\n");}
}
unlink glob("$tmpdir/*"); # clean up directory if it already exists

# Start with a fresh log file. Only plain files are deleted: the default
# log file is /dev/null, and unlinking that as root removes the device node.
unlink $logfile if -f $logfile;
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
286
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
287
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
288
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
289 ##############################################################################################
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
290 # Remove files?
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
291 ##############################################################################################
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
292 if ($remove==1) {
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
293
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
294 printf("Removing files from indices...\n");
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
295
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
296 # Read numbers of sequences and characters in csfile
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
297 open (IN, "<$csfile.sizes");
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
298 $line = <IN>;
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
299 close IN;
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
300 $line =~ /(\S*)\s+(\S*)/;
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
301 $numcsfiles = $1;
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
302 $num_chars = $2;
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
303
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
304 # Remove names from a3m and hhm index files
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
305 my $files = " ".join(" ", glob($fileglob));
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
306 $files =~ s/\S*\///g;
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
307 &HHPaths::System("ffindex_modify -su $dbname"."_a3m_db.index ".$files);
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
308 &HHPaths::System("ffindex_modify -su $dbname"."_hhm_db.index ".$files);
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
309
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
310 # Adjust number of files in $a3mfile.index.sizes
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
311 $numa3mfiles = 0;
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
312 open (IN, "<$a3mfile.index") || die("Error: can't open $a3mfile.index: $!");
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
313 while(<IN>) {$numa3mfiles++;}
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
314 close IN;
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
315 open (OUT, ">$a3mfile.index.sizes") || die("Error: can't open $a3mfile.index.sizes: $!");
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
316 printf(OUT "%i\n",$numa3mfiles);
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
317 close(OUT);
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
318
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
319 # Adjust number of files in $hhmfile.index.sizes
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
320 $numhhmfiles = 0;
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
321 open (IN, "<$hhmfile.index") || die("Error: can't open $hhmfile.index: $!");
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
322 while(<IN>) {$numhhmfiles++;}
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
323 close IN;
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
324 open (OUT, ">$hhmfile.index.sizes") || die("Error: can't open $hhmfile.index.sizes: $!");
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
325 printf(OUT "%i\n",$numa3mfiles);
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
326 close(OUT);
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
327
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
328
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
# Remove sequences of globbed files from cs file.
# $csfile is filtered line by line into "$csfile.tmp", which then replaces $csfile.
# A header line ">name" opens a record; the record is dropped when " name." occurs in
# $files (the list of globbed file names), otherwise it is copied and counted.
my $skipseq=0;     # 1 while the lines of a dropped record are being skipped
$numcsfiles = 0;   # number of records kept
$num_chars = 0;    # summed length of the kept non-header lines (line ends included)

# Abort on any I/O failure: $csfile is unlinked further down, so carrying on with an
# unreadable input or an unwritable temporary file would destroy the cs database.
open (my $cs_in, "<", $csfile) or die("Error: can't open $csfile: $!");
open (my $cs_tmp_out, ">", "$csfile.tmp") or die("Error: can't open $csfile.tmp: $!");

# Read one line at a time instead of loading the whole database into memory
while (my $line = <$cs_in>) {
    if ($line =~ /^>(\w*)/) {
        my $name = $1;
        if ($files =~ / $name\./) { # found name in list of globbed file names?
            $skipseq=1;
        } else {
            $skipseq=0;
            print $cs_tmp_out $line;
            $numcsfiles++;
        }
    } elsif (!$skipseq) {
        print $cs_tmp_out $line;
        $num_chars += length($line);
    }
}
# Buffered write errors only show up at close; check before the original is removed
close($cs_tmp_out) or die("Error: can't write $csfile.tmp: $!");
close($cs_in);
unlink($csfile);
&HHPaths::System("mv $csfile".".tmp ".$csfile);

# Adjust $csfile.sizes
open (my $cs_sizes_out, ">", "$csfile.sizes") or die("Error: can't open $csfile.sizes: $!");
print $cs_sizes_out "$numcsfiles $num_chars\n";
close($cs_sizes_out) or die("Error: can't write $csfile.sizes: $!");
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
361
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
362 } else {
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
363
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
364 ##############################################################################################
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
365 # Generate new db or append to old
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
366 ##############################################################################################
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
367
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
368
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
369 ##############################################################################################
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
370 # Generate column-state database file
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
371 ##############################################################################################
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
372
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
# No -ics directory given: produce the column-state (seq219) sequences in $tmpdir,
# either directly from the a3m files or, via intermediate prf profiles, from the hhm/hmm files.
unless ($csdir) {
    # Values handed to cstranslate through its -x and -c options
    my ($xval, $cval) = (0.3, 4);
    my $xc_opts = "-x $xval -c $cval";

    if ($a3mdir) {
        print("Generating seq219 files in $tmpdir/ from a3m files $a3mdir\n\n");
        $command = "$hhbin/cstranslate -i \$file -o $tmpdir/\$base.seq219 -D $context_lib -A $cs_lib $xc_opts 1>>$logfile 2>>$logfile";
        &HHPaths::System("$hhscripts/multithread.pl '$a3mdir' '$command' -cpu $cpu");
    }
    elsif ($hhmdir) {
        # Any extension other than "hmm" is handled as "hhm"
        my $from_hmmer = ($hhmext eq "hmm");
        my $kind       = $from_hmmer ? "hmm" : "hhm";
        my $converter  = $from_hmmer ? "create_profile_from_hmmer.pl" : "create_profile_from_hhm.pl";

        # Step 1: one prf profile per model file
        print("\nGenerating prf profile files in $tmpdir/ from $kind files $hhmdir/\n\n");
        $command = "$hhscripts/$converter -i \$file -o $tmpdir/\$base.prf 1>/dev/null 2>>$logfile";
        &HHPaths::System("$hhscripts/multithread.pl '$hhmdir' '$command' -cpu $cpu");

        # Step 2: one seq219 file per prf profile; only the hhm route passes -D/-x/-c
        print("\nGenerating seq219 files in $tmpdir/ from prf files in $tmpdir/\n\n");
        my $hhm_only = $from_hmmer ? "" : " -D $context_lib $xc_opts";
        $command = "$hhbin/cstranslate -i \$file -o \$name.seq219 -A $cs_lib$hhm_only 1>>$logfile 2>>$logfile";
        &HHPaths::System("$hhscripts/multithread.pl '$tmpdir/*.prf' '$command' -cpu $cpu");
    }

    # From here on the cs input is the glob over the generated files
    $csdir = "$tmpdir/*.$csext";
}
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
407
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
408
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
# Write columns state sequences into cs database file,
# replace names in cs sequences with filenames: ">name+description" => ">filename".
# $csfile is appended to when $a_if_append is set and overwritten otherwise.
# All opens are checked: an unreadable input or unwritable output would otherwise be
# skipped silently and leave the database and its .sizes file inconsistent.
my $csdb_mode = $a_if_append ? ">>" : ">";
open (my $csdb_out, $csdb_mode, $csfile) or die("Error: can't open $csfile: $!");
foreach my $seq219file (glob($csdir)) {
    open (my $seq_in, "<", $seq219file) or die("Error: can't open $seq219file: $!");
    my @lines = <$seq_in>;
    close($seq_in);
    # Reduce the path to the bare file name without its $csext extension
    $seq219file =~ s/.*?([^\/]*)\.$csext\s*/$1/ or die ("Error: $seq219file does not have the extension $csext!?\n");
    foreach my $line (@lines) {
        if ($line =~ /^>/) {
            # Header line: substitute the file name and count one more sequence
            $line = ">".$seq219file."\n";
            $numcsfiles++;
        } else {
            # Sequence line: its full length (line end included) goes into the size total
            $num_chars += length($line);
        }
        print $csdb_out $line;
    }
}
close($csdb_out) or die("Error: can't write $csfile: $!");

# Record the sequence count and the character total next to the database
open (my $csdb_sizes_out, ">", "$csfile.sizes") or die("Error: can't open $csfile.sizes: $!");
print $csdb_sizes_out "$numcsfiles $num_chars\n";
close($csdb_sizes_out) or die("Error: can't write $csfile.sizes: $!");
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
436
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
437
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
438 ##############################################################################################
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
439 # Generate hhm files with hhmake from a3m files if no -ihhm directory given
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
440 ##############################################################################################
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
441
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
# No -ihhm directory given: build the hhm files in $tmpdir from the a3m files with hhmake
# and point $hhmdir at the glob over the generated files.
if (!$hhmdir)
{
    if ($a3mdir) {
        print("\nGenerating hhm files in $tmpdir/ from a3m files $a3mdir/\n\n");
        $command = "hhmake -i \$file -o $tmpdir/\$base.hhm 1>/dev/null 2>>$logfile";
        &HHPaths::System("$hhscripts/multithread.pl '".$a3mdir."' '$command' -cpu $cpu");
        $hhmdir = $tmpdir."/*.$hhmext";
        # $numhhmfiles is deliberately NOT updated here. The former
        # "$numhhmfiles += scalar(glob(...))" evaluated glob in scalar context, which yields
        # the first matching file name rather than a count, and the packed-HHM step of this
        # script already adds scalar(@files) for this very glob, so a real count here would
        # have been added twice.
    }
}
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
452
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
453
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
454 ##############################################################################################
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
455 # Generate packed A3M and HMM files and index files
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
456 ##############################################################################################
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
457
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
# Generate packed A3M file and index file?
# Steps: write the list of a3m files to $tmpdir/a3m.filelist, let ffindex_build pack them
# into $a3mfile / $a3mfile.index, then store the running file count in $a3mfile.index.sizes.
if ($a3mdir ne "") {
    print "Creating packed A3M database file $a3mfile ...\n";

    my @files = glob("$a3mdir");
    $numa3mfiles += scalar(@files);

    # A file list that cannot be written must stop the run; ffindex_build would otherwise
    # work from a missing or stale list.
    open (my $a3m_list_out, ">", "$tmpdir/a3m.filelist") or die("Error: can't open $tmpdir/a3m.filelist: $!");
    foreach my $path (@files) {
        print $a3m_list_out "$path\n";
    }
    close($a3m_list_out) or die("Error: can't write $tmpdir/a3m.filelist: $!");

    # Build packed file (concatenated with '\0' as delimiters) and index file from files in file list
    # The ffindex binaries are contained in <install_dir>/lib/ffindex/bin/
    $command = "ffindex_build -".$a_if_append."s -f $tmpdir/a3m.filelist $a3mfile $a3mfile.index";
    &HHPaths::System($command);

    open (my $a3m_sizes_out, ">", "$a3mfile.index.sizes") or die("Error: can't open $a3mfile.index.sizes: $!");
    print $a3m_sizes_out "$numa3mfiles\n";
    close($a3m_sizes_out) or die("Error: can't write $a3mfile.index.sizes: $!");
}
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
479
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
# Generate packed HHMM file and index file?
# Steps: write the list of model files to $tmpdir/hhm.filelist, let ffindex_build pack them
# into $hhmfile / $hhmfile.index, then store the running file count in $hhmfile.index.sizes.
if ($hhmdir ne "") {
    print "Creating packed HHM database file $hhmfile ...\n";

    my @files = glob("$hhmdir");
    $numhhmfiles += scalar(@files);

    # A file list that cannot be written must stop the run; ffindex_build would otherwise
    # work from a missing or stale list.
    open (my $hhm_list_out, ">", "$tmpdir/hhm.filelist") or die("Error: can't open $tmpdir/hhm.filelist: $!");
    foreach my $path (@files) {
        print $hhm_list_out "$path\n";
    }
    close($hhm_list_out) or die("Error: can't write $tmpdir/hhm.filelist: $!");

    # Build packed file (concatenated with '\0' as delimiters) and index file from files in file list
    # The ffindex binaries are contained in <install_dir>/lib/ffindex/bin/
    $command = "ffindex_build -".$a_if_append."s -f $tmpdir/hhm.filelist $hhmfile $hhmfile.index";
    &HHPaths::System($command);

    open (my $hhm_sizes_out, ">", "$hhmfile.index.sizes") or die("Error: can't open $hhmfile.index.sizes: $!");
    print $hhm_sizes_out "$numhhmfiles\n";
    close($hhm_sizes_out) or die("Error: can't write $hhmfile.index.sizes: $!");
}
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
501 } # end if $remove==0
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
502
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
# Final report: print the three file counts, warn about every pair of non-zero counts that
# disagree, and finish the script.
print("\n");
printf("New number of a3m files in db: %i\n",$numa3mfiles);
printf("New number of $hhmext files in db: %i\n",$numhhmfiles);
printf("New number of $csext files in db: %i\n\n",$numcsfiles);

my $err=0;
my $stars = "**************************************************************************";

# Each entry: first count, second count, label of first, label of second.
# A count of zero means that file type is absent, so the pair is not compared.
foreach my $check ([$numa3mfiles, $numhhmfiles, "a3m",  $hhmext],
                   [$numcsfiles,  $numhhmfiles, $csext, $hhmext],
                   [$numcsfiles,  $numa3mfiles, $csext, "a3m"]) {
    my ($n_first, $n_second, $first, $second) = @$check;
    next unless $n_first && $n_second && $n_first != $n_second;
    print("$stars\nWARNING: Number of $first files not equal to number of $second files\n");
    $err=1;
}

if ($err) {
    print("$stars\n$tmpdir will not be removed to check for missing files\n$stars\n");
}
elsif ($v<3) {
    # Only the removal command is prepared; the call that would run it is disabled,
    # so $tmpdir stays on disk in this case as well.
    $command = "rm -rf $tmpdir";
#    &System($command);
}
wait;
exit;
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
531