Mercurial > repos > greg > plant_tribes_gene_family_scaffold_updater
comparison gene_family_scaffold_updater.pl @ 0:2b0906489073 draft default tip
Uploaded
author | greg |
---|---|
date | Tue, 21 Aug 2018 13:00:21 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:2b0906489073 |
---|---|
1 #!/usr/bin/env perl | |
2 # Author: Eric Wafula | |
3 # Email: ekw10@psu.edu | |
4 # Institution: Penn State University, Biology Dept, Claude dePamphilis Lab | |
5 # Date: June 2018 | |
6 | |
7 use strict; | |
8 use warnings; | |
9 use File::Spec; | |
10 use File::Basename; | |
11 use Getopt::Long qw(:config no_ignore_case); | |
12 use FindBin; | |
13 use DBI; | |
14 | |
15 my $home = "$FindBin::Bin/.."; | |
16 | |
17 my $usage = <<__EOUSAGE__; | |
18 | |
19 # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # | |
20 # | |
21 # GENE FAMILY SCAFFOLD UPDATER | |
22 # | |
23 # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # | |
24 # Required Options: | |
25 # | |
26 # | |
27 # --database_connection_string <string> : Postgres database connection string using format | |
28 # postgresql://<user>:<password>@<host>/<database name> | |
29 # | |
30 # --proteins <string> : Amino acids (proteins) sequences fasta file (proteins.fasta) | |
31 # This can either be an absolute path or just the file name | |
32 # | |
33 # --coding_sequences <string> : Corresponding coding sequences (CDS) fasta file (cds.fasta) | |
34 # | |
35 # --scaffold <string> : Orthogroups or gene families proteins scaffold. This can either be an absolute | |
36 # path to the directory containing the scaffolds (e.g., /home/scaffolds/22Gv1.1) | |
37 # or just the scaffold (e.g., 22Gv1.1). If the latter, ~home/data is prepended to | |
38 # the scaffold to create the absolute path. | |
39 # the scaffold to create the absolute path. | |
40 # If Monocots clusters (version 1.0): 12Gv1.0 | |
41 # If Angiosperms clusters (version 1.0): 22Gv1.0 | |
42 # If Angiosperms clusters (version 1.1): 22Gv1.1 | |
43 # If Green plants clusters (version 1.0): 31Gv1.0 | |
44 # If Other non PlantTribes clusters: XGvY.Z, where "X" is the number species in the scaffold, | |
45 # and "Y.Z" version number such as 12Gv1.0. Please look at one of the PlantTribes scaffold | |
46 # data on how data files and directories are named, formated, and organized. | |
47 # | |
48 # | |
49 # --species_name <string> : Name of the species | |
50 # | |
51 # --species_code <string> : Code of the species | |
52 # | |
53 # --species_family <string> : Family of the species | |
54 # | |
55 # --species_order <string> : Order of the species | |
56 # | |
57 # --species_group <string> : Group of the species | |
58 # | |
59 # --species_clade <string> : Clade of the species | |
60 # | |
61 # --rooting_order_species_code <string> : Species code after which the new species will be placed in the rooting order config file | |
62 # | |
63 # # # # # # # # # # # # # # # # # # | |
64 # Others Options: | |
65 # | |
66 # --num_threads <int> : number of threads (CPUs) to used for HMMScan, BLASTP, and MAFFT | |
67 # Default: 1 | |
68 # | |
69 # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # | |
70 # Example Usage: | |
71 # | |
72 # GeneFamilyScaffoldUpdater --database_connection_string postgresql://<user>:<password>@<host>/<database name> | |
73 --proteins proteins.fasta --coding_sequences cds.fasta --scaffold 22Gv1.1 | |
74 # --species_name Fake genome --species_family Brassicaceae --species_order Brassicales | |
75 --species_group Rosids --species_clade Core Eudicots --rooting_order_species_code Phypa | |
76 # | |
77 # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # | |
78 | |
79 __EOUSAGE__ | |
80 ; | |
81 | |
82 # Declare and initialize variables; | |
83 my $database_connection_string; | |
84 my $proteins; | |
85 my $species_name; | |
86 my $species_code; | |
87 my $species_family; | |
88 my $species_order; | |
89 my $species_group; | |
90 my $species_clade; | |
91 my $coding_sequences; | |
92 my $scaffold; | |
93 my $rooting_order_species_code; | |
94 my $num_threads; | |
95 | |
96 my $options = GetOptions ( 'database_connection_string=s' => \$database_connection_string, | |
97 'proteins=s' => \$proteins, | |
98 'species_name=s' => \$species_name, | |
99 'species_code=s' => \$species_code, | |
100 'species_family=s' => \$species_family, | |
101 'species_order=s' => \$species_order, | |
102 'species_group=s' => \$species_group, | |
103 'species_clade=s' => \$species_clade, | |
104 'coding_sequences=s' => \$coding_sequences, | |
105 'scaffold=s' => \$scaffold, | |
106 'rooting_order_species_code=s' => \$rooting_order_species_code, | |
107 'num_threads=i' => \$num_threads, | |
108 ); | |
109 | |
110 # # # # # # # # # # # # # # # # # # # # # # # # # validate options and set variables # # # # # # # # # # # # # # # # # # # # # # # # # # | |
111 # check if options are set | |
112 unless ( $options ) { die $usage; } | |
113 unless ( $database_connection_string and $proteins and $species_name and $species_code and $species_family and $species_order and $species_group and $species_clade and $coding_sequences and $scaffold and $rooting_order_species_code ) { | |
114 print "\nOne or more required options not set\n"; die $usage; } | |
115 # get scaffold directory | |
116 my $scaffold_dir; | |
117 if (File::Spec->file_name_is_absolute($scaffold)) { | |
118 $scaffold_dir = $scaffold; | |
119 $scaffold = basename($scaffold); | |
120 } else { | |
121 if ($scaffold) { $scaffold_dir = "$home/data/$scaffold"; } | |
122 else { print "\n --scaffold option is not set\n\n"; die $usage; } | |
123 } | |
124 | |
125 # validate scaffold and update type options | |
126 if ( $scaffold !~ /^\d+Gv\d+\.\d+$/) { | |
127 print "\nOrthogroups or gene families proteins scaffold name $scaffold is not in the required format"; | |
128 print " i.e. XGvY.Z, where X is number species in the scaffold, and Y.Z version number such as 12Gv1.0.\n"; | |
129 die $usage; | |
130 } | |
131 | |
132 # Find out if the received rooting_order_species_code is in | |
133 # the rooting order configuration file for the scaffold. We | |
134 # do this before anything else since it is the least resource | |
135 # intensive and an invalid species code will force an error. | |
136 validate_rooting_order_species_code($scaffold_dir, $rooting_order_species_code); | |
137 | |
138 # Get a database connection. | |
139 my $dbh = get_database_connection($database_connection_string); | |
140 | |
141 # The gene_family_scaffold_loader tool must be executed before | |
142 # this tool so that the information about the scaffold is | |
143 # avaialble in the galaxy_plant_tribes database. Check to make | |
144 # sure the scaffold has been loaded into the databse before | |
145 # continuing with the update. | |
146 validate_scaffold($dbh, $scaffold); | |
147 | |
148 # get scaffold clustering methods | |
149 my %methods; | |
150 my $annotation_dir = "$scaffold_dir/annot"; | |
151 opendir (DIR, "$annotation_dir") or die "Can't open $annotation_dir directory\n"; | |
152 while (my $filename = readdir(DIR)) { | |
153 if ($filename =~ /(\w+)\.list$/) { $methods{$1} = $1; } | |
154 } | |
155 | |
156 # set defaults | |
157 if (!$num_threads) { $num_threads = 1; } | |
158 | |
159 # # # # # # # # # # # # # # # # # # # # # # # # # # # # # sub-routine calls # # # # # # # # # # # # # # # # # # # # # # # # # # # | |
160 | |
161 log_msg("Starting gene family scaffold updating."); | |
162 | |
163 # Create working directory. | |
164 my $working_dir = "./geneFamilyScaffoldUpdate_dir"; | |
165 if (-d $working_dir) { | |
166 die "Exiting...!\nGene family scaffold update output directory ($working_dir) already exists!\n\n"; | |
167 } | |
168 make_directory($working_dir); | |
169 | |
170 # Copy original scaffold data to a working directory. | |
171 log_msg("Copying original scaffold data to working directory."); | |
172 my $copy_scaffold_data = system "cp -r $scaffold_dir $working_dir"; | |
173 if ($copy_scaffold_data != 0) { | |
174 stop_err("Copying original scaffold data to working directory failed."); | |
175 } | |
176 | |
177 # Update the scaffold config files in the working directory with the new genome. | |
178 update_config_files ( $scaffold, $rooting_order_species_code, $species_name, $species_code, $species_family, $species_order, $species_group, $species_clade, $working_dir ); | |
179 | |
180 # Update the scaffold files in the working directory with the new genome. | |
181 foreach my $method (keys %methods) { | |
182 sort_sequences ( $proteins, $coding_sequences, $scaffold, $method, $num_threads, $species_name, $working_dir, $scaffold_dir ); | |
183 } | |
184 | |
185 # Move updated scaffold data to original directory. | |
186 my $updated_scaffold_dir = "$working_dir/$scaffold"; | |
187 log_msg("Removing original scaffold data directory $scaffold_dir."); | |
188 my $remove_scaffold_data = system "rm -rf $scaffold_dir"; | |
189 if ($remove_scaffold_data != 0) { | |
190 stop_err("Removing original scaffold data directory failed."); | |
191 } | |
192 log_msg("Moving updated scaffold data from\n$updated_scaffold_dir\nto original directory\n$scaffold_dir."); | |
193 my $move_scaffold_data = system "mv $updated_scaffold_dir $scaffold_dir"; | |
194 if ($move_scaffold_data != 0) { | |
195 stop_err("Moving updated scaffold data to original directory failed."); | |
196 } | |
197 | |
198 # Update the database tables with the new genome. | |
199 update_database_tables ( $dbh, $proteins, $scaffold, \%methods, $species_name, $species_family, $species_order, $species_group, $species_clade, $scaffold_dir, $working_dir ); | |
200 | |
201 log_msg("Completed gene family scaffold updating."); | |
202 | |
203 exit(0); | |
204 | |
205 # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # sub-routines # # # # # # # # # # # # # # # # # # # # # # # # # # # | |
206 | |
207 sub log_msg { | |
208 my ($msg) = @_; | |
209 print localtime()." - ".$msg."\n\n"; | |
210 } | |
211 | |
212 sub stop_err { | |
213 my ($error_msg) = @_; | |
214 print "\n-- ".localtime()." - ".$error_msg."\n\n"; | |
215 exit(1); | |
216 } | |
217 | |
218 sub validate_rooting_order_species_code { | |
219 my ($scaffold_dir, $rooting_order_species_code ) = @_; | |
220 my $rooting_order_config = "$scaffold_dir/$scaffold.rootingOrder.config"; | |
221 open (IN, $rooting_order_config) or die "Can't open $rooting_order_config\n"; | |
222 while (<IN>) { | |
223 chomp; | |
224 if (/^#/ || /^$/ || /^\[/) { | |
225 # Skip comments, blasnk lines and section headers. | |
226 next; | |
227 } | |
228 # Example line: Physcomitrella patens=Phypa | |
229 my @F = split(/=/, $_); | |
230 my $rooting_order_species_code_cmp = $F[1]; | |
231 if (defined($rooting_order_species_code_cmp) && $rooting_order_species_code_cmp eq $rooting_order_species_code) { | |
232 return; | |
233 } | |
234 } | |
235 stop_err("Invalid rooting order species code $rooting_order_species_code is not found in $rooting_order_config"); | |
236 } | |
237 | |
238 sub validate_scaffold { | |
239 my ($dbh, $scaffold) = @_; | |
240 my ($stmt, $sth, $rv); | |
241 $stmt = qq(SELECT id FROM plant_tribes_scaffold WHERE scaffold_id = '$scaffold';); | |
242 $sth = $dbh->prepare( $stmt ); | |
243 $rv = $sth->execute() or die $DBI::errstr; | |
244 if ($rv < 0) { print $DBI::errstr; } | |
245 if ($sth->rows > 0) { | |
246 return; | |
247 } | |
248 stop_err("The scaffold $scaffold has not been loaded into the database - use the GeneFamilyScaffoldLoader tool to load the scaffold before attempting to update the scaffold with this tool."); | |
249 } | |
250 | |
251 sub make_directory { | |
252 my ( $new_dir ) = @_; | |
253 if (!-d $new_dir) { | |
254 mkdir($new_dir, 0755); | |
255 } | |
256 } | |
257 | |
258 sub get_database_connection { | |
259 my ($database_connection_string) = @_; | |
260 # Database connection and variables, the format of database_connection_string is | |
261 # postgresql://<user>:<password>@<host>/<database name> | |
262 my @conn_part = split(/:\/\//, $database_connection_string); | |
263 my $conn_part_str = $conn_part[1]; | |
264 my $driver = "Pg"; | |
265 my @conn_part2 = split(/\//, $conn_part_str); | |
266 my $database = $conn_part2[1]; | |
267 my $dsn = "DBI:$driver:dbname = $database;host = 127.0.0.1;port = 5432"; | |
268 @conn_part2 = split(/:/, $conn_part_str); | |
269 my $userid = $conn_part2[0]; | |
270 @conn_part2 = split(/@/, $conn_part_str); | |
271 my $conn_part2_str = $conn_part2[0]; | |
272 my @conn_part3 = split(/:/, $conn_part2_str); | |
273 my $password = $conn_part3[1]; | |
274 my $dbh = DBI->connect($dsn, $userid, $password, { RaiseError => 1 }) or die "$DBI::errstr\nError : Unable to open database galaxy_plant_tribes\n"; | |
275 log_msg("Successfully connected to database $database."); | |
276 return $dbh; | |
277 } | |
278 | |
279 sub update_config_files { | |
280 my ( $scaffold, $rooting_order_species_code, $species_name, $species_code, $species_family, $species_order, $species_group, $species_clade, $working_dir ) = @_; | |
281 log_msg("Updating scaffold config files in working directory."); | |
282 # Update the rootingOrder config file. | |
283 my $rooting_order_config = "$working_dir/$scaffold/$scaffold.rootingOrder.config"; | |
284 my $tmp_rooting_order_config = "$working_dir/$scaffold/$scaffold.rootingOrder.config.tmp"; | |
285 open (IN, $rooting_order_config) or die "Can't open $rooting_order_config\n"; | |
286 open (OUT, ">$tmp_rooting_order_config") or die "Can't open $tmp_rooting_order_config file\n"; | |
287 my $inserted = 0; | |
288 while (<IN>) { | |
289 # Example line: Physcomitrella patens=Phypa | |
290 chomp; | |
291 print OUT "$_\n"; | |
292 if (not $inserted) { | |
293 if (not /^#/ && not /^$/ && not /^\[/) { | |
294 my @F=split(/=/, $_); | |
295 my $cmp_species_code = $F[1]; | |
296 if (defined($cmp_species_code) && $cmp_species_code eq $rooting_order_species_code) { | |
297 print OUT "$species_name=$species_code\n"; | |
298 $inserted = 1; | |
299 } | |
300 } | |
301 } | |
302 } | |
303 close OUT; | |
304 close IN; | |
305 my $update_rooting_order = system "mv $tmp_rooting_order_config $rooting_order_config >/dev/null"; | |
306 if ($update_rooting_order != 0) { | |
307 stop_err("Updating rooting order config in working directory failed."); | |
308 } | |
309 # Update the taxaLineage config file. | |
310 my $taxa_lineage_config = "$working_dir/$scaffold/$scaffold.taxaLineage.config"; | |
311 # Make sure the last line of the file ends with a newline character | |
312 # by rewriting the entire file. | |
313 my $tmp_taxa_lineage_config = "$working_dir/$scaffold/$scaffold.taxaLineage.config.tmp"; | |
314 open (IN, $taxa_lineage_config) or die "Can't open $taxa_lineage_config\n"; | |
315 open (OUT, ">$tmp_taxa_lineage_config") or die "Can't open $tmp_taxa_lineage_config file\n"; | |
316 while (<IN>) { | |
317 chomp; | |
318 print OUT "$_\n"; | |
319 } | |
320 close OUT; | |
321 close IN; | |
322 my $update_taxa = system "mv $tmp_taxa_lineage_config $taxa_lineage_config >/dev/null"; | |
323 if ($update_taxa != 0) { | |
324 stop_err("Updating taxa lineage config in working directory failed."); | |
325 } | |
326 # Append the new species information to the file. | |
327 open (OUT, ">>$taxa_lineage_config") or die "Can't open $taxa_lineage_config file\n"; | |
328 print OUT "$species_name\t$species_family\t$species_order\t$species_group\t$species_clade\n"; | |
329 close OUT; | |
330 } | |
331 | |
332 sub sort_sequences { | |
333 my ( $proteins, $coding_sequences, $scaffold, $method, $num_threads, $species_name, $working_dir, $scaffold_dir ) = @_; | |
334 my $method_dir = "$working_dir/$method"; | |
335 make_directory($method_dir); | |
336 log_msg("Sorting working directory protein sequences in the $method clustering method."); | |
337 log_msg("Running BLASTP."); | |
338 my $blastp_call = system "blastp -outfmt 6 -evalue 1e-5 -num_threads $num_threads -query $proteins -db $scaffold_dir/db/blast/$method -out $method_dir/proteins.blastp >/dev/null"; | |
339 if ($blastp_call != 0) { | |
340 stop_err("Running BLASTP failed."); | |
341 } | |
342 log_msg("Getting best BLASTP hits."); | |
343 my $blast_results = "proteins.blastp"; | |
344 get_best_blastp_orthos ( $blast_results, $scaffold, $method, $method_dir, $scaffold_dir ); | |
345 log_msg("Running HMMScan."); | |
346 my $hmmscan_call = system "hmmscan -E 1e-5 --cpu $num_threads --noali --tblout $method_dir/proteins.hmmscan -o $method_dir/hmmscan.log $scaffold_dir/db/hmm/$method $proteins >/dev/null"; | |
347 if ($hmmscan_call != 0) { | |
348 stop_err("Running HMMScan failed."); | |
349 } | |
350 log_msg("Getting best HMMScan hits."); | |
351 my $hmmscan_results = "proteins.hmmscan"; | |
352 get_best_hmmscan_orthos ( $hmmscan_results, $scaffold, $method, $method_dir, $scaffold_dir ); | |
353 get_blast_hmmscan_orthos ( $scaffold, $method, $method_dir, $scaffold_dir ); | |
354 get_orthogroup_fasta ( $proteins, $coding_sequences, $method, $method_dir, $scaffold_dir ); | |
355 update_scaffold_data( $proteins, $scaffold, $method, $num_threads, $method_dir, $species_name, $working_dir, $scaffold_dir ); | |
356 } | |
357 | |
358 sub get_best_blastp_orthos { | |
359 my ( $blast_results, $scaffold, $method, $method_dir, $scaffold_dir ) = @_; | |
360 my (%best, %max, %list); | |
361 open (IN, "$method_dir/$blast_results") or die "Can't open $method_dir/$blast_results file\n"; | |
362 while (<IN>) { | |
363 chomp; | |
364 my @F=split(/\t/, $_); | |
365 if ($F[0] eq $F[1]) { next; } | |
366 if (!$best{$F[0]}) { | |
367 $best{$F[0]} = $_; | |
368 $max{$F[0]} = $F[11]; | |
369 } | |
370 else { | |
371 if ($F[11] > $max{$F[0]}) { | |
372 $best{$F[0]} = $_; | |
373 $max{$F[0]} = $F[11]; | |
374 } | |
375 } | |
376 } | |
377 close IN; | |
378 open (IN, "$scaffold_dir/annot/$method.list") or die "Can't open $scaffold_dir/annot/$method.list file\n"; | |
379 while (<IN>) { | |
380 chomp; | |
381 my @F=split(/\t/, $_); | |
382 $list{$F[1]} = $F[0]; | |
383 } | |
384 close IN; | |
385 open (OUT, ">$method_dir/$blast_results.bestOrthos") or die "Can't open $method_dir/$blast_results.bestOrthos file\n"; | |
386 print OUT "Gene ID\tOrthogroup ID\n"; | |
387 foreach (keys %best) { | |
388 my @F = split(/\t/, $best{$_}); | |
389 print OUT "$F[0]\t$list{$F[1]}\n"; | |
390 } | |
391 close OUT; | |
392 } | |
393 | |
394 sub get_best_hmmscan_orthos { | |
395 my ( $hmmscan_results, $scaffold, $method, $method_dir, $scaffold_dir ) = @_; | |
396 my %hits; | |
397 open (IN, "$method_dir/$hmmscan_results") or die "Can't open $method_dir/$hmmscan_results file\n"; | |
398 while (<IN>) { | |
399 if (/^#/){next;} | |
400 my @F = split(/\s+/, $_); | |
401 $hits{$F[2]}{$F[0]} = $F[5]; | |
402 } | |
403 close IN; | |
404 open (OUT, ">$method_dir/$hmmscan_results.bestOrthos") or die "Can't open $method_dir/$hmmscan_results.bestOrthos file\n"; | |
405 print OUT "Gene ID\tOrthogroup ID\n"; | |
406 for my $hit (keys %hits) { | |
407 my $score = 0; | |
408 my $best_target; | |
409 for my $target (keys %{$hits{$hit}}) { | |
410 if ($hits{$hit}{$target} >= $score) { | |
411 $score = $hits{$hit}{$target}; | |
412 $best_target = $target; | |
413 } | |
414 } | |
415 print OUT "$hit\t$best_target\n"; | |
416 } | |
417 close OUT; | |
418 } | |
419 | |
420 sub get_blast_hmmscan_orthos { | |
421 my ( $scaffold, $method, $method_dir, $scaffold_dir ) = @_; | |
422 my (%blastp, %hmmscan, %genes); | |
423 opendir (DIR, "$method_dir") or die "Can't open $method_dir directory\n"; | |
424 while (my $filename = readdir(DIR)) { | |
425 if ($filename =~ /^proteins\.blastp\.bestOrthos$/){ | |
426 open (IN, "$method_dir/$filename") or die "Can't open $method_dir/$filename file\n"; | |
427 while (<IN>) { | |
428 chomp; | |
429 if (/^Gene/) {next;} | |
430 my @F = split(/\t/, $_); | |
431 $blastp{$F[0]} = $F[1]; | |
432 $genes{$F[0]} = $F[0]; | |
433 } | |
434 close IN; | |
435 } | |
436 if ($filename =~ /^proteins\.hmmscan\.bestOrthos$/){ | |
437 open (IN, "$method_dir/$filename") or die "Can't open $method_dir/$filename file\n"; | |
438 while (<IN>) { | |
439 chomp; | |
440 if (/^Gene/) {next;} | |
441 my @F = split(/\t/, $_); | |
442 $hmmscan{$F[0]} = $F[1]; | |
443 $genes{$F[0]} = $F[0]; | |
444 } | |
445 close IN; | |
446 } | |
447 } | |
448 closedir DIR; | |
449 open (OUT, ">$method_dir/proteins.both.bestOrthos") or die "Can't open $method_dir/protein.both.bestOrthos file\n"; | |
450 print OUT "Gene ID\tOrthogroup ID\n"; | |
451 foreach (sort keys %genes) { | |
452 if (!$blastp{$_} and $hmmscan{$_}) { print OUT "$_\t$hmmscan{$_}\n"; next; } | |
453 elsif ($blastp{$_} and !$hmmscan{$_}) { print OUT "$_\t$blastp{$_}\n"; next; } | |
454 elsif ($blastp{$_} == $hmmscan{$_}) { print OUT "$_\t$blastp{$_}\n"; next } | |
455 else { print OUT "$_\t$hmmscan{$_}\n"; } | |
456 } | |
457 close OUT; | |
458 } | |
459 | |
460 sub get_orthogroup_fasta { | |
461 my ( $proteins, $coding_sequences, $method, $method_dir, $scaffold_dir ) = @_; | |
462 log_msg("Retrieving orthogroup fasta files."); | |
463 my (%orthos, %pep, %cds); | |
464 my $orthogroup_fasta = "$method_dir/orthogroups_fasta"; | |
465 make_directory($orthogroup_fasta); | |
466 my $orthogroup_assignment = "proteins.both.bestOrthos"; | |
467 open (IN, "$method_dir/$orthogroup_assignment") or die "Can't open $method_dir/$orthogroup_assignment file\n"; | |
468 while (<IN>) { | |
469 chomp; | |
470 if ($_ =~ /^Gene/) { next; } | |
471 my @F = split(/\t/, $_); | |
472 $orthos{$F[1]}{$F[0]} = $F[0]; | |
473 } | |
474 close IN; | |
475 %pep = get_sequences ($proteins); | |
476 if ($coding_sequences) { %cds = get_sequences ($coding_sequences); } | |
477 my ($ortho_id, $seq_id); | |
478 foreach $ortho_id (keys %orthos) { | |
479 open (PEP, ">$orthogroup_fasta/$ortho_id.faa") or die "Can't open $orthogroup_fasta/$ortho_id.faa file\n"; | |
480 if ($coding_sequences) { open (CDS, ">$orthogroup_fasta/$ortho_id.fna") or die "Can't open $orthogroup_fasta/$ortho_id.fna file\n"; } | |
481 foreach $seq_id (sort keys %{$orthos{$ortho_id}}) { | |
482 $pep{$seq_id} =~ s/.{80}(?=.)/$&\n/g; | |
483 print PEP ">$seq_id\n$pep{$seq_id}\n"; | |
484 if ($coding_sequences) { | |
485 $cds{$seq_id} =~ s/.{80}(?=.)/$&\n/g; | |
486 print CDS ">$seq_id\n$cds{$seq_id}\n"; | |
487 } | |
488 } | |
489 close PEP; | |
490 close CDS; | |
491 } | |
492 my @files; | |
493 my $formated_fasta = "$orthogroup_fasta/formated_fasta"; | |
494 make_directory($formated_fasta); | |
495 opendir(DIR, "$orthogroup_fasta") or die "Can't open $orthogroup_fasta directory\n"; | |
496 while(my $filename = readdir(DIR)) { | |
497 if($filename =~ /\d+\.fna/ or $filename =~ /\d+\.faa/){ | |
498 push (@files, $filename); | |
499 } | |
500 } | |
501 closedir(DIR); | |
502 foreach my $file (@files) { | |
503 open (IN, "$orthogroup_fasta/$file") or die "Can't open $orthogroup_fasta/$file file\n"; | |
504 open (OUT, ">$formated_fasta/$file") or die "Can't open $formated_fasta/$file file\n"; | |
505 while (<IN>) { | |
506 chomp; | |
507 if(/^>/){ s/\|/_/g; print OUT "$_\n"; } | |
508 else { print OUT "$_\n"; } | |
509 } | |
510 close IN; | |
511 close OUT; | |
512 } | |
513 my $integrated_orthogroup_fasta = "$method_dir/integrated_orthogroup_fasta"; | |
514 make_directory($integrated_orthogroup_fasta); | |
515 integrate_orthogroup_fasta ( $formated_fasta, $method, $integrated_orthogroup_fasta, $scaffold_dir ); | |
516 } | |
517 | |
518 sub get_sequences { | |
519 my ( $file ) = @_; | |
520 my (%sequences, $id); | |
521 open (IN, "$file") or die "Can't open $file file\n"; | |
522 while (<IN>) { | |
523 if ($_ =~ />(\S+)/){ $id = $1; } | |
524 else { s/\s+//g; $sequences{$id} .= $_; } | |
525 } | |
526 close IN; | |
527 return %sequences; | |
528 } | |
529 | |
530 sub integrate_orthogroup_fasta { | |
531 my ( $formated_fasta, $method, $integrated_orthogroup_fasta, $scaffold_dir) = @_; | |
532 log_msg("Integrating orthogroup fasta files."); | |
533 my (%pep, %cds); | |
534 opendir (DIR, "$formated_fasta") or die "Can't open $formated_fasta directory\n"; | |
535 while ( my $filename = readdir(DIR) ) { | |
536 if ($filename =~ /^(\d+)\.faa$/) { $pep{$1} = $1; } | |
537 if ($filename =~ /^(\d+)\.fna$/) { $cds{$1} = $1; } | |
538 } | |
539 closedir DIR; | |
540 if (keys(%cds) and (keys(%pep) != keys(%cds))) { | |
541 die "Exiting...!\nOrthogroup classification protein and CDS fasta files not equivalent in $formated_fasta directory\n\n"; | |
542 } | |
543 foreach my $ortho_id (keys %pep) { | |
544 my $merging_call = system "cat $scaffold_dir/fasta/$method/$ortho_id.faa $formated_fasta/$ortho_id.faa > $integrated_orthogroup_fasta/$ortho_id.faa"; | |
545 if ($merging_call != 0) { | |
546 stop_err("Merging orthogroup $ortho_id failed."); | |
547 } | |
548 if (keys(%cds) and $cds{$ortho_id}) { | |
549 my $merging_call = system "cat $scaffold_dir/fasta/$method/$ortho_id.fna $formated_fasta/$ortho_id.fna > $integrated_orthogroup_fasta/$ortho_id.fna"; | |
550 if ($merging_call != 0) { | |
551 stop_err("Merging orthogroup $ortho_id failed."); | |
552 } | |
553 } | |
554 } | |
555 } | |
556 | |
557 sub update_scaffold_data { | |
558 my ( $proteins, $scaffold, $method, $num_threads, $method_dir, $species_name, $working_dir, $scaffold_dir ) = @_; | |
559 log_msg("Updating scaffold data files."); | |
560 # update orthogroup annotation files | |
561 log_msg("Updating orthogroup annotation files."); | |
562 my %annot; | |
563 open (OUT, ">>$working_dir/$scaffold/annot/$method.list") or die "Can't open $working_dir/$scaffold/annot/$method.list file\n"; | |
564 opendir (DIR, "$method_dir/orthogroups_fasta") or die "Can't open $method_dir/orthogroups_fasta directory\n"; | |
565 while ( my $filename = readdir(DIR) ) { | |
566 my $seq_count = 0; | |
567 my $ortho_id; | |
568 if ($filename =~ /^(\d+)\.faa$/) { | |
569 $ortho_id = $1; | |
570 open (IN, "$method_dir/orthogroups_fasta/$filename") or die "Can't open $method_dir/orthogroups_fasta/$filename file\n"; | |
571 while (<IN>) { | |
572 chomp; | |
573 if (/^>(\S+)/) { | |
574 print OUT "$ortho_id\t$1\n"; | |
575 $seq_count++; | |
576 } | |
577 } | |
578 close IN; | |
579 } | |
580 else { next; } | |
581 $annot{$ortho_id} = $seq_count; | |
582 } | |
583 closedir DIR; | |
584 close OUT; | |
585 my ($fields, %avg_summary, %min_summary); | |
586 if (File::Spec->file_name_is_absolute($proteins)) { $proteins = basename($proteins); } | |
587 open (IN, "$working_dir/$scaffold/annot/$method.avg_evalue.summary") or die "Can't open $working_dir/$scaffold/annot/$method.avg_evalue.summary file\n"; | |
588 while (<IN>) { | |
589 chomp; | |
590 if (/^Orthogroup\s+ID\s+(.*)/) { $fields = "Orthogroup ID\t$species_name\t$1"; next; } | |
591 else { /(\d+)\s+(.*)/; $avg_summary{$1} = $2; } | |
592 } | |
593 close IN; | |
594 open (OUT, ">$working_dir/$scaffold/annot/$method.avg_evalue.summary") or die "Can't open $working_dir/$scaffold/annot/$method.avg_evalue.summary file\n"; | |
595 print OUT "$fields\n"; | |
596 foreach (sort {$a<=>$b} keys %avg_summary) { | |
597 if ($annot{$_}) { print OUT "$_\t$annot{$_}\t$avg_summary{$_}\n"; } | |
598 else { print OUT "$_\t0\t$avg_summary{$_}\n"; } | |
599 } | |
600 close OUT; | |
601 open (IN, "$working_dir/$scaffold/annot/$method.min_evalue.summary") or die "Can't open $working_dir/$scaffold/annot/$method.min_evalue.summary file\n"; | |
602 while (<IN>) { | |
603 chomp; | |
604 if (/^Orthogroup\s+ID\s+(.*)/) { $fields = "Orthogroup ID\t$species_name\t$1"; next; } | |
605 else { /(\d+)\s+(.*)/; $min_summary{$1} = $2; } | |
606 } | |
607 close IN; | |
608 open (OUT, ">$working_dir/$scaffold/annot/$method.min_evalue.summary") or die "Can't open $working_dir/$scaffold/annot/$method.min_evalue.summary file\n"; | |
609 print OUT "$fields\n"; | |
610 foreach (sort {$a<=>$b} keys %min_summary) { | |
611 if ($annot{$_}) { print OUT "$_\t$annot{$_}\t$min_summary{$_}\n"; } | |
612 else { print OUT "$_\t0\t$min_summary{$_}\n"; } | |
613 } | |
614 close OUT; | |
615 | |
616 # update orthogroup fasta files | |
617 log_msg("Updating orthogroup fasta files."); | |
618 opendir (DIR, "$method_dir/integrated_orthogroup_fasta") or die "Can't open $method_dir/integrated_orthogroup_fasta directory\n"; | |
619 while ( my $filename = readdir(DIR) ) { | |
620 if (($filename =~ /^(\d+)\.faa$/) or ($filename =~ /^(\d+)\.fna$/)) { | |
621 my $update_orthogroup_fasta = system "cp $method_dir/integrated_orthogroup_fasta/$filename $working_dir/$scaffold/fasta/$method/$filename >/dev/null"; | |
622 if ($update_orthogroup_fasta != 0) { | |
623 stop_err("Updating orthogroup fasta $filename failed."); | |
624 } | |
625 } | |
626 } | |
627 close IN; | |
628 closedir DIR; | |
629 | |
630 # update orthogroup alignments | |
631 log_msg("Updating alignments."); | |
632 opendir (DIR, "$method_dir/orthogroups_fasta/formated_fasta") or die "Can't open $method_dir/orthogroups_fasta/formated_fasta directory\n"; | |
633 while ( my $filename = readdir(DIR) ) { | |
634 if ($filename =~ /^(\d+)\.faa$/) { | |
635 my $ortho_id = $1; | |
636 my $align_fasta = system "mafft --thread $num_threads --add $method_dir/orthogroups_fasta/formated_fasta/$filename $working_dir/$scaffold/alns/$method/$ortho_id.aln > $method_dir/orthogroups_fasta/formated_fasta/$ortho_id.aln 2>/dev/null"; | |
637 if ($align_fasta != 0) { | |
638 stop_err("Aligning orthogroup fasta file $filename failed."); | |
639 } | |
640 my $update_orthogroup_alignment = system "mv $method_dir/orthogroups_fasta/formated_fasta/$ortho_id.aln $working_dir/$scaffold/alns/$method/$ortho_id.aln >/dev/null"; | |
641 if ($update_orthogroup_alignment != 0) { | |
642 stop_err(" - Updating orthogroup alignment $ortho_id.aln failed."); | |
643 } | |
644 } | |
645 } | |
646 closedir DIR; | |
647 | |
648 # update orthogroup hmm profiles | |
649 log_msg("Updating hmm profiles."); | |
650 opendir (DIR, "$working_dir/$scaffold/alns/$method") or die "Can't open $working_dir/$scaffold/alns/$method directory\n"; | |
651 while ( my $filename = readdir(DIR) ) { | |
652 if ($filename =~ /^(\d+)\.aln$/) { | |
653 my $ortho_id = $1; | |
654 my $update_orthogroup_hmm = system "hmmbuild -n $ortho_id --amino --cpu $num_threads $working_dir/$ortho_id.hmm $working_dir/$scaffold/alns/$method/$ortho_id.aln >/dev/null"; | |
655 if ($update_orthogroup_hmm != 0) { | |
656 stop_err("Updating orthogroup hmm profile $ortho_id.hmm failed."); | |
657 } | |
658 my $convert_hmm_format = system "hmmconvert $working_dir/$ortho_id.hmm > $working_dir/$scaffold/hmms/$method/$ortho_id.hmm"; | |
659 if ($convert_hmm_format != 0) { | |
660 stop_err("Converting orthogroup hmm profile $ortho_id.hmm format failed."); | |
661 } | |
662 my $remove_tmp_file = system "rm $working_dir/$ortho_id.hmm >/dev/null"; | |
663 if ($remove_tmp_file != 0) { | |
664 stop_err("Could not remove temporary hmm profile $ortho_id.hmm."); | |
665 } | |
666 } | |
667 } | |
668 | |
669 # update orthogroup blast and hmm databases | |
670 log_msg("Updating blast and hmm databases."); | |
671 my $update_blast_database = system "find $method_dir/orthogroups_fasta/ -name \"*.faa\" -print0 | xargs -0 cat >> $working_dir/$scaffold/db/blast/$method"; | |
672 if ($update_blast_database != 0) { | |
673 stop_err("Updating blast database failed."); | |
674 } | |
675 my $index_blast_database = system "makeblastdb -in $working_dir/$scaffold/db/blast/$method -dbtype prot >/dev/null"; | |
676 if ($index_blast_database != 0) { | |
677 stop_err("Indexing blast database failed."); | |
678 } | |
679 my $update_hmm_database = system "find $working_dir/$scaffold/hmms/$method/ -name \"*.hmm\" -print0 | xargs -0 cat > $working_dir/$scaffold/db/hmm/$method"; | |
680 if ($update_hmm_database != 0) { | |
681 stop_err("Updating hmm database failed."); | |
682 } | |
683 my $index_hmm_database = system "hmmpress -f $working_dir/$scaffold/db/hmm/$method >/dev/null"; | |
684 if ($index_hmm_database != 0) { | |
685 stop_err("Indexing hmm database failed."); | |
686 } | |
687 } | |
688 | |
689 sub update_database_tables { | |
690 my ( $dbh, $proteins, $scaffold, $methods, $species_name, $species_family, $species_order, $species_group, $species_clade, $scaffold_dir, $working_dir ) = @_; | |
691 log_msg("Updating for database tables."); | |
692 my ( $species_code, %method_genes, %gene_sequences, %dna, %aa ); | |
693 my $gsot_association_prep_file = "$working_dir/gene_scaffold_orthogroup_taxon_association.tsv"; | |
694 my $num_recs = 0; | |
695 | |
696 # Output a prep file that stores information for updating | |
697 # the gene_scaffold_orthogroup_taxon_association table. | |
698 open (ASSOC, ">$gsot_association_prep_file") or die "Can't open $gsot_association_prep_file file\n"; | |
699 print ASSOC "gene_id\tscaffold_id\tclustering_method\torthogroup_id\tspecies_name\n"; | |
700 # get new species name and code | |
701 if (File::Spec->file_name_is_absolute($proteins)) { $proteins = basename($proteins); } | |
702 $species_name =~ s/\_/ /g; | |
703 my $rooting_order_config = "$scaffold_dir/$scaffold.rootingOrder.config"; | |
704 open(IN, "$rooting_order_config") or die "Can't open $rooting_order_config file\n"; | |
705 while (<IN>){ | |
706 chomp; | |
707 if (/^\#/ or /^\s+/ or /^\[/){ next; } | |
708 if (/(\w+\s+\w+)\=(\w+)/) { if ($species_name eq $1) { $species_code = $2;} } | |
709 } | |
710 close IN; | |
711 foreach my $clustering_method (keys %$methods) { | |
712 # Updating orthogroup database table | |
713 log_msg("Updating $clustering_method records for the plant_tribes_orthogroup database table."); | |
714 my ( $stmt, $sth, $rv, $scaffold_id ); | |
715 $stmt = qq(SELECT id FROM plant_tribes_scaffold WHERE scaffold_id = '$scaffold' AND clustering_method = '$clustering_method';); | |
716 $sth = $dbh->prepare( $stmt ); | |
717 $rv = $sth->execute() or die $DBI::errstr; | |
718 if ($rv < 0) { print $DBI::errstr; } | |
719 while (my @row = $sth->fetchrow_array()) { | |
720 $scaffold_id = $row[0]; | |
721 } | |
722 my $scaffold_annotation_dir = "$scaffold_dir/annot"; | |
723 opendir (DIR, $scaffold_annotation_dir) or die "Can't open $scaffold_annotation_dir directory\n"; | |
724 $num_recs = 0; | |
725 while ( my $filename = readdir(DIR) ) { | |
726 if ($filename =~ /$clustering_method.min_evalue\.summary/) { | |
727 open (IN, "$scaffold_annotation_dir/$filename") or die "Can't open $scaffold_annotation_dir/$filename file\n"; | |
728 while (<IN>){ | |
729 chomp; | |
730 if (/^Orthogroup/){ next; } | |
731 my @fields = split(/\t/, $_); | |
732 my $num_species = 0; | |
733 my $num_genes = 0; | |
734 $scaffold =~ /(\d+)Gv\d+\.\d+/; # 22Gv1.1 | |
735 my $genomes = $1 + 1; | |
736 for (1..$genomes){ | |
737 if ($fields[$_] > 0){ $num_species++; } | |
738 $num_genes += $fields[$_]; | |
739 } | |
740 $stmt = qq(UPDATE plant_tribes_orthogroup SET num_species = $num_species, num_genes = $num_genes WHERE orthogroup_id = $fields[0] AND scaffold_id = $scaffold_id;); | |
741 $rv = $dbh->do($stmt) or die $DBI::errstr; | |
742 if($rv < 0) { | |
743 print $DBI::errstr; | |
744 } | |
745 $num_recs = $num_recs + 1; | |
746 } | |
747 close IN; | |
748 } | |
749 if ($filename =~ /$clustering_method\.list/) { | |
750 open (IN, "$scaffold_annotation_dir/$filename") or die "Can't open $scaffold_annotation_dir/$filename file\n"; | |
751 while (<IN>){ | |
752 chomp; | |
753 my @fields = split(/\t/, $_); | |
754 my @gene_id = split(/\|/, $fields[1]); | |
755 if ($gene_id[1] =~ /$species_code/) { | |
756 $fields[1] =~ s/\|/_/g; | |
757 $method_genes{$clustering_method}{$fields[0]}{$fields[1]} = $fields[1]; | |
758 } | |
759 } | |
760 close IN; | |
761 } | |
762 } | |
763 close DIR; | |
764 log_msg("$num_recs records for $scaffold $clustering_method were successfully updated in the plant_tribes_orthogroup table."); | |
765 # Updating taxon database table | |
766 log_msg("Inserting $clustering_method records into the plant_tribes_taxon database table."); | |
767 my $num_genes = 0; | |
768 foreach my $ortho_id (keys %{$method_genes{$clustering_method}}){ | |
769 foreach (keys %{$method_genes{$clustering_method}{$ortho_id}}){ | |
770 $num_genes++; | |
771 } | |
772 } | |
773 my $taxa_lineage_config = "$scaffold_dir/$scaffold.taxaLineage.config"; | |
774 open(IN, "$taxa_lineage_config") or die "Can't open $taxa_lineage_config file\n"; | |
775 $num_recs = 0; | |
776 while (<IN>){ | |
777 chomp; | |
778 my @fields = split(/\t/, $_); | |
779 if ($fields[0] ne $species_name) { next; } | |
780 $stmt = qq(INSERT INTO plant_tribes_taxon (species_name, scaffold_id, num_genes, species_family, species_order, species_group, species_clade) VALUES ('$fields[0]', $scaffold_id, $num_genes, '$fields[1]', '$fields[2]', '$fields[3]', '$fields[4]')); | |
781 $rv = $dbh->do($stmt) or die $DBI::errstr; | |
782 $num_recs = $num_recs + 1; | |
783 } | |
784 close IN; | |
785 log_msg("$num_recs records for $species_name $scaffold $clustering_method were successfully inserted into the plant_tribes_taxon table."); | |
786 my ($dna_id, $aa_id); | |
787 my $orthogroups_fasta_dir = "$working_dir/$clustering_method/orthogroups_fasta/formated_fasta"; | |
788 opendir (DIR, $orthogroups_fasta_dir) or die "Can't open $orthogroups_fasta_dir directory\n"; | |
789 while ( my $filename = readdir(DIR) ) { | |
790 if ($filename =~ /^(\d+)\.fna$/) { | |
791 my $ortho_id = $1; | |
792 open(IN, "$orthogroups_fasta_dir/$filename") or die "Can't open $orthogroups_fasta_dir/$filename file\n"; | |
793 while(<IN>){ | |
794 chomp; | |
795 if (/^>(\S+)/){ | |
796 $dna_id = $1; | |
797 print ASSOC "$dna_id\t$scaffold\t$clustering_method\t$ortho_id\t$species_name\n"; | |
798 next; | |
799 } | |
800 else { s/\s+//g; $dna{$dna_id} .= $_; } | |
801 } | |
802 close IN; | |
803 } | |
804 if ($filename =~ /^(\d+)\.faa$/) { | |
805 open(IN, "$orthogroups_fasta_dir/$filename") or die "Can't open $orthogroups_fasta_dir/$filename file\n"; | |
806 while(<IN>){ | |
807 chomp; | |
808 if (/^>(\S+)/){ $aa_id = $1; next; } | |
809 else { s/\s+//g; $aa{$aa_id} .= $_; } | |
810 } | |
811 close IN; | |
812 } | |
813 } | |
814 close DIR; | |
815 } | |
816 close ASSOC; | |
817 # Updating gene database table | |
818 log_msg("Inserting records into the plant_tribes_gene database table."); | |
819 $num_recs = 0; | |
820 foreach my $gene_id (sort keys %dna) { | |
821 my $stmt = qq(INSERT INTO plant_tribes_gene (gene_id, dna_sequence, aa_sequence) VALUES ('$gene_id', '$dna{$gene_id}', '$aa{$gene_id}')); | |
822 my $rv = $dbh->do($stmt) or die $DBI::errstr; | |
823 $num_recs = $num_recs + 1; | |
824 } | |
825 log_msg("$num_recs records for $species_name $scaffold were successfully inserted into the plant_tribes_gene table."); | |
826 # Updaing gene-scaffold-orthogroup-taxon-association database table | |
827 log_msg("Inserting records into the gene_scaffold_orthogroup_taxon_association database table."); | |
828 open(IN, "$gsot_association_prep_file") or die "Can't open $gsot_association_prep_file file\n"; | |
829 $num_recs = 0; | |
830 my ( $stmt, $sth, $rv, $scaffold_id, $clustering_method, $orthogroup_id, $taxon_id, $gene_id ); | |
831 my ( $gene_id_db, $scaffold_id_db, $orthogroup_id_db, $taxon_id_db ); | |
832 while(<IN>){ | |
833 chomp; | |
834 if (/^gene_id/) { | |
835 # gene_id scaffold_id clustering_method orthogroup_id species_name | |
836 next; | |
837 } | |
838 my @fields = split(/\t/, $_); | |
839 # gnl_Fakge_v1.0_AT1G03390.1 22Gv1.1 orthomcl 3 Fake genome | |
840 $gene_id = $fields[0]; | |
841 $scaffold_id = $fields[1]; | |
842 $clustering_method = $fields[2]; | |
843 $orthogroup_id = $fields[3]; | |
844 $species_name = $fields[4]; | |
845 $stmt = qq(SELECT id FROM plant_tribes_scaffold WHERE scaffold_id = '$scaffold_id' AND clustering_method = '$clustering_method';); | |
846 $sth = $dbh->prepare( $stmt ); | |
847 $rv = $sth->execute() or die $DBI::errstr; | |
848 if ($rv < 0) { print $DBI::errstr; } | |
849 while (my @row = $sth->fetchrow_array()) { | |
850 $scaffold_id_db = $row[0]; | |
851 } | |
852 $stmt = qq(SELECT id FROM plant_tribes_orthogroup WHERE orthogroup_id = '$orthogroup_id' AND scaffold_id = '$scaffold_id_db';); | |
853 $sth = $dbh->prepare( $stmt ); | |
854 $rv = $sth->execute() or die $DBI::errstr; | |
855 if ($rv < 0) { print $DBI::errstr; } | |
856 while (my @row = $sth->fetchrow_array()) { | |
857 $orthogroup_id_db = $row[0]; | |
858 } | |
859 $stmt = qq(SELECT id FROM plant_tribes_taxon WHERE species_name = '$species_name' AND scaffold_id = '$scaffold_id_db';); | |
860 $sth = $dbh->prepare( $stmt ); | |
861 $rv = $sth->execute() or die $DBI::errstr; | |
862 if ($rv < 0) { print $DBI::errstr; } | |
863 while (my @row = $sth->fetchrow_array()) { | |
864 $taxon_id_db = $row[0]; | |
865 } | |
866 $stmt = qq(SELECT id FROM plant_tribes_gene WHERE gene_id = '$gene_id' ); | |
867 $sth = $dbh->prepare( $stmt ); | |
868 $rv = $sth->execute() or die $DBI::errstr; | |
869 if ($rv < 0) { print $DBI::errstr; } | |
870 while (my @row = $sth->fetchrow_array()) { | |
871 $gene_id_db = $row[0]; | |
872 } | |
873 $stmt = qq(INSERT INTO gene_scaffold_orthogroup_taxon_association (gene_id, scaffold_id, orthogroup_id, taxon_id) VALUES ($gene_id_db, $scaffold_id_db, $orthogroup_id_db, $taxon_id_db)); | |
874 $rv = $dbh->do($stmt) or die $DBI::errstr; | |
875 $num_recs = $num_recs + 1; | |
876 } | |
877 close IN; | |
878 log_msg("$num_recs records for $scaffold $clustering_method were successfully inserted into the gene_scaffold_orthogroup_taxon_association table."); | |
879 $dbh->disconnect(); | |
880 } |