diff perl/scripts/loadTaxonomy.pl @ 0:4ecb2ce50254 draft default tip

Uploaded
author stheil
date Mon, 26 Oct 2015 10:59:07 -0400
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/perl/scripts/loadTaxonomy.pl	Mon Oct 26 10:59:07 2015 -0400
@@ -0,0 +1,183 @@
+#!/usr/bin/perl
+use strict ;
+use File::Basename;
+use Data::Dumper;
+use Getopt::Long;
+use Logger::Logger;
+use Tools::Taxonomy;
+use DBI;
+use SQL::SplitStatement;
+
+
+# Default file names, looked up in the current working directory.
+# The two SQL files (schema and indexes) are not configurable from the
+# command line; the four NCBI dump files can be overridden with options.
+my $taxo_struct_dmp = 'taxonomyStructure.sql';
+my $taxo_index_dmp = 'taxonomyIndex.sql';
+my $data_gi_taxid_nucl = 'gi_taxid_nucl.dmp';
+my $data_gi_taxid_prot = 'gi_taxid_prot.dmp';
+my $data_nodes = 'nodes.dmp';
+my $data_names = 'names.dmp';
+my $verbosity=1;
+my $help=0;
+
+
+# An unknown or malformed option makes GetOptions return false: show the
+# usage text instead of silently running with the defaults.
+GetOptions(
+  "nucl=s"  => \$data_gi_taxid_nucl,
+  "prot=s"  => \$data_gi_taxid_prot,
+  "names=s" => \$data_names,
+  "nodes=s" => \$data_nodes,
+  "v=i"     => \$verbosity,
+  "help|h"  => \$help,
+) or help();
+
+help() if $help;
+
+
+Logger::Logger->changeMode($verbosity);
+
+
+# Plain call: the legacy '&main;' form implicitly forwards the current @_.
+main();
+
+
+# Script entry point.
+# Validates the input files, creates the SQLite database with its schema,
+# then loads every dump file registered in $self->{_data} into the table of
+# the same name.
+sub main {
+  my $self = bless {}, __PACKAGE__;
+  $self->_set_options();
+
+  # Field separator (regular expression source) for each table's dump file.
+  # names/nodes use "<tab>|<tab>" between fields and "<tab>|" at end of line.
+  $self->{_sep} = { names => '\t\|\t|\t\|$',
+                    nodes => '\t\|\t|\t\|$',
+                    gi_taxid_prot => '\t',
+                    gi_taxid_nucl => '\t',
+  };
+
+  # The schema must be created in, and the data loaded into, the SAME file.
+  # Previously the schema went to taxonomy.sqlite while the inserts targeted
+  # taxonomy_tmp.sqlite, a database without any table.
+  my $db_file = 'taxonomy.sqlite';
+  $self->_create_sqlite_db($db_file);
+  my $dbh = DBI->connect("dbi:SQLite:dbname=$db_file","","")
+    or $logger->logdie("Unable to connect to database $db_file : " . $DBI::errstr);
+
+  $self->_insertingCSVDataInDatabase($dbh,$self->{_data});
+  $dbh->disconnect;
+}
+
+# Bulk-load delimited text files into database tables.
+#
+# Parameters:
+#   $dbh             - connected DBI handle; AutoCommit is switched off on it
+#                      and left off, each table being committed explicitly.
+#   $tablesDataFiles - hash ref mapping table name => path of its data file.
+#
+# Each line of a file becomes one row; fields are bound positionally to the
+# table columns in the order reported by column_info. Rows rejected by a
+# constraint are skipped (INSERT OR IGNORE). Any other failure is fatal
+# through $logger->logdie.
+sub _insertingCSVDataInDatabase {
+  my ($self,$dbh,$tablesDataFiles) = @_;
+  $logger->info('Inserting tables into database...');
+
+  # One transaction per table instead of one per row.
+  $dbh->{AutoCommit} = 0;
+
+  foreach my $table (keys %{$tablesDataFiles}){
+    $logger->info($table);
+    my $file = $tablesDataFiles->{$table};
+
+    # Column names come from the live schema (index 3 is COLUMN_NAME).
+    my $col_sth = $dbh->column_info( undef, undef, $table, '%')
+      or $logger->logdie("Unable to read the columns of table $table");
+    my @cols = map { $_->[3] } @{ $col_sth->fetchall_arrayref };
+    if(! @cols){
+      $logger->logdie("No column found for table $table. Is the schema loaded ?");
+    }
+
+    $logger->debug("Inserting data in table $table ...\n");
+    # quote_identifier quotes names as identifiers rather than as
+    # single-quoted string literals.
+    my $sql = "INSERT OR IGNORE INTO $table ( "
+            . join(',', map { $dbh->quote_identifier($_) } @cols)
+            . " ) VALUES (" . join(',', ('?') x @cols) . ")";
+    my $sth = $dbh->prepare($sql) or $logger->logdie($dbh->errstr);
+
+    my $separator = "\t";
+    if(defined $self->{_sep}->{$table}){
+      $separator = $self->{_sep}->{$table};
+    }
+
+    # Lexical handle and 3-arg open: the bareword DATA is Perl's special
+    # handle, and an unreadable file must not be silently loaded as empty.
+    open(my $in, '<', $file)
+      or $logger->logdie("Unable to open the data file : $file ($!)");
+
+    while (my $line = <$in>) {
+      chomp $line;
+      # Splitting with a capturing group keeps the separators in the list,
+      # which stops split from dropping empty trailing fields; the
+      # separators themselves are then filtered out.
+      my @fields = grep {$_ !~ /^$separator$/} split (/($separator)/, $line);
+      $sth->execute(@fields) or $logger->logdie($dbh->errstr);
+    }
+    close $in;
+
+    $dbh->commit or $logger->logdie($dbh->errstr);
+    $logger->debug("Insertion of data in table $table finished\n");
+  }
+}
+
+
+# Create the SQLite database file and load its schema and indexes.
+#
+# Parameters:
+#   $file - path of the database file to create.
+#
+# Does nothing except log a warning when $file already exists. Otherwise
+# the SQL files stored in $self->{_taxo_struct_dmp} and
+# $self->{_taxo_index_dmp} are executed, in that order, against the new
+# database. Failures are fatal through $logger->logdie.
+sub _create_sqlite_db {
+  my ($self,$file) = @_;
+  $logger->info('Creating database.');
+  if(-e $file){
+    $logger->warn('Database already exists. Skip...');
+    return;
+  }
+
+  # Create the empty file from Perl: interpolating the name into a shell
+  # command (`touch $file`) breaks on spaces and shell metacharacters.
+  open(my $fh, '>>', $file)
+    or $logger->logdie("Unable to create the database file : $file ($!)");
+  close $fh;
+
+  my $dbh = DBI->connect("dbi:SQLite:dbname=$file","","")
+    or $logger->logdie("Unable to connect to database $file : " . $DBI::errstr);
+  $self->_executeSQLFiles($dbh,($self->{_taxo_struct_dmp},$self->{_taxo_index_dmp}));
+  $dbh->disconnect;
+  return;
+}
+
+
+# Execute every statement of one or more SQL script files.
+#
+# Parameters:
+#   $dbh      - connected DBI handle the statements are run on.
+#   @sqlFiles - paths of the SQL files, processed in the given order.
+#
+# Each file is read entirely, split into statements with
+# SQL::SplitStatement, and the statements are executed one by one.
+# An unreadable file or a failing statement is fatal through
+# $logger->logdie.
+sub _executeSQLFiles {
+  my ($self,$dbh,@sqlFiles) = @_;
+  my $sql_splitter = SQL::SplitStatement->new;
+  foreach my $file (@sqlFiles){
+    $logger->debug('Reading sql file:' . $file);
+
+    # 3-arg open with a lexical handle: the mode can no longer be injected
+    # through the file name and the handle is not a package global.
+    open(my $fh, '<', $file)
+      or $logger->logdie("Unable to open the SQL file : $file ($!)\n");
+    # join over the list of lines yields '' (not undef) for an empty file.
+    my $cmd = join('', <$fh>);
+    close $fh;
+
+    foreach my $statement ($sql_splitter->split($cmd)){
+      $logger->debug('Executing sql cmd:');
+      $logger->debug($statement);
+      $dbh->do($statement) or $logger->logdie($dbh->errstr);
+    }
+  }
+}
+
+
+# Check that every input file exists and register its path on $self.
+#
+# The SQL files are stored as $self->{_taxo_struct_dmp} and
+# $self->{_taxo_index_dmp}; the dump files are stored under
+# $self->{_data}, keyed by the table they are loaded into
+# (gi_taxid_nucl, gi_taxid_prot, nodes, names).
+# The first missing file is fatal through $logger->logdie.
+sub _set_options {
+  my ($self)=@_;
+
+  # [ option name, path ] pairs, checked in this order.
+  my @sql_files = (
+    [ 'taxo_struct_dmp', $taxo_struct_dmp ],
+    [ 'taxo_index_dmp',  $taxo_index_dmp  ],
+  );
+  foreach my $entry (@sql_files){
+    my ($name,$path) = @{$entry};
+    # The message names the file actually tested (the taxo_index_dmp
+    # branch used to report the gi_taxid_prot file instead).
+    if(! -e $path){
+      $logger->logdie($path . ' ' . $name . ' file not found.');
+    }
+    $self->{'_' . $name} = $path;
+  }
+
+  # [ table name, path ] pairs, checked in this order.
+  my @data_files = (
+    [ 'gi_taxid_nucl', $data_gi_taxid_nucl ],
+    [ 'gi_taxid_prot', $data_gi_taxid_prot ],
+    [ 'nodes',         $data_nodes         ],
+    [ 'names',         $data_names         ],
+  );
+  foreach my $entry (@data_files){
+    my ($table,$path) = @{$entry};
+    if(! -e $path){
+      $logger->logdie($path . ' data_' . $table . ' file not found.');
+    }
+    $self->{_data}->{$table} = $path;
+  }
+}
+
+
+# Print the usage text on STDERR and terminate the script with exit
+# status 1. The defaults shown are the current values of the option
+# variables. The USAGE line now lists this script's own options; it was
+# previously copied from an unrelated script.
+sub help {
+my $prog = basename($0);
+print STDERR <<EOF ;
+#### $prog ####
+#
+# AUTHOR:     Sebastien THEIL
+# LAST MODIF: 19/09/2015
+# PURPOSE:    This script is used to load NCBI taxonomy file into a SQLite database.
+
+USAGE:
+      $prog  -nucl gi_taxid_nucl.dmp -prot gi_taxid_prot.dmp -names names.dmp -nodes nodes.dmp
+
+			### OPTIONS ###
+      -nucl       <string>   gi_taxid_nucl.dmp file. (Default: $data_gi_taxid_nucl)
+      -prot       <string>   gi_taxid_prot.dmp file. (Default: $data_gi_taxid_prot)
+      -names      <string>   names.dmp file. (Default: $data_names)
+      -nodes      <string>   nodes.dmp file. (Default: $data_nodes)
+      -v          <int>      Verbosity level. (0 -> 4).
+EOF
+exit(1);
+}