view perl/scripts/loadTaxonomy.pl @ 0:4ecb2ce50254 draft default tip

Uploaded
author stheil
date Mon, 26 Oct 2015 10:59:07 -0400
parents
children
line wrap: on
line source

#!/usr/bin/perl
use strict ;
use File::Basename;
use Data::Dumper;
use Getopt::Long;
use Logger::Logger;
use Tools::Taxonomy;
use DBI;
use SQL::SplitStatement;


# SQL files executed (in this order) when the database is first created.
# They are looked up relative to the current working directory.
my $taxo_struct_dmp = 'taxonomyStructure.sql';
my $taxo_index_dmp = 'taxonomyIndex.sql';

# Default paths of the dump files to load; each can be overridden on the
# command line (see GetOptions below).
my $data_gi_taxid_nucl = 'gi_taxid_nucl.dmp';
my $data_gi_taxid_prot = 'gi_taxid_prot.dmp';
my $data_nodes = 'nodes.dmp';
my $data_names = 'names.dmp';
my $verbosity=1;


# GetOptions returns false when the command line contains something it
# cannot parse (unknown option, wrong value type). In that case print the
# usage text and exit instead of silently running with the defaults.
GetOptions(
  "nucl=s"  => \$data_gi_taxid_nucl,
  "prot=s"  => \$data_gi_taxid_prot,
  "names=s" => \$data_names,
  "nodes=s" => \$data_nodes,
  "v=i"     => \$verbosity,
) or help();


Logger::Logger->changeMode($verbosity);


main();


# Entry point: checks that every input file exists, creates the SQLite
# database (schema + indexes) if it is not there yet, then loads each dump
# file into the table of the same name.
sub main {
  # A blessed hash is used only as a container for shared state so that the
  # helper subs in this file can be called as methods.
  my $self = bless {}, __PACKAGE__;
  $self->_set_options();

  # Field separators, as regex source strings, keyed by table name.
  # names/nodes: fields are delimited by <tab>|<tab> and every line ends
  # with <tab>|. The gi_taxid files are plain tab-separated.
  $self->{_sep} = { names         => '\t\|\t|\t\|$',
                    nodes         => '\t\|\t|\t\|$',
                    gi_taxid_prot => '\t',
                    gi_taxid_nucl => '\t',
  };

  # The data must be inserted into the same file the schema was created in;
  # connecting to a different file leaves the INSERTs without any table.
  my $db_file = 'taxonomy.sqlite';
  $self->_create_sqlite_db($db_file);
  my $dbh = DBI->connect("dbi:SQLite:dbname=$db_file","","")
    or $logger->logdie("Unable to connect to database $db_file : " . $DBI::errstr);

  $self->_insertingCSVDataInDatabase($dbh,$self->{_data});
  $dbh->disconnect;
}

# Loads delimited text files into database tables, one transaction per table.
#
#   $dbh             - connected DBI database handle
#   $tablesDataFiles - hashref: table name => path of the file to load
#
# Each line is split on the separator registered for the table in
# $self->{_sep} (a regex source string; a single tab when none is set) and
# the resulting fields are bound, in order, to the table's columns.
# Any open/prepare/execute/commit failure is reported through
# $logger->logdie.
sub _insertingCSVDataInDatabase {
  my ($self,$dbh,$tablesDataFiles) = @_;
  $logger->info('Inserting tables into database...');
  foreach my $table (keys %{$tablesDataFiles}){
    $logger->info($table);

    # Read the column list from the live schema; element 3 of each
    # column_info row is the column name (COLUMN_NAME in DBI's layout).
    my $sth = $dbh->column_info( undef, undef, $table, '%');
    my $ref = $sth->fetchall_arrayref;
    my @cols = map { $_->[3] } @$ref;

    $logger->debug("Inserting data in table $table ...\n");
    # Turn autocommit off so the whole file is committed at once below.
    $dbh->{AutoCommit} = 0;
    $sth = $dbh->prepare( "INSERT OR IGNORE INTO $table ( ".join(',', map {"'".$_."'"} @cols)." ) VALUES (".join(',', map {'?'} @cols).")" ) or $logger->logdie($dbh->errstr);

    my $separator = "\t";
    if(defined $self->{_sep}->{$table}){
      $separator = $self->{_sep}->{$table};
    }

    # Compile both patterns once per table rather than once per line.
    # The capturing group makes split return the separators as well: the
    # final "<tab>|" of a line then keeps split from discarding trailing
    # empty fields, and the separators are filtered out again afterwards.
    my $split_re    = qr/($separator)/;
    # (?:...) keeps the anchors around the whole alternation.
    my $sep_only_re = qr/^(?:$separator)$/;

    my $file = $tablesDataFiles->{$table};
    open (my $fh, '<', $file) or $logger->logdie("Unable to open the data file : $file ($!)\n");

    while (my $line = <$fh>) {
      chomp $line;
      my @values = grep { $_ !~ $sep_only_re } split($split_re, $line);
      $sth->execute(@values) or $logger->logdie($dbh->errstr);
    }
    close $fh;

    $dbh->commit or $logger->logdie($dbh->errstr);
    $logger->debug("Insertion of data in table $table finished\n");
  }
}


# Creates the SQLite database file and runs the structure and index SQL
# files against it. Does nothing except log a warning when $file already
# exists.
#
#   $file - path of the SQLite database file to create
sub _create_sqlite_db {
  my ($self,$file) = @_;
  $logger->info('Creating database.');
  if(-e $file){
    $logger->warn('Database already exists. Skip...');
    return;
  }

  # Create the empty file directly instead of shelling out to `touch`, so
  # that paths containing spaces or shell metacharacters are handled safely.
  open (my $fh, '>>', $file) or $logger->logdie("Unable to create the database file : $file ($!)\n");
  close $fh;

  my $dbh = DBI->connect("dbi:SQLite:dbname=$file","","")
    or $logger->logdie("Unable to connect to database $file : " . $DBI::errstr);
  $self->_executeSQLFiles($dbh,($self->{_taxo_struct_dmp},$self->{_taxo_index_dmp}));
  $dbh->disconnect;
}


# Executes every statement found in the given SQL files, in order.
#
#   $dbh      - connected DBI database handle
#   @sqlFiles - paths of the SQL files to run
#
# Each file is read whole, split into individual statements with
# SQL::SplitStatement, and each statement is passed to $dbh->do.
# An unreadable file or a failing statement is reported through
# $logger->logdie.
sub _executeSQLFiles {
  my ($self,$dbh,@sqlFiles) = @_;
  my $sql_splitter = SQL::SplitStatement->new;
  foreach my $file (@sqlFiles){
    $logger->debug('Reading sql file:' . $file);

    # Three-argument open with a lexical handle: the path is never
    # interpreted as a mode and the handle cannot clash with other globals.
    open (my $fh, '<', $file) or $logger->logdie("Unable to open the SQL file : $file\n");
    my $cmd = do { local $/; <$fh> };   # slurp the whole file
    close $fh;
    $cmd = '' unless defined $cmd;      # empty file: nothing to execute

    my @statements = $sql_splitter->split($cmd);
    foreach my $statement (@statements){
      $logger->debug('Executing sql cmd:');
      $logger->debug($statement);
      $dbh->do($statement) or $logger->logdie($dbh->errstr);
    }
  }
}


# Checks that every input file exists and records its path on $self:
#   $self->{_taxo_struct_dmp}, $self->{_taxo_index_dmp}  - SQL files
#   $self->{_data}->{<table>}                            - dump file per table
# The paths come from the file-level variables (defaults or command-line
# overrides). A missing file is reported through $logger->logdie, naming
# the file that was actually looked for.
sub _set_options {
  my ($self)=@_;

  -e $taxo_struct_dmp
    or $logger->logdie($taxo_struct_dmp . ' taxo_struct_dmp file not found.');
  $self->{_taxo_struct_dmp} = $taxo_struct_dmp;

  # This message used to name the gi_taxid_prot file by mistake.
  -e $taxo_index_dmp
    or $logger->logdie($taxo_index_dmp . ' taxo_index_dmp file not found.');
  $self->{_taxo_index_dmp} = $taxo_index_dmp;

  -e $data_gi_taxid_nucl
    or $logger->logdie($data_gi_taxid_nucl . ' data_gi_taxid_nucl file not found.');
  $self->{_data}->{gi_taxid_nucl} = $data_gi_taxid_nucl;

  -e $data_gi_taxid_prot
    or $logger->logdie($data_gi_taxid_prot . ' data_gi_taxid_prot file not found.');
  $self->{_data}->{gi_taxid_prot} = $data_gi_taxid_prot;

  -e $data_nodes
    or $logger->logdie($data_nodes . ' data_nodes file not found.');
  $self->{_data}->{nodes} = $data_nodes;

  -e $data_names
    or $logger->logdie($data_names . ' data_names file not found.');
  $self->{_data}->{names} = $data_names;
}


# Prints the usage text to STDERR and exits with status 1.
# The defaults shown are the current values of the file-level option
# variables.
sub help {
my $prog = basename($0);
print STDERR <<EOF ;
#### $prog ####
#
# AUTHOR:     Sebastien THEIL
# LAST MODIF: 19/09/2015
# PURPOSE:    This script is used to load NCBI taxonomy file into a SQLite database.

USAGE:
      $prog  -nucl gi_taxid_nucl.dmp -prot gi_taxid_prot.dmp -names names.dmp -nodes nodes.dmp -v 1

			### OPTIONS ###
      -nucl       <string>   gi_taxid_nucl.dmp file. (Default: $data_gi_taxid_nucl)
      -prot       <string>   gi_taxid_prot.dmp file. (Default: $data_gi_taxid_prot)
      -names      <string>   names.dmp file. (Default: $data_names)
      -nodes      <string>   nodes.dmp file. (Default: $data_nodes)
      -v          <int>      Verbosity level. (0 -> 4).
EOF
exit(1);
}