Mercurial > repos > stheil > taxonomy_sqlite
diff perl/scripts/loadTaxonomy.pl @ 0:4ecb2ce50254 draft default tip
Uploaded
author | stheil |
---|---|
date | Mon, 26 Oct 2015 10:59:07 -0400 |
parents | |
children |
line wrap: on
line diff
#!/usr/bin/perl
# loadTaxonomy.pl -- load NCBI taxonomy dump files into a SQLite database.
#
# Reconstructed from the diff "--- /dev/null  +++ b/perl/scripts/loadTaxonomy.pl
# Mon Oct 26 10:59:07 2015 -0400 @@ -0,0 +1,183 @@".
#
# Workflow:
#   1. check that every required input file exists (_set_options);
#   2. create the database and its schema if the database file is absent
#      (_create_sqlite_db);
#   3. bulk-load each dump file into the table of the same name
#      (_insertingCSVDataInDatabase).
use strict;
use warnings;
use File::Basename;
use Data::Dumper;
use Getopt::Long;
use Logger::Logger;
use Tools::Taxonomy;
use DBI;
use SQL::SplitStatement;

# NOTE(review): $logger is never declared in this file; it is presumably
# exported by Logger::Logger -- confirm against that module.

# Single database file used for BOTH schema creation and data loading.
my $db_file            = 'taxonomy.sqlite';
my $taxo_struct_dmp    = 'taxonomyStructure.sql';
my $taxo_index_dmp     = 'taxonomyIndex.sql';
my $data_gi_taxid_nucl = 'gi_taxid_nucl.dmp';
my $data_gi_taxid_prot = 'gi_taxid_prot.dmp';
my $data_nodes         = 'nodes.dmp';
my $data_names         = 'names.dmp';
my $verbosity          = 1;


# Print the usage and exit on -h/-help or on a malformed command line.
GetOptions(
    "nucl=s"  => \$data_gi_taxid_nucl,
    "prot=s"  => \$data_gi_taxid_prot,
    "names=s" => \$data_names,
    "nodes=s" => \$data_nodes,
    "v=i"     => \$verbosity,
    "help|h"  => sub { help() },
) or help();


Logger::Logger->changeMode($verbosity);


main();


# Entry point: validates the inputs, creates the database if needed and loads
# every dump file into it.
sub main {
    my $self = bless {}, __PACKAGE__;
    $self->_set_options();

    # Field separators (regex source strings) per table. names/nodes use the
    # NCBI "<tab>|<tab>" separator with a "<tab>|" line terminator.
    $self->{_sep} = {
        names         => '\t\|\t|\t\|$',
        nodes         => '\t\|\t|\t\|$',
        gi_taxid_prot => '\t',
        gi_taxid_nucl => '\t',
    };

    $self->_create_sqlite_db($db_file);

    # Connect to the same file the schema was created in; loading into a
    # different file would target a database without any table.
    my $dbh = DBI->connect("dbi:SQLite:dbname=$db_file", "", "")
        or $logger->logdie("Unable to connect to $db_file : " . $DBI::errstr);

    $self->_insertingCSVDataInDatabase($dbh, $self->{_data});
    $dbh->disconnect;
    return;
}


# Loads delimited data files into the tables of an open database.
#
#   $dbh             - connected DBI handle.
#   $tablesDataFiles - hash ref: table name => path of the file to load.
#
# Each table is loaded inside one transaction, using INSERT OR IGNORE with one
# placeholder per column reported by column_info. Dies (via $logger->logdie)
# on any open/prepare/execute/commit failure.
sub _insertingCSVDataInDatabase {
    my ($self, $dbh, $tablesDataFiles) = @_;
    $logger->info('Inserting tables into database...');

    foreach my $table (keys %{$tablesDataFiles}) {
        $logger->info($table);

        my $info_sth = $dbh->column_info(undef, undef, $table, '%')
            or $logger->logdie("Unable to read the columns of table $table");
        # Index 3 of each column_info row is the column name.
        my @cols = map { $_->[3] } @{ $info_sth->fetchall_arrayref };
        @cols
            or $logger->logdie("Table $table has no column (schema not loaded ?)");

        $logger->debug("Inserting data in table $table ...\n");
        $dbh->{AutoCommit} = 0;

        my $sql = 'INSERT OR IGNORE INTO ' . $table
                . ' ( ' . join(',', map { $dbh->quote_identifier($_) } @cols) . ' )'
                . ' VALUES (' . join(',', ('?') x @cols) . ')';
        my $sth = $dbh->prepare($sql) or $logger->logdie($dbh->errstr);

        my $separator = defined $self->{_sep}->{$table}
                      ? $self->{_sep}->{$table}
                      : "\t";
        # The separator is captured by split and filtered out afterwards: a
        # plain split would silently drop empty fields at the end of a line,
        # giving fewer bind values than columns.
        my $split_re  = qr/($separator)/;
        my $is_sep_re = qr/\A(?:$separator)\z/;

        my $data_file = $tablesDataFiles->{$table};
        open(my $data_fh, '<', $data_file)
            or $logger->logdie("Unable to open the data file : $data_file ($!)");

        while (my $line = <$data_fh>) {
            chomp $line;
            my @fields = grep { $_ !~ $is_sep_re } split($split_re, $line);
            $sth->execute(@fields) or $logger->logdie($dbh->errstr);
        }
        close $data_fh;

        $dbh->commit or $logger->logdie($dbh->errstr);
        $logger->debug("Insertion of data in table $table finished\n");
    }
    return;
}


# Creates the SQLite database $file and runs the structure and index SQL
# files against it. Does nothing (besides a warning) when $file already exists.
sub _create_sqlite_db {
    my ($self, $file) = @_;
    $logger->info('Creating database.');

    if (-e $file) {
        $logger->warn('Database already exists. Skip...');
        return;
    }

    # No external `touch` needed: the database file is created by SQLite
    # when the schema statements below are written.
    my $dbh = DBI->connect("dbi:SQLite:dbname=$file", "", "")
        or $logger->logdie("Unable to create $file : " . $DBI::errstr);
    $self->_executeSQLFiles($dbh, $self->{_taxo_struct_dmp}, $self->{_taxo_index_dmp});
    $dbh->disconnect;
    return;
}


# Executes, in order, every SQL statement found in the given files.
#
#   $dbh      - connected DBI handle.
#   @sqlFiles - paths of SQL scripts; each is split into single statements
#               with SQL::SplitStatement.
#
# Dies (via $logger->logdie) if a file cannot be opened or a statement fails.
sub _executeSQLFiles {
    my ($self, $dbh, @sqlFiles) = @_;
    my $sql_splitter = SQL::SplitStatement->new;

    foreach my $file (@sqlFiles) {
        $logger->debug('Reading sql file:' . $file);

        open(my $sql_fh, '<', $file)
            or $logger->logdie("Unable to open the SQL file : $file\n");
        my $cmd = do { local $/; <$sql_fh> };    # slurp the whole file
        close $sql_fh;

        next if !defined $cmd;                   # empty file: nothing to run

        foreach my $statement ($sql_splitter->split($cmd)) {
            $logger->debug('Executing sql cmd:');
            $logger->debug($statement);
            $dbh->do($statement) or $logger->logdie($dbh->errstr);
        }
    }
    return;
}


# Checks that every input file exists and records its path in $self:
# the two SQL files under _taxo_struct_dmp / _taxo_index_dmp, the four dump
# files under _data->{<table name>}. Dies on the first missing file.
sub _set_options {
    my ($self) = @_;

    $self->{_taxo_struct_dmp} = _require_file($taxo_struct_dmp, 'taxo_struct_dmp');
    $self->{_taxo_index_dmp}  = _require_file($taxo_index_dmp,  'taxo_index_dmp');

    $self->{_data}->{gi_taxid_nucl} = _require_file($data_gi_taxid_nucl, 'data_gi_taxid_nucl');
    $self->{_data}->{gi_taxid_prot} = _require_file($data_gi_taxid_prot, 'data_gi_taxid_prot');
    $self->{_data}->{nodes}         = _require_file($data_nodes,         'data_nodes');
    $self->{_data}->{names}         = _require_file($data_names,         'data_names');
    return;
}


# Returns $path if it exists; otherwise dies with "<path> <label> file not found."
sub _require_file {
    my ($path, $label) = @_;
    -e $path or $logger->logdie($path . ' ' . $label . ' file not found.');
    return $path;
}


# Prints the usage on STDERR and exits with status 1.
sub help {
    my $prog = basename($0);
    print STDERR <<EOF ;
#### $prog ####
#
# AUTHOR:     Sebastien THEIL
# LAST MODIF: 19/09/2015
# PURPOSE:    This script is used to load NCBI taxonomy file into a SQLite database.

USAGE:
      $prog -nucl gi_taxid_nucl.dmp -prot gi_taxid_prot.dmp -names names.dmp -nodes nodes.dmp

      ### OPTIONS ###
      -nucl     <string>   gi_taxid_nucl.dmp file. (Default: $data_gi_taxid_nucl)
      -prot     <string>   gi_taxid_prot.dmp file. (Default: $data_gi_taxid_prot)
      -names    <string>   names.dmp file. (Default: $data_names)
      -nodes    <string>   nodes.dmp file. (Default: $data_nodes)
      -v        <int>      Verbosity level. (0 -> 4).
      -h|-help             Print this help and exit.
EOF
    exit(1);
}