Mercurial > repos > stheil > taxonomy_sqlite
view perl/scripts/loadTaxonomy.pl @ 0:4ecb2ce50254 draft default tip
Uploaded
author | stheil |
---|---|
date | Mon, 26 Oct 2015 10:59:07 -0400 |
parents | |
children |
line wrap: on
line source
#!/usr/bin/perl
#
# loadTaxonomy.pl - load NCBI taxonomy dump files into a SQLite database.
#
# The schema and index SQL files are executed against a fresh database file,
# then every data file is bulk-inserted into the table that bears its name.

use strict;
use warnings;
use File::Basename;
use Data::Dumper;
use Getopt::Long;
use Logger::Logger;
use Tools::Taxonomy;
use DBI;
use SQL::SplitStatement;

# NOTE(review): $logger is not declared in this file; it is presumably
# exported by Logger::Logger -- confirm against that module.

# SQL files describing the schema and the indexes of the database.
my $taxo_struct_dmp = 'taxonomyStructure.sql';
my $taxo_index_dmp  = 'taxonomyIndex.sql';

# Default locations of the NCBI dump files (overridable on the command line).
my $data_gi_taxid_nucl = 'gi_taxid_nucl.dmp';
my $data_gi_taxid_prot = 'gi_taxid_prot.dmp';
my $data_nodes         = 'nodes.dmp';
my $data_names         = 'names.dmp';
my $verbosity          = 1;

# Single database file: the schema is created in it AND the data is loaded
# into it. (Previously the data was inserted into a different file than the
# one holding the schema.)
my $db_file = 'taxonomy.sqlite';

GetOptions(
    "nucl=s"  => \$data_gi_taxid_nucl,
    "prot=s"  => \$data_gi_taxid_prot,
    "names=s" => \$data_names,
    "nodes=s" => \$data_nodes,
    "v=i"     => \$verbosity,
    "help|h"  => sub { help() },
) or help();

Logger::Logger->changeMode($verbosity);

main();


# main()
#   Entry point: validates the input files, creates the database if needed
#   and loads every data file into it.
sub main {
    my $self = {};
    bless $self, __PACKAGE__;

    $self->_set_options();

    # Field separators (regex source) per table. names/nodes use the NCBI
    # "<tab>|<tab>" convention with a trailing "<tab>|" at end of line.
    $self->{_sep} = {
        names         => '\t\|\t|\t\|$',
        nodes         => '\t\|\t|\t\|$',
        gi_taxid_prot => '\t',
        gi_taxid_nucl => '\t',
    };

    $self->_create_sqlite_db($db_file);

    my $dbh = DBI->connect("dbi:SQLite:dbname=$db_file", "", "")
        or $logger->logdie("Unable to connect to $db_file : " . $DBI::errstr);
    $self->_insertingCSVDataInDatabase($dbh, $self->{_data});
    $dbh->disconnect;
    return;
}


# _insertingCSVDataInDatabase($dbh, \%tablesDataFiles)
#   For each "table name => data file" pair, reads the file line by line and
#   inserts each line as one row (INSERT OR IGNORE). One transaction is
#   committed per table. Dies through the logger on any error.
sub _insertingCSVDataInDatabase {
    my ($self, $dbh, $tablesDataFiles) = @_;

    $logger->info('Inserting tables into database...');

    foreach my $table (keys %{$tablesDataFiles}) {
        $logger->info($table);

        # Column names are read from the database so that the INSERT
        # statement matches the table definition.
        my $info_sth = $dbh->column_info(undef, undef, $table, '%');
        my $ref      = $info_sth->fetchall_arrayref;
        my @cols     = map { $_->[3] } @$ref;
        if (!@cols) {
            $logger->logdie("No column found for table $table in database.");
        }

        $logger->debug("Inserting data in table $table ...\n");
        $dbh->{AutoCommit} = 0;

        my $sth = $dbh->prepare(
            "INSERT OR IGNORE INTO $table ( "
            . join(',', map { "'" . $_ . "'" } @cols)
            . " ) VALUES (" . join(',', map { '?' } @cols) . ")"
        ) or $logger->logdie($dbh->errstr);

        my $separator = "\t";
        if (defined $self->{_sep}->{$table}) {
            $separator = $self->{_sep}->{$table};
        }
        # qr// wraps the pattern in a group, so the alternation inside the
        # separator cannot escape the anchors used in the filter below.
        my $sep_re = qr/$separator/;

        my $data_file = $tablesDataFiles->{$table};
        open(my $data_fh, '<', $data_file)
            or $logger->logdie("Unable to open the data file : $data_file ($!)\n");

        while (my $line = <$data_fh>) {
            chomp $line;
            # Splitting with a capturing group keeps the separators in the
            # list, which preserves an empty last field sitting in front of
            # the end-of-line separator; the separators are then dropped.
            my @fields = grep { $_ !~ /\A$sep_re\z/ } split(/($sep_re)/, $line);
            $sth->execute(@fields) or $logger->logdie($dbh->errstr);
        }
        close $data_fh;

        $dbh->commit or $logger->logdie($dbh->errstr);
        $logger->debug("Insertion of data in table $table finished\n");
    }
    return;
}


# _create_sqlite_db($file)
#   Creates the SQLite database $file and runs the structure and index SQL
#   files against it. Does nothing (with a warning) if $file already exists.
sub _create_sqlite_db {
    my ($self, $file) = @_;

    $logger->info('Creating database.');

    if (-e $file) {
        $logger->warn('Database already exists. Skip...');
        return;
    }

    # No need to create the file beforehand: SQLite creates it on connect.
    my $dbh = DBI->connect("dbi:SQLite:dbname=$file", "", "")
        or $logger->logdie("Unable to create database $file : " . $DBI::errstr);
    $self->_executeSQLFiles($dbh, ($self->{_taxo_struct_dmp}, $self->{_taxo_index_dmp}));
    $dbh->disconnect;
    return;
}


# _executeSQLFiles($dbh, @sqlFiles)
#   Reads each SQL file, splits it into single statements and executes them
#   in order. Dies through the logger on the first failure.
sub _executeSQLFiles {
    my ($self, $dbh, @sqlFiles) = @_;

    my $sql_splitter = SQL::SplitStatement->new;

    foreach my $file (@sqlFiles) {
        $logger->debug('Reading sql file:' . $file);

        open(my $sql_fh, '<', $file)
            or $logger->logdie("Unable to open the SQL file : $file ($!)\n");
        my $cmd = do { local $/; <$sql_fh> };
        close $sql_fh;
        $cmd = '' unless defined $cmd;    # empty file

        foreach my $statement ($sql_splitter->split($cmd)) {
            $logger->debug('Executing sql cmd:');
            $logger->debug($statement);
            $dbh->do($statement) or $logger->logdie($dbh->errstr);
        }
    }
    return;
}


# _require_file($path, $label)
#   Returns $path if the file exists, dies through the logger otherwise.
sub _require_file {
    my ($path, $label) = @_;
    if (!-e $path) {
        $logger->logdie($path . ' ' . $label . ' file not found.');
    }
    return $path;
}


# _set_options()
#   Checks that every SQL and data file exists and stores their paths in the
#   object: SQL files under _taxo_struct_dmp / _taxo_index_dmp, data files
#   under _data->{<table name>}.
sub _set_options {
    my ($self) = @_;

    $self->{_taxo_struct_dmp} = _require_file($taxo_struct_dmp, 'taxo_struct_dmp');
    $self->{_taxo_index_dmp}  = _require_file($taxo_index_dmp,  'taxo_index_dmp');

    $self->{_data}->{gi_taxid_nucl} = _require_file($data_gi_taxid_nucl, 'data_gi_taxid_nucl');
    $self->{_data}->{gi_taxid_prot} = _require_file($data_gi_taxid_prot, 'data_gi_taxid_prot');
    $self->{_data}->{nodes}         = _require_file($data_nodes,         'data_nodes');
    $self->{_data}->{names}         = _require_file($data_names,         'data_names');
    return;
}


# help()
#   Prints the usage message on STDERR and exits with status 1.
sub help {
    my $prog = basename($0);
    print STDERR <<EOF ;
#### $prog ####
#
# AUTHOR:     Sebastien THEIL
# LAST MODIF: 19/09/2015
# PURPOSE:    This script is used to load NCBI taxonomy file into a SQLite database.

USAGE:
      $prog -nucl gi_taxid_nucl.dmp -prot gi_taxid_prot.dmp -names names.dmp -nodes nodes.dmp

      ### OPTIONS ###
      -nucl       <string>  gi_taxid_nucl.dmp file. (Default: $data_gi_taxid_nucl)
      -prot       <string>  gi_taxid_prot.dmp file. (Default: $data_gi_taxid_prot)
      -names      <string>  names.dmp file. (Default: $data_names)
      -nodes      <string>  nodes.dmp file. (Default: $data_nodes)
      -v          <int>     Verbosity level. (0 -> 4).
      -help                 Print this message and exit.
EOF
    exit(1);
}