Mercurial > repos > stheil > taxonomy_sqlite
changeset 0:4ecb2ce50254 draft default tip
Uploaded
author | stheil |
---|---|
date | Mon, 26 Oct 2015 10:59:07 -0400 |
parents | |
children | |
files | perl/lib/Logger/Logger.pm perl/scripts/loadTaxonomy.pl taxonomyIndex.sql taxonomyStructure.sql tool-data/taxonomy.loc.sample tool-data/tool_data_table_conf.xml.sample tool_dependencies.xml |
diffstat | 7 files changed, 399 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/perl/lib/Logger/Logger.pm Mon Oct 26 10:59:07 2015 -0400 @@ -0,0 +1,59 @@ +#!/usr/bin/perl +# $Id: Logger.pm,v 1.3 2011/07/01 18:07:31 nguilhot Exp $ +package Logger::Logger; + +################################################## +## Included modules +################################################## +## Perl modules +use strict; +use warnings; +use diagnostics; +use Log::Log4perl; +use Log::Log4perl::Layout; +use Log::Log4perl::Level; +use Exporter 'import'; + +our $logger; +our @EXPORT = qw($logger); + +$logger = Log::Log4perl->get_logger(''); + +# Define stdout Appender, by default messages will only be logged to stdout +# In order to log messages to a file, you need to call the initFileLoggers +# method with the name of folder where to create the log files +my $stdout_layout = Log::Log4perl::Layout::PatternLayout->new("%5p - %m%n"); +my $stdout_appender = Log::Log4perl::Appender->new( + "Log::Log4perl::Appender::Screen", + name => 'screenlog', + stderr => 1); +$stdout_appender->layout($stdout_layout); +$stdout_appender->threshold($INFO); +$logger->add_appender($stdout_appender); +$logger->level($INFO); + + + +sub changeMode { + my ($s,$verbosity) = @_; + my $debug_layout = Log::Log4perl::Layout::PatternLayout->new("%d %5p> %F{1}:%L %M - %m%n"); + my $debug_appender = Log::Log4perl::Appender->new( + "Log::Log4perl::Appender::Screen", + name => 'debuglog', + stderr => 0); + $debug_appender->layout($debug_layout); + $logger->add_appender($debug_appender); + $logger->remove_appender('screenlog'); + + if($verbosity == 3){ + $stdout_appender->threshold($DEBUG); + $logger->level($DEBUG); + } + if($verbosity == 4){ + + $stdout_appender->threshold($TRACE); + $logger->level($TRACE); + } +} + +1;
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/perl/scripts/loadTaxonomy.pl Mon Oct 26 10:59:07 2015 -0400 @@ -0,0 +1,183 @@ +#!/usr/bin/perl +use strict ; +use File::Basename; +use Data::Dumper; +use Getopt::Long; +use Logger::Logger; +use Tools::Taxonomy; +use DBI; +use SQL::SplitStatement; + + +my $taxo_struct_dmp = 'taxonomyStructure.sql'; +my $taxo_index_dmp = 'taxonomyIndex.sql'; +my $data_gi_taxid_nucl = 'gi_taxid_nucl.dmp'; +my $data_gi_taxid_prot = 'gi_taxid_prot.dmp'; +my $data_nodes = 'nodes.dmp'; +my $data_names = 'names.dmp'; +my $verbosity=1; + + +GetOptions( + "nucl=s" => \$data_gi_taxid_nucl, + "prot=s" => \$data_gi_taxid_prot, + "names=s" => \$data_names, + "nodes=s" => \$data_nodes, + "v=i" => \$verbosity, +); + + +Logger::Logger->changeMode($verbosity); + + +&main; + + +sub main { + my $self={}; + bless $self; + $self->_set_options(); + + $self->{_sep} = { names => '\t\|\t|\t\|$', + nodes => '\t\|\t|\t\|$', + gi_taxid_prot => '\t', + gi_taxid_nucl => '\t', + }; + + $self->_create_sqlite_db('taxonomy.sqlite'); + my $dbh = DBI->connect("dbi:SQLite:dbname=taxonomy_tmp.sqlite","",""); + + $self->_insertingCSVDataInDatabase($dbh,$self->{_data}); + $dbh->disconnect; +} + +sub _insertingCSVDataInDatabase { + my ($self,$dbh,$tablesDataFiles) = @_; + $logger->info('Inserting tables into database...'); + foreach my $table (keys %{$tablesDataFiles}){ + $logger->info($table); + my $sth = $dbh->column_info( undef, undef, $table, '%'); + my $ref = $sth->fetchall_arrayref; + my @cols = map { $_->[3] } @$ref; + + $logger->debug("Inserting data in table $table ...\n"); + $dbh->{AutoCommit} = 0; + $sth = $dbh->prepare( "INSERT OR IGNORE INTO $table ( ".join(',', map {"'".$_."'"} @cols)." ) VALUES (".join(',', map {'?'} @cols).")" ) or $logger->logdie($dbh->errstr); + + my $separator = "\t"; + if(defined $self->{_sep}->{$table}){ + $separator = $self->{_sep}->{$table}; + } + open (DATA, $tablesDataFiles->{$table}); + + while (<DATA>) { + chomp; + $sth->execute(grep {$_ !~ /^$separator$/} split (/($separator)/, $_)) or $logger->logdie($dbh->errstr); + } + close DATA; + + $dbh->commit or $logger->logdie($dbh->errstr); + $logger->debug("Insertion of data in table $table finished\n"); + } +} + + +sub _create_sqlite_db { + my ($self,$file) = @_; + $logger->info('Creating database.'); + if(! -e $file){ + `touch $file`; + my $dbh = DBI->connect("dbi:SQLite:dbname=$file","",""); + $self->_executeSQLFiles($dbh,($self->{_taxo_struct_dmp},$self->{_taxo_index_dmp})); + $dbh->disconnect; + } + else{ + $logger->warn('Database already exists. Skip...') + } +} + + +sub _executeSQLFiles { + my ($self,$dbh,@sqlFiles) = @_; + my $sql_splitter = SQL::SplitStatement->new; + foreach my $file (@sqlFiles){ + $logger->debug('Reading sql file:' . $file); + my $cmd; + open (FILE, $file) or $logger->logdie("Unable to open the SQL file : $file\n"); + while( <FILE> ){ + $cmd.= $_; + } + close FILE; + + my @statements = $sql_splitter->split($cmd); + foreach (@statements){ + $logger->debug('Executing sql cmd:'); + $logger->debug($_); + $dbh-> do($_) or $logger->logdie($dbh->errstr); + } + } +} + + +sub _set_options { + my ($self)=@_; + if(-e $taxo_struct_dmp){ + $self->{_taxo_struct_dmp} = $taxo_struct_dmp; + } + else{ + $logger->logdie($taxo_struct_dmp . ' taxo_struct_dmp file not found.'); + } + if(-e $taxo_index_dmp){ + $self->{_taxo_index_dmp} = $taxo_index_dmp; + } + else{ + $logger->logdie($data_gi_taxid_prot . ' data_gi_taxid_prot file not found.'); + } + if(-e $data_gi_taxid_nucl){ + $self->{_data}->{gi_taxid_nucl} = $data_gi_taxid_nucl; + } + else{ + $logger->logdie($data_gi_taxid_nucl . ' data_gi_taxid_nucl file not found.'); + } + if(-e $data_gi_taxid_prot){ + $self->{_data}->{gi_taxid_prot} = $data_gi_taxid_prot; + } + else{ + $logger->logdie($data_gi_taxid_prot . ' data_gi_taxid_prot file not found.'); + } + if(-e $data_nodes){ + $self->{_data}->{nodes} = $data_nodes; + } + else{ + $logger->logdie($data_nodes . ' data_nodes file not found.'); + } + if(-e $data_names){ + $self->{_data}->{names} = $data_names; + } + else{ + $logger->logdie($data_names . ' data_names file not found.'); + } +} + + +sub help { +my $prog = basename($0); +print STDERR <<EOF ; +#### $prog #### +# +# AUTHOR: Sebastien THEIL +# LAST MODIF: 19/09/2015 +# PURPOSE: This script is used to load NCBI taxonomy file into a SQLite database. + +USAGE: + $prog -i singl.fastq -i singl.fasta -1 R1.fastq -2 R2.fastq .... + + ### OPTIONS ### + -nucl <string> gi_taxid_nucl.dmp file. (Default: $data_gi_taxid_nucl) + -prot <string> gi_taxid_prot.dmp file. (Default: $data_gi_taxid_prot) + -names <string> names.dmp file. (Default: $data_names) + -nodes <string> nodes.dmp file. (Default: $data_nodes) + -v <int> Verbosity level. (0 -> 4). +EOF +exit(1); +}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/taxonomyIndex.sql Mon Oct 26 10:59:07 2015 -0400 @@ -0,0 +1,11 @@ +PRAGMA encoding = "UTF-8"; +PRAGMA default_synchronous = OFF; + +CREATE INDEX tax_id ON names(tax_id); +CREATE INDEX unique_name ON names(unique_name); +CREATE INDEX name_txt ON names(name_txt); + +CREATE INDEX parent_tax_id ON nodes(parent_tax_id); + +CREATE INDEX nucl_taxid ON gi_taxid_nucl(tax_id); +CREATE INDEX prot_taxid ON gi_taxid_prot(tax_id);
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/taxonomyStructure.sql Mon Oct 26 10:59:07 2015 -0400 @@ -0,0 +1,90 @@ +-- MySQL dump 10.13 Distrib 5.5.31, for debian-linux-gnu (x86_64) +-- +-- Host: localhost Database: taxonomy +-- ------------------------------------------------------ +-- Server version 5.5.31-0ubuntu0.12.04.2 + +/*!40101 SET @OLD_CHARACTER_SET_CLIENT=@@CHARACTER_SET_CLIENT */; +/*!40101 SET @OLD_CHARACTER_SET_RESULTS=@@CHARACTER_SET_RESULTS */; +/*!40101 SET @OLD_COLLATION_CONNECTION=@@COLLATION_CONNECTION */; +/*!40101 SET NAMES utf8 */; +/*!40103 SET @OLD_TIME_ZONE=@@TIME_ZONE */; +/*!40103 SET TIME_ZONE='+00:00' */; +/*!40014 SET @OLD_UNIQUE_CHECKS=@@UNIQUE_CHECKS, UNIQUE_CHECKS=0 */; +/*!40014 SET @OLD_FOREIGN_KEY_CHECKS=@@FOREIGN_KEY_CHECKS, FOREIGN_KEY_CHECKS=0 */; +/*!40101 SET @OLD_SQL_MODE=@@SQL_MODE, SQL_MODE='NO_AUTO_VALUE_ON_ZERO' */; +/*!40111 SET @OLD_SQL_NOTES=@@SQL_NOTES, SQL_NOTES=0 */; + +-- +-- Table structure for table `gi_taxid_nucl` +-- +PRAGMA encoding = "UTF-8"; +PRAGMA default_synchronous = OFF; +CREATE TABLE `gi_taxid_nucl` ( + `gi` double NOT NULL, + `tax_id` double NOT NULL, + PRIMARY KEY (`gi`) +) ; +/*!40101 SET character_set_client = @saved_cs_client */; + +-- +-- Table structure for table `gi_taxid_prot` +-- + + +CREATE TABLE `gi_taxid_prot` ( + `gi` double NOT NULL, + `tax_id` double NOT NULL, + PRIMARY KEY (`gi`) +) ; +/*!40101 SET character_set_client = @saved_cs_client */; + +-- +-- Table structure for table `names` +-- + + +CREATE TABLE `names` ( + `tax_id` int unsigned NOT NULL DEFAULT '0', + `name_txt` varchar(255) NOT NULL DEFAULT '', + `unique_name` varchar(255) DEFAULT NULL, + `name_class` varchar(32) NOT NULL DEFAULT '' +) ; +/*!40101 SET character_set_client = @saved_cs_client */; + + +-- +-- Table structure for table `nodes` +-- + + +CREATE TABLE `nodes` ( + `tax_id` int unsigned NOT NULL DEFAULT '0', + `parent_tax_id` int unsigned NOT NULL DEFAULT '0', + `rank` varchar(32) DEFAULT NULL, + `embl_code` varchar(16) DEFAULT NULL, + `division_id` int NOT NULL DEFAULT '0', + `inherited_div_flag` int NOT NULL DEFAULT '0', + `genetic_code_id` int NOT NULL DEFAULT '0', + `inherited_GC_flag` int NOT NULL DEFAULT '0', + `mitochondrial_genetic_code_id` int NOT NULL DEFAULT '0', + `inherited_MGC_flag` int NOT NULL DEFAULT '0', + `GenBank_hidden_flag` int NOT NULL DEFAULT '0', + `hidden_subtree_root_flag` int NOT NULL DEFAULT '0', + `comments` varchar(255) DEFAULT NULL, + PRIMARY KEY (`tax_id`) +) ; + + +/*!40101 SET character_set_client = @saved_cs_client */; +/*!40103 SET TIME_ZONE=@OLD_TIME_ZONE */; + +/*!40101 SET SQL_MODE=@OLD_SQL_MODE */; +/*!40014 SET FOREIGN_KEY_CHECKS=@OLD_FOREIGN_KEY_CHECKS */; +/*!40014 SET UNIQUE_CHECKS=@OLD_UNIQUE_CHECKS */; +/*!40101 SET CHARACTER_SET_CLIENT=@OLD_CHARACTER_SET_CLIENT */; +/*!40101 SET CHARACTER_SET_RESULTS=@OLD_CHARACTER_SET_RESULTS */; +/*!40101 SET COLLATION_CONNECTION=@OLD_COLLATION_CONNECTION */; +/*!40111 SET SQL_NOTES=@OLD_SQL_NOTES */; + +-- Dump completed on 2013-07-02 10:17:11
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tool-data/taxonomy.loc.sample Mon Oct 26 10:59:07 2015 -0400 @@ -0,0 +1,1 @@ +taxonomy full taxonomy /media/data/db/taxonomy/taxonomy.sqlite
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tool-data/tool_data_table_conf.xml.sample Mon Oct 26 10:59:07 2015 -0400 @@ -0,0 +1,6 @@ +<tables> + <table name="taxonomy_sqlite" comment_char="#"> + <columns>value, name, path</columns> + <file path="tool-data/taxonomy.loc" /> + </table> +</tables>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tool_dependencies.xml Mon Oct 26 10:59:07 2015 -0400 @@ -0,0 +1,49 @@ +<?xml version="1.0"?> +<tool_dependency> + <package name="perl" version="5.18.1"> + <repository changeset_revision="114b6af405fa" name="package_perl_5_18" owner="iuc" prior_installation_required="True" toolshed="https://toolshed.g2.bx.psu.edu"/> + </package> + <package name="loadTaxonomy" version="1.0"> + <install version="1.0"> + <actions> + <action type="set_environment_for_install"> + <repository changeset_revision="114b6af405fa" name="package_perl_5_18" owner="iuc" toolshed="https://toolshed.g2.bx.psu.edu"> + <package name="perl" version="5.18.1" /> + </repository> + </action> + <action type="setup_perl_environment"> + <package>Data::Dumper</package> + <package>File::Basename</package> + <package>Getopt::Long</package> + <package>DBI</package> + <package>SQL::SplitStatement</package> + <package>$INSTALL_DIR/perl/lib</package> + </action> + <action type="download_by_url"> + ftp://ftp.ncbi.nih.gov/pub/taxonomy/gi_taxid_nucl.dmp.gz + </action> + <action type="shell_command"> + gunzip gi_taxid_nucl.dmp.gz + </action> + <action type="download_by_url"> + ftp://ftp.ncbi.nih.gov/pub/taxonomy/gi_taxid_prot.dmp.gz + </action> + <action type="shell_command"> + gunzip gi_taxid_prot.dmp.gz + </action> + <action type="download_by_url"> + ftp://ftp.ncbi.nih.gov/pub/taxonomy/taxdmp.zip + </action> + <action type="shell_command"> + gunzip taxdmp.zip + </action> + <action type="set_environment"> + <environment_variable name="PATH" action="prepend_to">$INSTALL_DIR/perl/scripts</environment_variable> + </action> + <action type="shell_command"> + loadTaxonomy.pl + </action> + </actions> + </install> + </package> +</tool_dependency>