changeset 0:4ecb2ce50254 draft default tip

Uploaded
author stheil
date Mon, 26 Oct 2015 10:59:07 -0400
parents
children
files perl/lib/Logger/Logger.pm perl/scripts/loadTaxonomy.pl taxonomyIndex.sql taxonomyStructure.sql tool-data/taxonomy.loc.sample tool-data/tool_data_table_conf.xml.sample tool_dependencies.xml
diffstat 7 files changed, 399 insertions(+), 0 deletions(-) [+]
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/perl/lib/Logger/Logger.pm	Mon Oct 26 10:59:07 2015 -0400
@@ -0,0 +1,59 @@
+#!/usr/bin/perl
+# $Id: Logger.pm,v 1.3 2011/07/01 18:07:31 nguilhot Exp $
+package Logger::Logger;
+
+##################################################
+## Included modules
+##################################################
+## Perl modules
+use strict;
+use warnings;
+use diagnostics;
+use Log::Log4perl;
+use Log::Log4perl::Layout;
+use Log::Log4perl::Level;
+use Exporter 'import';
+
+our $logger;
+our @EXPORT = qw($logger);
+
+$logger = Log::Log4perl->get_logger('');
+
+# Define stdout Appender, by default messages will only be logged to stdout
+# In order to log messages to a file, you need to call the initFileLoggers
+# method with the name of folder where to create the log files
+my $stdout_layout = Log::Log4perl::Layout::PatternLayout->new("%5p - %m%n");
+my $stdout_appender =  Log::Log4perl::Appender->new(
+        "Log::Log4perl::Appender::Screen",
+        name      => 'screenlog',
+        stderr    => 1);
+$stdout_appender->layout($stdout_layout);
+$stdout_appender->threshold($INFO);
+$logger->add_appender($stdout_appender);
+$logger->level($INFO);
+
+
+
+sub changeMode {
+    my ($s,$verbosity) = @_;
+    my $debug_layout = Log::Log4perl::Layout::PatternLayout->new("%d %5p> %F{1}:%L %M - %m%n");
+    my $debug_appender = Log::Log4perl::Appender->new(
+        "Log::Log4perl::Appender::Screen",
+        name      => 'debuglog',
+        stderr    => 0);
+    $debug_appender->layout($debug_layout);
+    $logger->add_appender($debug_appender);
+    $logger->remove_appender('screenlog');
+
+    if($verbosity == 3){
+        $stdout_appender->threshold($DEBUG);
+        $logger->level($DEBUG);
+    }
+    if($verbosity == 4){
+
+        $stdout_appender->threshold($TRACE);
+        $logger->level($TRACE);
+    }
+}
+
+1;
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/perl/scripts/loadTaxonomy.pl	Mon Oct 26 10:59:07 2015 -0400
@@ -0,0 +1,183 @@
+#!/usr/bin/perl
+use strict ;
+use File::Basename;
+use Data::Dumper;
+use Getopt::Long;
+use Logger::Logger;
+use Tools::Taxonomy;
+use DBI;
+use SQL::SplitStatement;
+
+
+my $taxo_struct_dmp = 'taxonomyStructure.sql';
+my $taxo_index_dmp = 'taxonomyIndex.sql';
+my $data_gi_taxid_nucl = 'gi_taxid_nucl.dmp';
+my $data_gi_taxid_prot = 'gi_taxid_prot.dmp';
+my $data_nodes = 'nodes.dmp';
+my $data_names = 'names.dmp';
+my $verbosity=1;
+
+
+GetOptions(
+  "nucl=s"  => \$data_gi_taxid_nucl,
+  "prot=s"  => \$data_gi_taxid_prot,
+  "names=s" => \$data_names,
+  "nodes=s" => \$data_nodes,
+  "v=i"     => \$verbosity,
+);
+
+
+Logger::Logger->changeMode($verbosity);
+
+
+&main;
+
+
+sub main {
+  my $self={};
+  bless $self;
+  $self->_set_options();
+
+  $self->{_sep} = { names	=> '\t\|\t|\t\|$',
+                    nodes => '\t\|\t|\t\|$',
+                    gi_taxid_prot => '\t',
+                    gi_taxid_nucl => '\t',
+  };
+
+  $self->_create_sqlite_db('taxonomy.sqlite');
+  my $dbh = DBI->connect("dbi:SQLite:dbname=taxonomy_tmp.sqlite","","");
+
+  $self->_insertingCSVDataInDatabase($dbh,$self->{_data});
+  $dbh->disconnect;
+}
+
+sub _insertingCSVDataInDatabase {
+	my ($self,$dbh,$tablesDataFiles) = @_;
+  $logger->info('Inserting tables into database...');
+	foreach my $table (keys %{$tablesDataFiles}){
+    $logger->info($table);
+		my $sth = $dbh->column_info( undef, undef, $table, '%');
+		my $ref = $sth->fetchall_arrayref;
+		my @cols = map { $_->[3] } @$ref;
+
+		$logger->debug("Inserting data in table $table ...\n");
+    $dbh->{AutoCommit} = 0;
+		$sth = $dbh->prepare( "INSERT OR IGNORE INTO $table ( ".join(',', map {"'".$_."'"} @cols)." ) VALUES (".join(',', map {'?'} @cols).")" ) or $logger->logdie($dbh->errstr);
+
+		my $separator = "\t";
+    if(defined $self->{_sep}->{$table}){
+      $separator = $self->{_sep}->{$table};
+    }
+		open (DATA, $tablesDataFiles->{$table});
+
+		while (<DATA>) {
+			chomp;
+			$sth->execute(grep {$_ !~ /^$separator$/} split (/($separator)/, $_)) or $logger->logdie($dbh->errstr);
+		}
+		close DATA;
+
+		$dbh->commit or $logger->logdie($dbh->errstr);
+		$logger->debug("Insertion of data in table $table finished\n");
+	}
+}
+
+
+sub _create_sqlite_db {
+  my ($self,$file) = @_;
+  $logger->info('Creating database.');
+  if(! -e $file){
+    `touch $file`;
+    my $dbh = DBI->connect("dbi:SQLite:dbname=$file","","");
+    $self->_executeSQLFiles($dbh,($self->{_taxo_struct_dmp},$self->{_taxo_index_dmp}));
+    $dbh->disconnect;
+  }
+  else{
+    $logger->warn('Database already exists. Skip...')
+  }
+}
+
+
+sub _executeSQLFiles {
+	my ($self,$dbh,@sqlFiles) = @_;
+	my $sql_splitter = SQL::SplitStatement->new;
+	foreach my $file (@sqlFiles){
+    $logger->debug('Reading sql file:' . $file);
+    my $cmd;
+		open (FILE, $file) or $logger->logdie("Unable to open the SQL file : $file\n");
+		while( <FILE> ){
+      $cmd.= $_;
+    }
+		close FILE;
+
+    my @statements = $sql_splitter->split($cmd);
+		foreach (@statements){
+      $logger->debug('Executing sql cmd:');
+      $logger->debug($_);
+			$dbh-> do($_) or $logger->logdie($dbh->errstr);
+		}
+	}
+}
+
+
+sub _set_options {
+  my ($self)=@_;
+  if(-e $taxo_struct_dmp){
+    $self->{_taxo_struct_dmp} = $taxo_struct_dmp;
+  }
+  else{
+    $logger->logdie($taxo_struct_dmp . ' taxo_struct_dmp file not found.');
+  }
+  if(-e $taxo_index_dmp){
+    $self->{_taxo_index_dmp} = $taxo_index_dmp;
+  }
+  else{
+    $logger->logdie($data_gi_taxid_prot . ' data_gi_taxid_prot file not found.');
+  }
+  if(-e $data_gi_taxid_nucl){
+    $self->{_data}->{gi_taxid_nucl} = $data_gi_taxid_nucl;
+  }
+  else{
+    $logger->logdie($data_gi_taxid_nucl . ' data_gi_taxid_nucl file not found.');
+  }
+  if(-e $data_gi_taxid_prot){
+    $self->{_data}->{gi_taxid_prot} = $data_gi_taxid_prot;
+  }
+  else{
+    $logger->logdie($data_gi_taxid_prot . ' data_gi_taxid_prot file not found.');
+  }
+  if(-e $data_nodes){
+    $self->{_data}->{nodes} = $data_nodes;
+  }
+  else{
+    $logger->logdie($data_nodes . ' data_nodes file not found.');
+  }
+  if(-e $data_names){
+    $self->{_data}->{names} = $data_names;
+  }
+  else{
+    $logger->logdie($data_names . ' data_names file not found.');
+  }
+}
+
+
+sub help {
+my $prog = basename($0);
+print STDERR <<EOF ;
+#### $prog ####
+#
+# AUTHOR:     Sebastien THEIL
+# LAST MODIF: 19/09/2015
+# PURPOSE:    This script is used to load NCBI taxonomy file into a SQLite database.
+
+USAGE:
+      $prog  -i singl.fastq -i singl.fasta -1 R1.fastq -2 R2.fastq ....
+
+			### OPTIONS ###
+      -nucl       <string>   gi_taxid_nucl.dmp file. (Default: $data_gi_taxid_nucl)
+      -prot       <string>   gi_taxid_prot.dmp file. (Default: $data_gi_taxid_prot)
+      -names      <string>   names.dmp file. (Default: $data_names)
+      -nodes      <string>   nodes.dmp file. (Default: $data_nodes)
+      -v          <int>      Verbosity level. (0 -> 4).
+EOF
+exit(1);
+}
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/taxonomyIndex.sql	Mon Oct 26 10:59:07 2015 -0400
@@ -0,0 +1,11 @@
+PRAGMA encoding = "UTF-8";
+PRAGMA default_synchronous = OFF;
+
+CREATE INDEX tax_id ON names(tax_id);
+CREATE INDEX unique_name ON names(unique_name);
+CREATE INDEX name_txt ON names(name_txt);
+
+CREATE INDEX parent_tax_id ON nodes(parent_tax_id);
+
+CREATE INDEX nucl_taxid ON gi_taxid_nucl(tax_id);
+CREATE INDEX prot_taxid ON gi_taxid_prot(tax_id);
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/taxonomyStructure.sql	Mon Oct 26 10:59:07 2015 -0400
@@ -0,0 +1,90 @@
+-- MySQL dump 10.13  Distrib 5.5.31, for debian-linux-gnu (x86_64)
+--
+-- Host: localhost    Database: taxonomy
+-- ------------------------------------------------------
+-- Server version	5.5.31-0ubuntu0.12.04.2
+
+/*!40101 SET @OLD_CHARACTER_SET_CLIENT=@@CHARACTER_SET_CLIENT */;
+/*!40101 SET @OLD_CHARACTER_SET_RESULTS=@@CHARACTER_SET_RESULTS */;
+/*!40101 SET @OLD_COLLATION_CONNECTION=@@COLLATION_CONNECTION */;
+/*!40101 SET NAMES utf8 */;
+/*!40103 SET @OLD_TIME_ZONE=@@TIME_ZONE */;
+/*!40103 SET TIME_ZONE='+00:00' */;
+/*!40014 SET @OLD_UNIQUE_CHECKS=@@UNIQUE_CHECKS, UNIQUE_CHECKS=0 */;
+/*!40014 SET @OLD_FOREIGN_KEY_CHECKS=@@FOREIGN_KEY_CHECKS, FOREIGN_KEY_CHECKS=0 */;
+/*!40101 SET @OLD_SQL_MODE=@@SQL_MODE, SQL_MODE='NO_AUTO_VALUE_ON_ZERO' */;
+/*!40111 SET @OLD_SQL_NOTES=@@SQL_NOTES, SQL_NOTES=0 */;
+
+--
+-- Table structure for table `gi_taxid_nucl`
+--
+PRAGMA encoding = "UTF-8";
+PRAGMA default_synchronous = OFF;
+CREATE TABLE `gi_taxid_nucl` (
+  `gi` double NOT NULL,
+  `tax_id` double NOT NULL,
+  PRIMARY KEY (`gi`)
+) ;
+/*!40101 SET character_set_client = @saved_cs_client */;
+
+--
+-- Table structure for table `gi_taxid_prot`
+--
+
+
+CREATE TABLE `gi_taxid_prot` (
+  `gi` double NOT NULL,
+  `tax_id` double NOT NULL,
+  PRIMARY KEY (`gi`)
+) ;
+/*!40101 SET character_set_client = @saved_cs_client */;
+
+--
+-- Table structure for table `names`
+--
+
+
+CREATE TABLE `names` (
+  `tax_id` int unsigned NOT NULL DEFAULT '0',
+  `name_txt` varchar(255) NOT NULL DEFAULT '',
+  `unique_name` varchar(255) DEFAULT NULL,
+  `name_class` varchar(32) NOT NULL DEFAULT ''
+) ;
+/*!40101 SET character_set_client = @saved_cs_client */;
+
+
+--
+-- Table structure for table `nodes`
+--
+
+
+CREATE TABLE `nodes` (
+  `tax_id` int unsigned NOT NULL DEFAULT '0',
+  `parent_tax_id` int unsigned NOT NULL DEFAULT '0',
+  `rank` varchar(32) DEFAULT NULL,
+  `embl_code` varchar(16) DEFAULT NULL,
+  `division_id` int NOT NULL DEFAULT '0',
+  `inherited_div_flag` int NOT NULL DEFAULT '0',
+  `genetic_code_id` int NOT NULL DEFAULT '0',
+  `inherited_GC_flag` int NOT NULL DEFAULT '0',
+  `mitochondrial_genetic_code_id` int NOT NULL DEFAULT '0',
+  `inherited_MGC_flag` int NOT NULL DEFAULT '0',
+  `GenBank_hidden_flag` int NOT NULL DEFAULT '0',
+  `hidden_subtree_root_flag` int NOT NULL DEFAULT '0',
+  `comments` varchar(255) DEFAULT NULL,
+  PRIMARY KEY (`tax_id`)
+) ;
+
+
+/*!40101 SET character_set_client = @saved_cs_client */;
+/*!40103 SET TIME_ZONE=@OLD_TIME_ZONE */;
+
+/*!40101 SET SQL_MODE=@OLD_SQL_MODE */;
+/*!40014 SET FOREIGN_KEY_CHECKS=@OLD_FOREIGN_KEY_CHECKS */;
+/*!40014 SET UNIQUE_CHECKS=@OLD_UNIQUE_CHECKS */;
+/*!40101 SET CHARACTER_SET_CLIENT=@OLD_CHARACTER_SET_CLIENT */;
+/*!40101 SET CHARACTER_SET_RESULTS=@OLD_CHARACTER_SET_RESULTS */;
+/*!40101 SET COLLATION_CONNECTION=@OLD_COLLATION_CONNECTION */;
+/*!40111 SET SQL_NOTES=@OLD_SQL_NOTES */;
+
+-- Dump completed on 2013-07-02 10:17:11
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tool-data/taxonomy.loc.sample	Mon Oct 26 10:59:07 2015 -0400
@@ -0,0 +1,1 @@
+taxonomy	full taxonomy	/media/data/db/taxonomy/taxonomy.sqlite
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tool-data/tool_data_table_conf.xml.sample	Mon Oct 26 10:59:07 2015 -0400
@@ -0,0 +1,6 @@
+<tables>
+    <table name="taxonomy_sqlite" comment_char="#">
+      <columns>value, name, path</columns>
+      <file path="tool-data/taxonomy.loc" />
+    </table>
+</tables>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tool_dependencies.xml	Mon Oct 26 10:59:07 2015 -0400
@@ -0,0 +1,49 @@
+<?xml version="1.0"?>
+<tool_dependency>
+  <package name="perl" version="5.18.1">
+    <repository changeset_revision="114b6af405fa" name="package_perl_5_18" owner="iuc" prior_installation_required="True" toolshed="https://toolshed.g2.bx.psu.edu"/>
+  </package>
+  <package name="loadTaxonomy" version="1.0">
+    <install version="1.0">
+      <actions>
+        <action type="set_environment_for_install">
+          <repository changeset_revision="114b6af405fa" name="package_perl_5_18" owner="iuc" toolshed="https://toolshed.g2.bx.psu.edu">
+            <package name="perl" version="5.18.1" />
+          </repository>
+        </action>
+        <action type="setup_perl_environment">
+          <package>Data::Dumper</package>
+          <package>File::Basename</package>
+          <package>Getopt::Long</package>
+          <package>DBI</package>
+          <package>SQL::SplitStatement</package>
+          <package>$INSTALL_DIR/perl/lib</package>
+        </action>
+        <action type="download_by_url">
+          ftp://ftp.ncbi.nih.gov/pub/taxonomy/gi_taxid_nucl.dmp.gz
+        </action>
+        <action type="shell_command">
+          gunzip gi_taxid_nucl.dmp.gz
+        </action>
+        <action type="download_by_url">
+          ftp://ftp.ncbi.nih.gov/pub/taxonomy/gi_taxid_prot.dmp.gz
+        </action>
+        <action type="shell_command">
+          gunzip gi_taxid_prot.dmp.gz
+        </action>
+        <action type="download_by_url">
+          ftp://ftp.ncbi.nih.gov/pub/taxonomy/taxdmp.zip
+        </action>
+        <action type="shell_command">
+          gunzip taxdmp.zip
+        </action>
+        <action type="set_environment">
+          <environment_variable name="PATH" action="prepend_to">$INSTALL_DIR/perl/scripts</environment_variable>
+        </action>
+        <action type="shell_command">
+          loadTaxonomy.pl
+        </action>
+      </actions>
+    </install>
+  </package>
+</tool_dependency>