| 0 | 1 #!/usr/bin/perl | 
|  | 2 use strict ; | 
|  | 3 use File::Basename; | 
|  | 4 use Data::Dumper; | 
|  | 5 use Getopt::Long; | 
|  | 6 use Logger::Logger; | 
|  | 7 use Tools::Taxonomy; | 
|  | 8 use DBI; | 
|  | 9 use SQL::SplitStatement; | 
|  | 10 | 
|  | 11 | 
|  | 12 my $taxo_struct_dmp = 'taxonomyStructure.sql'; | 
|  | 13 my $taxo_index_dmp = 'taxonomyIndex.sql'; | 
|  | 14 my $data_gi_taxid_nucl = 'gi_taxid_nucl.dmp'; | 
|  | 15 my $data_gi_taxid_prot = 'gi_taxid_prot.dmp'; | 
|  | 16 my $data_nodes = 'nodes.dmp'; | 
|  | 17 my $data_names = 'names.dmp'; | 
|  | 18 my $verbosity=1; | 
|  | 19 | 
|  | 20 | 
|  | 21 GetOptions( | 
|  | 22   "nucl=s"  => \$data_gi_taxid_nucl, | 
|  | 23   "prot=s"  => \$data_gi_taxid_prot, | 
|  | 24   "names=s" => \$data_names, | 
|  | 25   "nodes=s" => \$data_nodes, | 
|  | 26   "v=i"     => \$verbosity, | 
|  | 27 ); | 
|  | 28 | 
|  | 29 | 
|  | 30 Logger::Logger->changeMode($verbosity); | 
|  | 31 | 
|  | 32 | 
|  | 33 &main; | 
|  | 34 | 
|  | 35 | 
|  | 36 sub main { | 
|  | 37   my $self={}; | 
|  | 38   bless $self; | 
|  | 39   $self->_set_options(); | 
|  | 40 | 
|  | 41   $self->{_sep} = { names	=> '\t\|\t|\t\|$', | 
|  | 42                     nodes => '\t\|\t|\t\|$', | 
|  | 43                     gi_taxid_prot => '\t', | 
|  | 44                     gi_taxid_nucl => '\t', | 
|  | 45   }; | 
|  | 46 | 
|  | 47   $self->_create_sqlite_db('taxonomy.sqlite'); | 
|  | 48   my $dbh = DBI->connect("dbi:SQLite:dbname=taxonomy_tmp.sqlite","",""); | 
|  | 49 | 
|  | 50   $self->_insertingCSVDataInDatabase($dbh,$self->{_data}); | 
|  | 51   $dbh->disconnect; | 
|  | 52 } | 
|  | 53 | 
|  | 54 sub _insertingCSVDataInDatabase { | 
|  | 55 	my ($self,$dbh,$tablesDataFiles) = @_; | 
|  | 56   $logger->info('Inserting tables into database...'); | 
|  | 57 	foreach my $table (keys %{$tablesDataFiles}){ | 
|  | 58     $logger->info($table); | 
|  | 59 		my $sth = $dbh->column_info( undef, undef, $table, '%'); | 
|  | 60 		my $ref = $sth->fetchall_arrayref; | 
|  | 61 		my @cols = map { $_->[3] } @$ref; | 
|  | 62 | 
|  | 63 		$logger->debug("Inserting data in table $table ...\n"); | 
|  | 64     $dbh->{AutoCommit} = 0; | 
|  | 65 		$sth = $dbh->prepare( "INSERT OR IGNORE INTO $table ( ".join(',', map {"'".$_."'"} @cols)." ) VALUES (".join(',', map {'?'} @cols).")" ) or $logger->logdie($dbh->errstr); | 
|  | 66 | 
|  | 67 		my $separator = "\t"; | 
|  | 68     if(defined $self->{_sep}->{$table}){ | 
|  | 69       $separator = $self->{_sep}->{$table}; | 
|  | 70     } | 
|  | 71 		open (DATA, $tablesDataFiles->{$table}); | 
|  | 72 | 
|  | 73 		while (<DATA>) { | 
|  | 74 			chomp; | 
|  | 75 			$sth->execute(grep {$_ !~ /^$separator$/} split (/($separator)/, $_)) or $logger->logdie($dbh->errstr); | 
|  | 76 		} | 
|  | 77 		close DATA; | 
|  | 78 | 
|  | 79 		$dbh->commit or $logger->logdie($dbh->errstr); | 
|  | 80 		$logger->debug("Insertion of data in table $table finished\n"); | 
|  | 81 	} | 
|  | 82 } | 
|  | 83 | 
|  | 84 | 
|  | 85 sub _create_sqlite_db { | 
|  | 86   my ($self,$file) = @_; | 
|  | 87   $logger->info('Creating database.'); | 
|  | 88   if(! -e $file){ | 
|  | 89     `touch $file`; | 
|  | 90     my $dbh = DBI->connect("dbi:SQLite:dbname=$file","",""); | 
|  | 91     $self->_executeSQLFiles($dbh,($self->{_taxo_struct_dmp},$self->{_taxo_index_dmp})); | 
|  | 92     $dbh->disconnect; | 
|  | 93   } | 
|  | 94   else{ | 
|  | 95     $logger->warn('Database already exists. Skip...') | 
|  | 96   } | 
|  | 97 } | 
|  | 98 | 
|  | 99 | 
|  | 100 sub _executeSQLFiles { | 
|  | 101 	my ($self,$dbh,@sqlFiles) = @_; | 
|  | 102 	my $sql_splitter = SQL::SplitStatement->new; | 
|  | 103 	foreach my $file (@sqlFiles){ | 
|  | 104     $logger->debug('Reading sql file:' . $file); | 
|  | 105     my $cmd; | 
|  | 106 		open (FILE, $file) or $logger->logdie("Unable to open the SQL file : $file\n"); | 
|  | 107 		while( <FILE> ){ | 
|  | 108       $cmd.= $_; | 
|  | 109     } | 
|  | 110 		close FILE; | 
|  | 111 | 
|  | 112     my @statements = $sql_splitter->split($cmd); | 
|  | 113 		foreach (@statements){ | 
|  | 114       $logger->debug('Executing sql cmd:'); | 
|  | 115       $logger->debug($_); | 
|  | 116 			$dbh-> do($_) or $logger->logdie($dbh->errstr); | 
|  | 117 		} | 
|  | 118 	} | 
|  | 119 } | 
|  | 120 | 
|  | 121 | 
|  | 122 sub _set_options { | 
|  | 123   my ($self)=@_; | 
|  | 124   if(-e $taxo_struct_dmp){ | 
|  | 125     $self->{_taxo_struct_dmp} = $taxo_struct_dmp; | 
|  | 126   } | 
|  | 127   else{ | 
|  | 128     $logger->logdie($taxo_struct_dmp . ' taxo_struct_dmp file not found.'); | 
|  | 129   } | 
|  | 130   if(-e $taxo_index_dmp){ | 
|  | 131     $self->{_taxo_index_dmp} = $taxo_index_dmp; | 
|  | 132   } | 
|  | 133   else{ | 
|  | 134     $logger->logdie($data_gi_taxid_prot . ' data_gi_taxid_prot file not found.'); | 
|  | 135   } | 
|  | 136   if(-e $data_gi_taxid_nucl){ | 
|  | 137     $self->{_data}->{gi_taxid_nucl} = $data_gi_taxid_nucl; | 
|  | 138   } | 
|  | 139   else{ | 
|  | 140     $logger->logdie($data_gi_taxid_nucl . ' data_gi_taxid_nucl file not found.'); | 
|  | 141   } | 
|  | 142   if(-e $data_gi_taxid_prot){ | 
|  | 143     $self->{_data}->{gi_taxid_prot} = $data_gi_taxid_prot; | 
|  | 144   } | 
|  | 145   else{ | 
|  | 146     $logger->logdie($data_gi_taxid_prot . ' data_gi_taxid_prot file not found.'); | 
|  | 147   } | 
|  | 148   if(-e $data_nodes){ | 
|  | 149     $self->{_data}->{nodes} = $data_nodes; | 
|  | 150   } | 
|  | 151   else{ | 
|  | 152     $logger->logdie($data_nodes . ' data_nodes file not found.'); | 
|  | 153   } | 
|  | 154   if(-e $data_names){ | 
|  | 155     $self->{_data}->{names} = $data_names; | 
|  | 156   } | 
|  | 157   else{ | 
|  | 158     $logger->logdie($data_names . ' data_names file not found.'); | 
|  | 159   } | 
|  | 160 } | 
|  | 161 | 
|  | 162 | 
|  | 163 sub help { | 
|  | 164 my $prog = basename($0); | 
|  | 165 print STDERR <<EOF ; | 
|  | 166 #### $prog #### | 
|  | 167 # | 
|  | 168 # AUTHOR:     Sebastien THEIL | 
|  | 169 # LAST MODIF: 19/09/2015 | 
|  | 170 # PURPOSE:    This script is used to load NCBI taxonomy file into a SQLite database. | 
|  | 171 | 
|  | 172 USAGE: | 
|  | 173       $prog  -i singl.fastq -i singl.fasta -1 R1.fastq -2 R2.fastq .... | 
|  | 174 | 
|  | 175 			### OPTIONS ### | 
|  | 176       -nucl       <string>   gi_taxid_nucl.dmp file. (Default: $data_gi_taxid_nucl) | 
|  | 177       -prot       <string>   gi_taxid_prot.dmp file. (Default: $data_gi_taxid_prot) | 
|  | 178       -names      <string>   names.dmp file. (Default: $data_names) | 
|  | 179       -nodes      <string>   nodes.dmp file. (Default: $data_nodes) | 
|  | 180       -v          <int>      Verbosity level. (0 -> 4). | 
|  | 181 EOF | 
|  | 182 exit(1); | 
|  | 183 } |