0
|
1 #!/usr/bin/perl
|
|
2 use strict ;
|
|
3 use File::Basename;
|
|
4 use Data::Dumper;
|
|
5 use Getopt::Long;
|
|
6 use Logger::Logger;
|
|
7 use Tools::Taxonomy;
|
|
8 use DBI;
|
|
9 use SQL::SplitStatement;
|
|
10
|
|
11
|
|
12 my $taxo_struct_dmp = 'taxonomyStructure.sql';
|
|
13 my $taxo_index_dmp = 'taxonomyIndex.sql';
|
|
14 my $data_gi_taxid_nucl = 'gi_taxid_nucl.dmp';
|
|
15 my $data_gi_taxid_prot = 'gi_taxid_prot.dmp';
|
|
16 my $data_nodes = 'nodes.dmp';
|
|
17 my $data_names = 'names.dmp';
|
|
18 my $verbosity=1;
|
|
19
|
|
20
|
|
21 GetOptions(
|
|
22 "nucl=s" => \$data_gi_taxid_nucl,
|
|
23 "prot=s" => \$data_gi_taxid_prot,
|
|
24 "names=s" => \$data_names,
|
|
25 "nodes=s" => \$data_nodes,
|
|
26 "v=i" => \$verbosity,
|
|
27 );
|
|
28
|
|
29
|
|
30 Logger::Logger->changeMode($verbosity);
|
|
31
|
|
32
|
|
33 &main;
|
|
34
|
|
35
|
|
36 sub main {
|
|
37 my $self={};
|
|
38 bless $self;
|
|
39 $self->_set_options();
|
|
40
|
|
41 $self->{_sep} = { names => '\t\|\t|\t\|$',
|
|
42 nodes => '\t\|\t|\t\|$',
|
|
43 gi_taxid_prot => '\t',
|
|
44 gi_taxid_nucl => '\t',
|
|
45 };
|
|
46
|
|
47 $self->_create_sqlite_db('taxonomy.sqlite');
|
|
48 my $dbh = DBI->connect("dbi:SQLite:dbname=taxonomy_tmp.sqlite","","");
|
|
49
|
|
50 $self->_insertingCSVDataInDatabase($dbh,$self->{_data});
|
|
51 $dbh->disconnect;
|
|
52 }
|
|
53
|
|
54 sub _insertingCSVDataInDatabase {
|
|
55 my ($self,$dbh,$tablesDataFiles) = @_;
|
|
56 $logger->info('Inserting tables into database...');
|
|
57 foreach my $table (keys %{$tablesDataFiles}){
|
|
58 $logger->info($table);
|
|
59 my $sth = $dbh->column_info( undef, undef, $table, '%');
|
|
60 my $ref = $sth->fetchall_arrayref;
|
|
61 my @cols = map { $_->[3] } @$ref;
|
|
62
|
|
63 $logger->debug("Inserting data in table $table ...\n");
|
|
64 $dbh->{AutoCommit} = 0;
|
|
65 $sth = $dbh->prepare( "INSERT OR IGNORE INTO $table ( ".join(',', map {"'".$_."'"} @cols)." ) VALUES (".join(',', map {'?'} @cols).")" ) or $logger->logdie($dbh->errstr);
|
|
66
|
|
67 my $separator = "\t";
|
|
68 if(defined $self->{_sep}->{$table}){
|
|
69 $separator = $self->{_sep}->{$table};
|
|
70 }
|
|
71 open (DATA, $tablesDataFiles->{$table});
|
|
72
|
|
73 while (<DATA>) {
|
|
74 chomp;
|
|
75 $sth->execute(grep {$_ !~ /^$separator$/} split (/($separator)/, $_)) or $logger->logdie($dbh->errstr);
|
|
76 }
|
|
77 close DATA;
|
|
78
|
|
79 $dbh->commit or $logger->logdie($dbh->errstr);
|
|
80 $logger->debug("Insertion of data in table $table finished\n");
|
|
81 }
|
|
82 }
|
|
83
|
|
84
|
|
85 sub _create_sqlite_db {
|
|
86 my ($self,$file) = @_;
|
|
87 $logger->info('Creating database.');
|
|
88 if(! -e $file){
|
|
89 `touch $file`;
|
|
90 my $dbh = DBI->connect("dbi:SQLite:dbname=$file","","");
|
|
91 $self->_executeSQLFiles($dbh,($self->{_taxo_struct_dmp},$self->{_taxo_index_dmp}));
|
|
92 $dbh->disconnect;
|
|
93 }
|
|
94 else{
|
|
95 $logger->warn('Database already exists. Skip...')
|
|
96 }
|
|
97 }
|
|
98
|
|
99
|
|
100 sub _executeSQLFiles {
|
|
101 my ($self,$dbh,@sqlFiles) = @_;
|
|
102 my $sql_splitter = SQL::SplitStatement->new;
|
|
103 foreach my $file (@sqlFiles){
|
|
104 $logger->debug('Reading sql file:' . $file);
|
|
105 my $cmd;
|
|
106 open (FILE, $file) or $logger->logdie("Unable to open the SQL file : $file\n");
|
|
107 while( <FILE> ){
|
|
108 $cmd.= $_;
|
|
109 }
|
|
110 close FILE;
|
|
111
|
|
112 my @statements = $sql_splitter->split($cmd);
|
|
113 foreach (@statements){
|
|
114 $logger->debug('Executing sql cmd:');
|
|
115 $logger->debug($_);
|
|
116 $dbh-> do($_) or $logger->logdie($dbh->errstr);
|
|
117 }
|
|
118 }
|
|
119 }
|
|
120
|
|
121
|
|
122 sub _set_options {
|
|
123 my ($self)=@_;
|
|
124 if(-e $taxo_struct_dmp){
|
|
125 $self->{_taxo_struct_dmp} = $taxo_struct_dmp;
|
|
126 }
|
|
127 else{
|
|
128 $logger->logdie($taxo_struct_dmp . ' taxo_struct_dmp file not found.');
|
|
129 }
|
|
130 if(-e $taxo_index_dmp){
|
|
131 $self->{_taxo_index_dmp} = $taxo_index_dmp;
|
|
132 }
|
|
133 else{
|
|
134 $logger->logdie($data_gi_taxid_prot . ' data_gi_taxid_prot file not found.');
|
|
135 }
|
|
136 if(-e $data_gi_taxid_nucl){
|
|
137 $self->{_data}->{gi_taxid_nucl} = $data_gi_taxid_nucl;
|
|
138 }
|
|
139 else{
|
|
140 $logger->logdie($data_gi_taxid_nucl . ' data_gi_taxid_nucl file not found.');
|
|
141 }
|
|
142 if(-e $data_gi_taxid_prot){
|
|
143 $self->{_data}->{gi_taxid_prot} = $data_gi_taxid_prot;
|
|
144 }
|
|
145 else{
|
|
146 $logger->logdie($data_gi_taxid_prot . ' data_gi_taxid_prot file not found.');
|
|
147 }
|
|
148 if(-e $data_nodes){
|
|
149 $self->{_data}->{nodes} = $data_nodes;
|
|
150 }
|
|
151 else{
|
|
152 $logger->logdie($data_nodes . ' data_nodes file not found.');
|
|
153 }
|
|
154 if(-e $data_names){
|
|
155 $self->{_data}->{names} = $data_names;
|
|
156 }
|
|
157 else{
|
|
158 $logger->logdie($data_names . ' data_names file not found.');
|
|
159 }
|
|
160 }
|
|
161
|
|
162
|
|
163 sub help {
|
|
164 my $prog = basename($0);
|
|
165 print STDERR <<EOF ;
|
|
166 #### $prog ####
|
|
167 #
|
|
168 # AUTHOR: Sebastien THEIL
|
|
169 # LAST MODIF: 19/09/2015
|
|
170 # PURPOSE: This script is used to load NCBI taxonomy file into a SQLite database.
|
|
171
|
|
172 USAGE:
|
|
173 $prog -i singl.fastq -i singl.fasta -1 R1.fastq -2 R2.fastq ....
|
|
174
|
|
175 ### OPTIONS ###
|
|
176 -nucl <string> gi_taxid_nucl.dmp file. (Default: $data_gi_taxid_nucl)
|
|
177 -prot <string> gi_taxid_prot.dmp file. (Default: $data_gi_taxid_prot)
|
|
178 -names <string> names.dmp file. (Default: $data_names)
|
|
179 -nodes <string> nodes.dmp file. (Default: $data_nodes)
|
|
180 -v <int> Verbosity level. (0 -> 4).
|
|
181 EOF
|
|
182 exit(1);
|
|
183 }
|