comparison perl/scripts/loadTaxonomy.pl @ 0:4ecb2ce50254 draft default tip

Uploaded
author stheil
date Mon, 26 Oct 2015 10:59:07 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:4ecb2ce50254
1 #!/usr/bin/perl
2 use strict ;
3 use File::Basename;
4 use Data::Dumper;
5 use Getopt::Long;
6 use Logger::Logger;
7 use Tools::Taxonomy;
8 use DBI;
9 use SQL::SplitStatement;
10
11
12 my $taxo_struct_dmp = 'taxonomyStructure.sql';
13 my $taxo_index_dmp = 'taxonomyIndex.sql';
14 my $data_gi_taxid_nucl = 'gi_taxid_nucl.dmp';
15 my $data_gi_taxid_prot = 'gi_taxid_prot.dmp';
16 my $data_nodes = 'nodes.dmp';
17 my $data_names = 'names.dmp';
18 my $verbosity=1;
19
20
21 GetOptions(
22 "nucl=s" => \$data_gi_taxid_nucl,
23 "prot=s" => \$data_gi_taxid_prot,
24 "names=s" => \$data_names,
25 "nodes=s" => \$data_nodes,
26 "v=i" => \$verbosity,
27 );
28
29
30 Logger::Logger->changeMode($verbosity);
31
32
33 &main;
34
35
36 sub main {
37 my $self={};
38 bless $self;
39 $self->_set_options();
40
41 $self->{_sep} = { names => '\t\|\t|\t\|$',
42 nodes => '\t\|\t|\t\|$',
43 gi_taxid_prot => '\t',
44 gi_taxid_nucl => '\t',
45 };
46
47 $self->_create_sqlite_db('taxonomy.sqlite');
48 my $dbh = DBI->connect("dbi:SQLite:dbname=taxonomy_tmp.sqlite","","");
49
50 $self->_insertingCSVDataInDatabase($dbh,$self->{_data});
51 $dbh->disconnect;
52 }
53
54 sub _insertingCSVDataInDatabase {
55 my ($self,$dbh,$tablesDataFiles) = @_;
56 $logger->info('Inserting tables into database...');
57 foreach my $table (keys %{$tablesDataFiles}){
58 $logger->info($table);
59 my $sth = $dbh->column_info( undef, undef, $table, '%');
60 my $ref = $sth->fetchall_arrayref;
61 my @cols = map { $_->[3] } @$ref;
62
63 $logger->debug("Inserting data in table $table ...\n");
64 $dbh->{AutoCommit} = 0;
65 $sth = $dbh->prepare( "INSERT OR IGNORE INTO $table ( ".join(',', map {"'".$_."'"} @cols)." ) VALUES (".join(',', map {'?'} @cols).")" ) or $logger->logdie($dbh->errstr);
66
67 my $separator = "\t";
68 if(defined $self->{_sep}->{$table}){
69 $separator = $self->{_sep}->{$table};
70 }
71 open (DATA, $tablesDataFiles->{$table});
72
73 while (<DATA>) {
74 chomp;
75 $sth->execute(grep {$_ !~ /^$separator$/} split (/($separator)/, $_)) or $logger->logdie($dbh->errstr);
76 }
77 close DATA;
78
79 $dbh->commit or $logger->logdie($dbh->errstr);
80 $logger->debug("Insertion of data in table $table finished\n");
81 }
82 }
83
84
85 sub _create_sqlite_db {
86 my ($self,$file) = @_;
87 $logger->info('Creating database.');
88 if(! -e $file){
89 `touch $file`;
90 my $dbh = DBI->connect("dbi:SQLite:dbname=$file","","");
91 $self->_executeSQLFiles($dbh,($self->{_taxo_struct_dmp},$self->{_taxo_index_dmp}));
92 $dbh->disconnect;
93 }
94 else{
95 $logger->warn('Database already exists. Skip...')
96 }
97 }
98
99
100 sub _executeSQLFiles {
101 my ($self,$dbh,@sqlFiles) = @_;
102 my $sql_splitter = SQL::SplitStatement->new;
103 foreach my $file (@sqlFiles){
104 $logger->debug('Reading sql file:' . $file);
105 my $cmd;
106 open (FILE, $file) or $logger->logdie("Unable to open the SQL file : $file\n");
107 while( <FILE> ){
108 $cmd.= $_;
109 }
110 close FILE;
111
112 my @statements = $sql_splitter->split($cmd);
113 foreach (@statements){
114 $logger->debug('Executing sql cmd:');
115 $logger->debug($_);
116 $dbh-> do($_) or $logger->logdie($dbh->errstr);
117 }
118 }
119 }
120
121
122 sub _set_options {
123 my ($self)=@_;
124 if(-e $taxo_struct_dmp){
125 $self->{_taxo_struct_dmp} = $taxo_struct_dmp;
126 }
127 else{
128 $logger->logdie($taxo_struct_dmp . ' taxo_struct_dmp file not found.');
129 }
130 if(-e $taxo_index_dmp){
131 $self->{_taxo_index_dmp} = $taxo_index_dmp;
132 }
133 else{
134 $logger->logdie($data_gi_taxid_prot . ' data_gi_taxid_prot file not found.');
135 }
136 if(-e $data_gi_taxid_nucl){
137 $self->{_data}->{gi_taxid_nucl} = $data_gi_taxid_nucl;
138 }
139 else{
140 $logger->logdie($data_gi_taxid_nucl . ' data_gi_taxid_nucl file not found.');
141 }
142 if(-e $data_gi_taxid_prot){
143 $self->{_data}->{gi_taxid_prot} = $data_gi_taxid_prot;
144 }
145 else{
146 $logger->logdie($data_gi_taxid_prot . ' data_gi_taxid_prot file not found.');
147 }
148 if(-e $data_nodes){
149 $self->{_data}->{nodes} = $data_nodes;
150 }
151 else{
152 $logger->logdie($data_nodes . ' data_nodes file not found.');
153 }
154 if(-e $data_names){
155 $self->{_data}->{names} = $data_names;
156 }
157 else{
158 $logger->logdie($data_names . ' data_names file not found.');
159 }
160 }
161
162
163 sub help {
164 my $prog = basename($0);
165 print STDERR <<EOF ;
166 #### $prog ####
167 #
168 # AUTHOR: Sebastien THEIL
169 # LAST MODIF: 19/09/2015
170 # PURPOSE: This script is used to load NCBI taxonomy file into a SQLite database.
171
172 USAGE:
173 $prog -i singl.fastq -i singl.fasta -1 R1.fastq -2 R2.fastq ....
174
175 ### OPTIONS ###
176 -nucl <string> gi_taxid_nucl.dmp file. (Default: $data_gi_taxid_nucl)
177 -prot <string> gi_taxid_prot.dmp file. (Default: $data_gi_taxid_prot)
178 -names <string> names.dmp file. (Default: $data_names)
179 -nodes <string> nodes.dmp file. (Default: $data_nodes)
180 -v <int> Verbosity level. (0 -> 4).
181 EOF
182 exit(1);
183 }