Mercurial > repos > stheil > taxonomy_sqlite
comparison perl/scripts/loadTaxonomy.pl @ 0:4ecb2ce50254 draft default tip
Uploaded
author | stheil |
---|---|
date | Mon, 26 Oct 2015 10:59:07 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:4ecb2ce50254 |
---|---|
1 #!/usr/bin/perl | |
2 use strict ; | |
3 use File::Basename; | |
4 use Data::Dumper; | |
5 use Getopt::Long; | |
6 use Logger::Logger; | |
7 use Tools::Taxonomy; | |
8 use DBI; | |
9 use SQL::SplitStatement; | |
10 | |
11 | |
12 my $taxo_struct_dmp = 'taxonomyStructure.sql'; | |
13 my $taxo_index_dmp = 'taxonomyIndex.sql'; | |
14 my $data_gi_taxid_nucl = 'gi_taxid_nucl.dmp'; | |
15 my $data_gi_taxid_prot = 'gi_taxid_prot.dmp'; | |
16 my $data_nodes = 'nodes.dmp'; | |
17 my $data_names = 'names.dmp'; | |
18 my $verbosity=1; | |
19 | |
20 | |
21 GetOptions( | |
22 "nucl=s" => \$data_gi_taxid_nucl, | |
23 "prot=s" => \$data_gi_taxid_prot, | |
24 "names=s" => \$data_names, | |
25 "nodes=s" => \$data_nodes, | |
26 "v=i" => \$verbosity, | |
27 ); | |
28 | |
29 | |
30 Logger::Logger->changeMode($verbosity); | |
31 | |
32 | |
33 &main; | |
34 | |
35 | |
36 sub main { | |
37 my $self={}; | |
38 bless $self; | |
39 $self->_set_options(); | |
40 | |
41 $self->{_sep} = { names => '\t\|\t|\t\|$', | |
42 nodes => '\t\|\t|\t\|$', | |
43 gi_taxid_prot => '\t', | |
44 gi_taxid_nucl => '\t', | |
45 }; | |
46 | |
47 $self->_create_sqlite_db('taxonomy.sqlite'); | |
48 my $dbh = DBI->connect("dbi:SQLite:dbname=taxonomy_tmp.sqlite","",""); | |
49 | |
50 $self->_insertingCSVDataInDatabase($dbh,$self->{_data}); | |
51 $dbh->disconnect; | |
52 } | |
53 | |
54 sub _insertingCSVDataInDatabase { | |
55 my ($self,$dbh,$tablesDataFiles) = @_; | |
56 $logger->info('Inserting tables into database...'); | |
57 foreach my $table (keys %{$tablesDataFiles}){ | |
58 $logger->info($table); | |
59 my $sth = $dbh->column_info( undef, undef, $table, '%'); | |
60 my $ref = $sth->fetchall_arrayref; | |
61 my @cols = map { $_->[3] } @$ref; | |
62 | |
63 $logger->debug("Inserting data in table $table ...\n"); | |
64 $dbh->{AutoCommit} = 0; | |
65 $sth = $dbh->prepare( "INSERT OR IGNORE INTO $table ( ".join(',', map {"'".$_."'"} @cols)." ) VALUES (".join(',', map {'?'} @cols).")" ) or $logger->logdie($dbh->errstr); | |
66 | |
67 my $separator = "\t"; | |
68 if(defined $self->{_sep}->{$table}){ | |
69 $separator = $self->{_sep}->{$table}; | |
70 } | |
71 open (DATA, $tablesDataFiles->{$table}); | |
72 | |
73 while (<DATA>) { | |
74 chomp; | |
75 $sth->execute(grep {$_ !~ /^$separator$/} split (/($separator)/, $_)) or $logger->logdie($dbh->errstr); | |
76 } | |
77 close DATA; | |
78 | |
79 $dbh->commit or $logger->logdie($dbh->errstr); | |
80 $logger->debug("Insertion of data in table $table finished\n"); | |
81 } | |
82 } | |
83 | |
84 | |
85 sub _create_sqlite_db { | |
86 my ($self,$file) = @_; | |
87 $logger->info('Creating database.'); | |
88 if(! -e $file){ | |
89 `touch $file`; | |
90 my $dbh = DBI->connect("dbi:SQLite:dbname=$file","",""); | |
91 $self->_executeSQLFiles($dbh,($self->{_taxo_struct_dmp},$self->{_taxo_index_dmp})); | |
92 $dbh->disconnect; | |
93 } | |
94 else{ | |
95 $logger->warn('Database already exists. Skip...') | |
96 } | |
97 } | |
98 | |
99 | |
100 sub _executeSQLFiles { | |
101 my ($self,$dbh,@sqlFiles) = @_; | |
102 my $sql_splitter = SQL::SplitStatement->new; | |
103 foreach my $file (@sqlFiles){ | |
104 $logger->debug('Reading sql file:' . $file); | |
105 my $cmd; | |
106 open (FILE, $file) or $logger->logdie("Unable to open the SQL file : $file\n"); | |
107 while( <FILE> ){ | |
108 $cmd.= $_; | |
109 } | |
110 close FILE; | |
111 | |
112 my @statements = $sql_splitter->split($cmd); | |
113 foreach (@statements){ | |
114 $logger->debug('Executing sql cmd:'); | |
115 $logger->debug($_); | |
116 $dbh-> do($_) or $logger->logdie($dbh->errstr); | |
117 } | |
118 } | |
119 } | |
120 | |
121 | |
122 sub _set_options { | |
123 my ($self)=@_; | |
124 if(-e $taxo_struct_dmp){ | |
125 $self->{_taxo_struct_dmp} = $taxo_struct_dmp; | |
126 } | |
127 else{ | |
128 $logger->logdie($taxo_struct_dmp . ' taxo_struct_dmp file not found.'); | |
129 } | |
130 if(-e $taxo_index_dmp){ | |
131 $self->{_taxo_index_dmp} = $taxo_index_dmp; | |
132 } | |
133 else{ | |
134 $logger->logdie($data_gi_taxid_prot . ' data_gi_taxid_prot file not found.'); | |
135 } | |
136 if(-e $data_gi_taxid_nucl){ | |
137 $self->{_data}->{gi_taxid_nucl} = $data_gi_taxid_nucl; | |
138 } | |
139 else{ | |
140 $logger->logdie($data_gi_taxid_nucl . ' data_gi_taxid_nucl file not found.'); | |
141 } | |
142 if(-e $data_gi_taxid_prot){ | |
143 $self->{_data}->{gi_taxid_prot} = $data_gi_taxid_prot; | |
144 } | |
145 else{ | |
146 $logger->logdie($data_gi_taxid_prot . ' data_gi_taxid_prot file not found.'); | |
147 } | |
148 if(-e $data_nodes){ | |
149 $self->{_data}->{nodes} = $data_nodes; | |
150 } | |
151 else{ | |
152 $logger->logdie($data_nodes . ' data_nodes file not found.'); | |
153 } | |
154 if(-e $data_names){ | |
155 $self->{_data}->{names} = $data_names; | |
156 } | |
157 else{ | |
158 $logger->logdie($data_names . ' data_names file not found.'); | |
159 } | |
160 } | |
161 | |
162 | |
163 sub help { | |
164 my $prog = basename($0); | |
165 print STDERR <<EOF ; | |
166 #### $prog #### | |
167 # | |
168 # AUTHOR: Sebastien THEIL | |
169 # LAST MODIF: 19/09/2015 | |
170 # PURPOSE: This script is used to load NCBI taxonomy file into a SQLite database. | |
171 | |
172 USAGE: | |
173 $prog -i singl.fastq -i singl.fasta -1 R1.fastq -2 R2.fastq .... | |
174 | |
175 ### OPTIONS ### | |
176 -nucl <string> gi_taxid_nucl.dmp file. (Default: $data_gi_taxid_nucl) | |
177 -prot <string> gi_taxid_prot.dmp file. (Default: $data_gi_taxid_prot) | |
178 -names <string> names.dmp file. (Default: $data_names) | |
179 -nodes <string> nodes.dmp file. (Default: $data_nodes) | |
180 -v <int> Verbosity level. (0 -> 4). | |
181 EOF | |
182 exit(1); | |
183 } |