annotate perl/scripts/loadTaxonomy.pl @ 0:4ecb2ce50254 draft default tip

Uploaded
author stheil
date Mon, 26 Oct 2015 10:59:07 -0400
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
4ecb2ce50254 Uploaded
stheil
parents:
diff changeset
1 #!/usr/bin/perl
4ecb2ce50254 Uploaded
stheil
parents:
diff changeset
2 use strict ;
4ecb2ce50254 Uploaded
stheil
parents:
diff changeset
3 use File::Basename;
4ecb2ce50254 Uploaded
stheil
parents:
diff changeset
4 use Data::Dumper;
4ecb2ce50254 Uploaded
stheil
parents:
diff changeset
5 use Getopt::Long;
4ecb2ce50254 Uploaded
stheil
parents:
diff changeset
6 use Logger::Logger;
4ecb2ce50254 Uploaded
stheil
parents:
diff changeset
7 use Tools::Taxonomy;
4ecb2ce50254 Uploaded
stheil
parents:
diff changeset
8 use DBI;
4ecb2ce50254 Uploaded
stheil
parents:
diff changeset
9 use SQL::SplitStatement;
4ecb2ce50254 Uploaded
stheil
parents:
diff changeset
10
4ecb2ce50254 Uploaded
stheil
parents:
diff changeset
11
4ecb2ce50254 Uploaded
stheil
parents:
diff changeset
# Default input locations. The four NCBI dump files can be overridden on the
# command line; the two SQL files have no option and are looked up under
# these names by _set_options().
my $taxo_struct_dmp    = 'taxonomyStructure.sql';
my $taxo_index_dmp     = 'taxonomyIndex.sql';
my $data_gi_taxid_nucl = 'gi_taxid_nucl.dmp';
my $data_gi_taxid_prot = 'gi_taxid_prot.dmp';
my $data_nodes         = 'nodes.dmp';
my $data_names         = 'names.dmp';
my $verbosity          = 1;


# Parse the command line. An unknown or malformed option, or an explicit
# -h/-help, prints the usage text through help(), which exits.
GetOptions(
    "nucl=s"  => \$data_gi_taxid_nucl,
    "prot=s"  => \$data_gi_taxid_prot,
    "names=s" => \$data_names,
    "nodes=s" => \$data_nodes,
    "v=i"     => \$verbosity,
    "h|help"  => sub { help() },
) or help();


Logger::Logger->changeMode($verbosity);


# Explicit parentheses: a bare "&main;" would implicitly pass the current @_.
main();
4ecb2ce50254 Uploaded
stheil
parents:
diff changeset
34
4ecb2ce50254 Uploaded
stheil
parents:
diff changeset
35
4ecb2ce50254 Uploaded
stheil
parents:
diff changeset
# Script entry point.
#
# Validates the input files (_set_options), creates the SQLite schema
# (_create_sqlite_db) and loads every dump file registered in $self->{_data}
# into the table of the same name (_insertingCSVDataInDatabase).
# Dies if the database cannot be opened.
sub main {
    my $self = {};
    bless $self;
    $self->_set_options();

    # Field-separator regex per table, keyed like $self->{_data}.
    # names/nodes rows use "\t|\t" between fields and "\t|" at end of line.
    $self->{_sep} = {
        names         => '\t\|\t|\t\|$',
        nodes         => '\t\|\t|\t\|$',
        gi_taxid_prot => '\t',
        gi_taxid_nucl => '\t',
    };

    # The schema must be created in, and the data loaded into, the SAME file.
    # Previously the tables were created in taxonomy.sqlite while the rows
    # were inserted through a connection to taxonomy_tmp.sqlite.
    my $db_file = 'taxonomy.sqlite';
    $self->_create_sqlite_db($db_file);
    my $dbh = DBI->connect("dbi:SQLite:dbname=$db_file", "", "")
        or die "Unable to connect to database $db_file : $DBI::errstr\n";

    $self->_insertingCSVDataInDatabase($dbh, $self->{_data});
    $dbh->disconnect;
}
4ecb2ce50254 Uploaded
stheil
parents:
diff changeset
53
4ecb2ce50254 Uploaded
stheil
parents:
diff changeset
# Bulk-load delimited dump files into the tables of an open database.
#
# Parameters:
#   $dbh             - connected DBI handle. AutoCommit is switched off on it
#                      and is not restored.
#   $tablesDataFiles - hash ref mapping a table name to the path of the file
#                      that holds its rows.
#
# The column list of each table is read from the live schema, so every input
# line must yield exactly one value per column. Lines are split on the regex
# stored in $self->{_sep}{$table} (a single tab when none is registered).
# One commit is issued per table. Dies through $logger->logdie when a table
# has no columns, a data file cannot be opened, or a prepare/execute/commit
# fails.
sub _insertingCSVDataInDatabase {
    my ($self,$dbh,$tablesDataFiles) = @_;
    $logger->info('Inserting tables into database...');
    foreach my $table (keys %{$tablesDataFiles}){
        $logger->info($table);

        # Element 3 of each column_info row is COLUMN_NAME.
        my $sth  = $dbh->column_info( undef, undef, $table, '%');
        my $ref  = $sth->fetchall_arrayref;
        my @cols = map { $_->[3] } @$ref;
        @cols or $logger->logdie("No column found for table $table : does the table exist ?");

        $logger->debug("Inserting data in table $table ...\n");
        $dbh->{AutoCommit} = 0;

        # Quote names as identifiers ("col"); the former 'col' form is a
        # string literal that SQLite only tolerates as a compatibility quirk.
        my $insert_sql = 'INSERT OR IGNORE INTO ' . $dbh->quote_identifier($table)
                       . ' ( ' . join(',', map { $dbh->quote_identifier($_) } @cols) . ' )'
                       . ' VALUES (' . join(',', map {'?'} @cols) . ')';
        $sth = $dbh->prepare($insert_sql) or $logger->logdie($dbh->errstr);

        my $separator = defined $self->{_sep}->{$table} ? $self->{_sep}->{$table} : "\t";

        # Splitting on a CAPTURING pattern returns the separators too; that
        # keeps an empty last field from being discarded as a trailing empty
        # element. The separators are then filtered back out.
        my $split_re  = qr/($separator)/;
        my $is_sep_re = qr/^$separator$/;

        # Lexical handle + 3-arg open: the bareword DATA is Perl's special
        # __DATA__ handle, and a 2-arg open lets the file name select the mode.
        my $data_file = $tablesDataFiles->{$table};
        open (my $data_fh, '<', $data_file)
            or $logger->logdie("Unable to open the data file : $data_file : $!");

        while (my $line = <$data_fh>) {
            chomp $line;
            my @values = grep { $_ !~ $is_sep_re } split ($split_re, $line);
            $sth->execute(@values) or $logger->logdie($dbh->errstr);
        }
        close $data_fh;

        $dbh->commit or $logger->logdie($dbh->errstr);
        $logger->debug("Insertion of data in table $table finished\n");
    }
}
4ecb2ce50254 Uploaded
stheil
parents:
diff changeset
83
4ecb2ce50254 Uploaded
stheil
parents:
diff changeset
84
4ecb2ce50254 Uploaded
stheil
parents:
diff changeset
# Create the SQLite database file and its schema, unless the file exists.
#
# Parameters:
#   $file - path of the SQLite database file.
#
# When $file is absent it is created and the SQL files recorded in
# $self->{_taxo_struct_dmp} and $self->{_taxo_index_dmp} are executed against
# it, in that order. When $file already exists nothing is done beyond logging
# a warning. Dies through $logger->logdie if the file cannot be created or
# opened.
sub _create_sqlite_db {
    my ($self,$file) = @_;
    $logger->info('Creating database.');
    if(! -e $file){
        # Create the empty file directly instead of shelling out to `touch`,
        # which interpolated the path into a shell command line.
        open (my $touch_fh, '>>', $file)
            or $logger->logdie("Unable to create the database file : $file : $!");
        close $touch_fh;

        my $dbh = DBI->connect("dbi:SQLite:dbname=$file","","")
            or $logger->logdie("Unable to connect to database $file : " . $DBI::errstr);
        $self->_executeSQLFiles($dbh,($self->{_taxo_struct_dmp},$self->{_taxo_index_dmp}));
        $dbh->disconnect;
    }
    else{
        $logger->warn('Database already exists. Skip...');
    }
}
4ecb2ce50254 Uploaded
stheil
parents:
diff changeset
98
4ecb2ce50254 Uploaded
stheil
parents:
diff changeset
99
4ecb2ce50254 Uploaded
stheil
parents:
diff changeset
# Execute every statement of one or more SQL script files.
#
# Parameters:
#   $dbh      - connected DBI handle the statements are run on.
#   @sqlFiles - paths of the SQL files, processed in the given order.
#
# Each file is read whole, cut into individual statements with
# SQL::SplitStatement and run one statement at a time. Dies through
# $logger->logdie when a file cannot be opened or a statement fails.
sub _executeSQLFiles {
    my ($self,$dbh,@sqlFiles) = @_;
    my $sql_splitter = SQL::SplitStatement->new;
    foreach my $file (@sqlFiles){
        $logger->debug('Reading sql file:' . $file);

        # Lexical handle + 3-arg open; join() yields '' (never undef) for an
        # empty file, so the splitter always receives a defined string.
        open (my $sql_fh, '<', $file) or $logger->logdie("Unable to open the SQL file : $file\n");
        my $cmd = join('', <$sql_fh>);
        close $sql_fh;

        foreach my $statement ($sql_splitter->split($cmd)){
            $logger->debug('Executing sql cmd:');
            $logger->debug($statement);
            $dbh->do($statement) or $logger->logdie($dbh->errstr);
        }
    }
}
4ecb2ce50254 Uploaded
stheil
parents:
diff changeset
120
4ecb2ce50254 Uploaded
stheil
parents:
diff changeset
121
4ecb2ce50254 Uploaded
stheil
parents:
diff changeset
# Check that every input file exists and record its path on $self.
#
# The two SQL files are stored in $self->{_taxo_struct_dmp} and
# $self->{_taxo_index_dmp}; the four dump files are stored in
# $self->{_data}, keyed by the table they populate (gi_taxid_nucl,
# gi_taxid_prot, nodes, names). Dies through $logger->logdie on the first
# missing file, naming that file.
sub _set_options {
    my ($self)=@_;

    # [ label used in the error message, path to check, slot receiving it ]
    my @required = (
        [ 'taxo_struct_dmp',    $taxo_struct_dmp,    \$self->{_taxo_struct_dmp}      ],
        # The error for this file used to name data_gi_taxid_prot by mistake.
        [ 'taxo_index_dmp',     $taxo_index_dmp,     \$self->{_taxo_index_dmp}       ],
        [ 'data_gi_taxid_nucl', $data_gi_taxid_nucl, \$self->{_data}->{gi_taxid_nucl} ],
        [ 'data_gi_taxid_prot', $data_gi_taxid_prot, \$self->{_data}->{gi_taxid_prot} ],
        [ 'data_nodes',         $data_nodes,         \$self->{_data}->{nodes}         ],
        [ 'data_names',         $data_names,         \$self->{_data}->{names}         ],
    );

    foreach my $input (@required){
        my ($label, $path, $slot) = @$input;
        if(-e $path){
            $$slot = $path;
        }
        else{
            $logger->logdie($path . ' ' . $label . ' file not found.');
        }
    }
}
4ecb2ce50254 Uploaded
stheil
parents:
diff changeset
161
4ecb2ce50254 Uploaded
stheil
parents:
diff changeset
162
4ecb2ce50254 Uploaded
stheil
parents:
diff changeset
# Print the usage text on STDERR and terminate the program.
#
# The defaults shown are the current values of the option variables.
# Never returns: exits with status 1.
sub help {
    my $prog = basename($0);
    # The USAGE line previously listed -i/-1/-2 read-file options taken from
    # another script; it now shows the options this script actually parses.
    print STDERR <<EOF ;
#### $prog ####
#
# AUTHOR: Sebastien THEIL
# LAST MODIF: 19/09/2015
# PURPOSE: This script is used to load NCBI taxonomy file into a SQLite database.

USAGE:
    $prog -nucl gi_taxid_nucl.dmp -prot gi_taxid_prot.dmp -names names.dmp -nodes nodes.dmp

    ### OPTIONS ###
    -nucl    <string>   gi_taxid_nucl.dmp file. (Default: $data_gi_taxid_nucl)
    -prot    <string>   gi_taxid_prot.dmp file. (Default: $data_gi_taxid_prot)
    -names   <string>   names.dmp file. (Default: $data_names)
    -nodes   <string>   nodes.dmp file. (Default: $data_nodes)
    -v       <int>      Verbosity level. (0 -> 4).
EOF
    exit(1);
}