Mercurial > repos > stheil > taxonomy_sqlite
comparison perl/scripts/loadTaxonomy.pl @ 0:4ecb2ce50254 draft default tip
Uploaded
| author | stheil |
|---|---|
| date | Mon, 26 Oct 2015 10:59:07 -0400 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:4ecb2ce50254 |
|---|---|
| 1 #!/usr/bin/perl | |
| 2 use strict ; | |
| 3 use File::Basename; | |
| 4 use Data::Dumper; | |
| 5 use Getopt::Long; | |
| 6 use Logger::Logger; | |
| 7 use Tools::Taxonomy; | |
| 8 use DBI; | |
| 9 use SQL::SplitStatement; | |
| 10 | |
| 11 | |
| 12 my $taxo_struct_dmp = 'taxonomyStructure.sql'; | |
| 13 my $taxo_index_dmp = 'taxonomyIndex.sql'; | |
| 14 my $data_gi_taxid_nucl = 'gi_taxid_nucl.dmp'; | |
| 15 my $data_gi_taxid_prot = 'gi_taxid_prot.dmp'; | |
| 16 my $data_nodes = 'nodes.dmp'; | |
| 17 my $data_names = 'names.dmp'; | |
| 18 my $verbosity=1; | |
| 19 | |
| 20 | |
| 21 GetOptions( | |
| 22 "nucl=s" => \$data_gi_taxid_nucl, | |
| 23 "prot=s" => \$data_gi_taxid_prot, | |
| 24 "names=s" => \$data_names, | |
| 25 "nodes=s" => \$data_nodes, | |
| 26 "v=i" => \$verbosity, | |
| 27 ); | |
| 28 | |
| 29 | |
| 30 Logger::Logger->changeMode($verbosity); | |
| 31 | |
| 32 | |
| 33 &main; | |
| 34 | |
| 35 | |
| 36 sub main { | |
| 37 my $self={}; | |
| 38 bless $self; | |
| 39 $self->_set_options(); | |
| 40 | |
| 41 $self->{_sep} = { names => '\t\|\t|\t\|$', | |
| 42 nodes => '\t\|\t|\t\|$', | |
| 43 gi_taxid_prot => '\t', | |
| 44 gi_taxid_nucl => '\t', | |
| 45 }; | |
| 46 | |
| 47 $self->_create_sqlite_db('taxonomy.sqlite'); | |
| 48 my $dbh = DBI->connect("dbi:SQLite:dbname=taxonomy_tmp.sqlite","",""); | |
| 49 | |
| 50 $self->_insertingCSVDataInDatabase($dbh,$self->{_data}); | |
| 51 $dbh->disconnect; | |
| 52 } | |
| 53 | |
| 54 sub _insertingCSVDataInDatabase { | |
| 55 my ($self,$dbh,$tablesDataFiles) = @_; | |
| 56 $logger->info('Inserting tables into database...'); | |
| 57 foreach my $table (keys %{$tablesDataFiles}){ | |
| 58 $logger->info($table); | |
| 59 my $sth = $dbh->column_info( undef, undef, $table, '%'); | |
| 60 my $ref = $sth->fetchall_arrayref; | |
| 61 my @cols = map { $_->[3] } @$ref; | |
| 62 | |
| 63 $logger->debug("Inserting data in table $table ...\n"); | |
| 64 $dbh->{AutoCommit} = 0; | |
| 65 $sth = $dbh->prepare( "INSERT OR IGNORE INTO $table ( ".join(',', map {"'".$_."'"} @cols)." ) VALUES (".join(',', map {'?'} @cols).")" ) or $logger->logdie($dbh->errstr); | |
| 66 | |
| 67 my $separator = "\t"; | |
| 68 if(defined $self->{_sep}->{$table}){ | |
| 69 $separator = $self->{_sep}->{$table}; | |
| 70 } | |
| 71 open (DATA, $tablesDataFiles->{$table}); | |
| 72 | |
| 73 while (<DATA>) { | |
| 74 chomp; | |
| 75 $sth->execute(grep {$_ !~ /^$separator$/} split (/($separator)/, $_)) or $logger->logdie($dbh->errstr); | |
| 76 } | |
| 77 close DATA; | |
| 78 | |
| 79 $dbh->commit or $logger->logdie($dbh->errstr); | |
| 80 $logger->debug("Insertion of data in table $table finished\n"); | |
| 81 } | |
| 82 } | |
| 83 | |
| 84 | |
| 85 sub _create_sqlite_db { | |
| 86 my ($self,$file) = @_; | |
| 87 $logger->info('Creating database.'); | |
| 88 if(! -e $file){ | |
| 89 `touch $file`; | |
| 90 my $dbh = DBI->connect("dbi:SQLite:dbname=$file","",""); | |
| 91 $self->_executeSQLFiles($dbh,($self->{_taxo_struct_dmp},$self->{_taxo_index_dmp})); | |
| 92 $dbh->disconnect; | |
| 93 } | |
| 94 else{ | |
| 95 $logger->warn('Database already exists. Skip...') | |
| 96 } | |
| 97 } | |
| 98 | |
| 99 | |
| 100 sub _executeSQLFiles { | |
| 101 my ($self,$dbh,@sqlFiles) = @_; | |
| 102 my $sql_splitter = SQL::SplitStatement->new; | |
| 103 foreach my $file (@sqlFiles){ | |
| 104 $logger->debug('Reading sql file:' . $file); | |
| 105 my $cmd; | |
| 106 open (FILE, $file) or $logger->logdie("Unable to open the SQL file : $file\n"); | |
| 107 while( <FILE> ){ | |
| 108 $cmd.= $_; | |
| 109 } | |
| 110 close FILE; | |
| 111 | |
| 112 my @statements = $sql_splitter->split($cmd); | |
| 113 foreach (@statements){ | |
| 114 $logger->debug('Executing sql cmd:'); | |
| 115 $logger->debug($_); | |
| 116 $dbh-> do($_) or $logger->logdie($dbh->errstr); | |
| 117 } | |
| 118 } | |
| 119 } | |
| 120 | |
| 121 | |
| 122 sub _set_options { | |
| 123 my ($self)=@_; | |
| 124 if(-e $taxo_struct_dmp){ | |
| 125 $self->{_taxo_struct_dmp} = $taxo_struct_dmp; | |
| 126 } | |
| 127 else{ | |
| 128 $logger->logdie($taxo_struct_dmp . ' taxo_struct_dmp file not found.'); | |
| 129 } | |
| 130 if(-e $taxo_index_dmp){ | |
| 131 $self->{_taxo_index_dmp} = $taxo_index_dmp; | |
| 132 } | |
| 133 else{ | |
| 134 $logger->logdie($data_gi_taxid_prot . ' data_gi_taxid_prot file not found.'); | |
| 135 } | |
| 136 if(-e $data_gi_taxid_nucl){ | |
| 137 $self->{_data}->{gi_taxid_nucl} = $data_gi_taxid_nucl; | |
| 138 } | |
| 139 else{ | |
| 140 $logger->logdie($data_gi_taxid_nucl . ' data_gi_taxid_nucl file not found.'); | |
| 141 } | |
| 142 if(-e $data_gi_taxid_prot){ | |
| 143 $self->{_data}->{gi_taxid_prot} = $data_gi_taxid_prot; | |
| 144 } | |
| 145 else{ | |
| 146 $logger->logdie($data_gi_taxid_prot . ' data_gi_taxid_prot file not found.'); | |
| 147 } | |
| 148 if(-e $data_nodes){ | |
| 149 $self->{_data}->{nodes} = $data_nodes; | |
| 150 } | |
| 151 else{ | |
| 152 $logger->logdie($data_nodes . ' data_nodes file not found.'); | |
| 153 } | |
| 154 if(-e $data_names){ | |
| 155 $self->{_data}->{names} = $data_names; | |
| 156 } | |
| 157 else{ | |
| 158 $logger->logdie($data_names . ' data_names file not found.'); | |
| 159 } | |
| 160 } | |
| 161 | |
| 162 | |
| 163 sub help { | |
| 164 my $prog = basename($0); | |
| 165 print STDERR <<EOF ; | |
| 166 #### $prog #### | |
| 167 # | |
| 168 # AUTHOR: Sebastien THEIL | |
| 169 # LAST MODIF: 19/09/2015 | |
| 170 # PURPOSE: This script is used to load NCBI taxonomy file into a SQLite database. | |
| 171 | |
| 172 USAGE: | |
| 173 $prog -i singl.fastq -i singl.fasta -1 R1.fastq -2 R2.fastq .... | |
| 174 | |
| 175 ### OPTIONS ### | |
| 176 -nucl <string> gi_taxid_nucl.dmp file. (Default: $data_gi_taxid_nucl) | |
| 177 -prot <string> gi_taxid_prot.dmp file. (Default: $data_gi_taxid_prot) | |
| 178 -names <string> names.dmp file. (Default: $data_names) | |
| 179 -nodes <string> nodes.dmp file. (Default: $data_nodes) | |
| 180 -v <int> Verbosity level. (0 -> 4). | |
| 181 EOF | |
| 182 exit(1); | |
| 183 } |
