Mercurial > repos > iuc > sqlite_to_tabular
comparison load_db.py @ 1:c1b700bc0150 draft
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/query_tabular commit 81f69ad5f39223059c40501e55ac777d3feca845
author | iuc |
---|---|
date | Fri, 18 Aug 2017 16:48:20 -0400 |
parents | 859064f07be4 |
children | 150765965caa |
comparison
equal
deleted
inserted
replaced
0:859064f07be4 | 1:c1b700bc0150 |
---|---|
1 #!/usr/bin/env python | 1 #!/usr/bin/env python |
2 | 2 |
3 from __future__ import print_function | 3 from __future__ import print_function |
4 | 4 |
5 import re | |
5 import sys | 6 import sys |
6 | 7 |
7 from filters import TabularReader | 8 from filters import TabularReader |
9 | |
10 | |
11 SQLITE_KEYWORDS = [ | |
12 'ABORT', | |
13 'ACTION', | |
14 'ADD', | |
15 'AFTER', | |
16 'ALL', | |
17 'ALTER', | |
18 'ANALYZE', | |
19 'AND', | |
20 'AS', | |
21 'ASC', | |
22 'ATTACH', | |
23 'AUTOINCREMENT', | |
24 'BEFORE', | |
25 'BEGIN', | |
26 'BETWEEN', | |
27 'BY', | |
28 'CASCADE', | |
29 'CASE', | |
30 'CAST', | |
31 'CHECK', | |
32 'COLLATE', | |
33 'COLUMN', | |
34 'COMMIT', | |
35 'CONFLICT', | |
36 'CONSTRAINT', | |
37 'CREATE', | |
38 'CROSS', | |
39 'CURRENT_DATE', | |
40 'CURRENT_TIME', | |
41 'CURRENT_TIMESTAMP', | |
42 'DATABASE', | |
43 'DEFAULT', | |
44 'DEFERRABLE', | |
45 'DEFERRED', | |
46 'DELETE', | |
47 'DESC', | |
48 'DETACH', | |
49 'DISTINCT', | |
50 'DROP', | |
51 'EACH', | |
52 'ELSE', | |
53 'END', | |
54 'ESCAPE', | |
55 'EXCEPT', | |
56 'EXCLUSIVE', | |
57 'EXISTS', | |
58 'EXPLAIN', | |
59 'FAIL', | |
60 'FOR', | |
61 'FOREIGN', | |
62 'FROM', | |
63 'FULL', | |
64 'GLOB', | |
65 'GROUP', | |
66 'HAVING', | |
67 'IF', | |
68 'IGNORE', | |
69 'IMMEDIATE', | |
70 'IN', | |
71 'INDEX', | |
72 'INDEXED', | |
73 'INITIALLY', | |
74 'INNER', | |
75 'INSERT', | |
76 'INSTEAD', | |
77 'INTERSECT', | |
78 'INTO', | |
79 'IS', | |
80 'ISNULL', | |
81 'JOIN', | |
82 'KEY', | |
83 'LEFT', | |
84 'LIKE', | |
85 'LIMIT', | |
86 'MATCH', | |
87 'NATURAL', | |
88 'NO', | |
89 'NOT', | |
90 'NOTNULL', | |
91 'NULL', | |
92 'OF', | |
93 'OFFSET', | |
94 'ON', | |
95 'OR', | |
96 'ORDER', | |
97 'OUTER', | |
98 'PLAN', | |
99 'PRAGMA', | |
100 'PRIMARY', | |
101 'QUERY', | |
102 'RAISE', | |
103 'RECURSIVE', | |
104 'REFERENCES', | |
105 'REGEXP', | |
106 'REINDEX', | |
107 'RELEASE', | |
108 'RENAME', | |
109 'REPLACE', | |
110 'RESTRICT', | |
111 'RIGHT', | |
112 'ROLLBACK', | |
113 'ROW', | |
114 'SAVEPOINT', | |
115 'SELECT', | |
116 'SET', | |
117 'TABLE', | |
118 'TEMP', | |
119 'TEMPORARY', | |
120 'THEN', | |
121 'TO', | |
122 'TRANSACTION', | |
123 'TRIGGER', | |
124 'UNION', | |
125 'UNIQUE', | |
126 'UPDATE', | |
127 'USING', | |
128 'VACUUM', | |
129 'VALUES', | |
130 'VIEW', | |
131 'VIRTUAL', | |
132 'WHEN', | |
133 'WHERE', | |
134 'WITH', | |
135 'WITHOUT' | |
136 ] | |
137 | |
138 | |
139 def get_valid_column_name(name): | |
140 valid_name = name | |
141 if not name or not name.strip(): | |
142 return None | |
143 elif name.upper() in SQLITE_KEYWORDS: | |
144 valid_name = '"%s"' % name | |
145 elif re.match('^[a-zA-Z]\w*$', name): | |
146 pass | |
147 elif re.match('^"[^"]+"$', name): | |
148 pass | |
149 elif re.match('^\[[^\[\]]*\]$', name): | |
150 pass | |
151 elif re.match("^`[^`]+`$", name): | |
152 pass | |
153 elif name.find('"') < 0: | |
154 valid_name = '"%s"' % name | |
155 elif name.find('[') < 0 and name.find(']') < 0: | |
156 valid_name = '[%s]' % name | |
157 elif name.find('`') < 0: | |
158 valid_name = '`%s`' % name | |
159 elif name.find("'") < 0: | |
160 valid_name = "'%s'" % name | |
161 return valid_name | |
8 | 162 |
9 | 163 |
10 def getValueType(val): | 164 def getValueType(val): |
11 if val or 0. == val: | 165 if val or 0. == val: |
12 try: | 166 try: |
21 return None | 175 return None |
22 | 176 |
23 | 177 |
24 def get_column_def(file_path, table_name, skip=0, comment_char='#', | 178 def get_column_def(file_path, table_name, skip=0, comment_char='#', |
25 column_names=None, max_lines=100, load_named_columns=False, | 179 column_names=None, max_lines=100, load_named_columns=False, |
26 filters=None): | 180 firstlinenames=False, filters=None): |
27 col_pref = ['TEXT', 'REAL', 'INTEGER', None] | 181 col_pref = ['TEXT', 'REAL', 'INTEGER', None] |
28 col_types = [] | 182 col_types = [] |
29 col_idx = None | 183 col_idx = None |
184 col_names = [] | |
30 try: | 185 try: |
31 tr = TabularReader(file_path, skip=skip, comment_char=comment_char, | 186 tr = TabularReader(file_path, skip=skip, comment_char=comment_char, |
32 col_idx=None, filters=filters) | 187 col_idx=None, filters=filters) |
33 for linenum, fields in enumerate(tr): | 188 for linenum, fields in enumerate(tr): |
189 if linenum == 0 and firstlinenames: | |
190 col_names = [get_valid_column_name(name) or 'c%d' % (i + 1) | |
191 for i, name in enumerate(fields)] | |
192 continue | |
34 if linenum > max_lines: | 193 if linenum > max_lines: |
35 break | 194 break |
36 try: | 195 try: |
37 while len(col_types) < len(fields): | 196 while len(col_types) < len(fields): |
38 col_types.append(None) | 197 col_types.append(None) |
46 except Exception as e: | 205 except Exception as e: |
47 print('Failed: %s' % (e), file=sys.stderr) | 206 print('Failed: %s' % (e), file=sys.stderr) |
48 for i, col_type in enumerate(col_types): | 207 for i, col_type in enumerate(col_types): |
49 if not col_type: | 208 if not col_type: |
50 col_types[i] = 'TEXT' | 209 col_types[i] = 'TEXT' |
210 if not col_names: | |
211 col_names = ['c%d' % i for i in range(1, len(col_types) + 1)] | |
51 if column_names: | 212 if column_names: |
52 col_names = [] | |
53 if load_named_columns: | 213 if load_named_columns: |
54 col_idx = [] | 214 col_idx = [] |
215 cnames = [] | |
55 for i, cname in enumerate( | 216 for i, cname in enumerate( |
56 [cn.strip() for cn in column_names.split(',')]): | 217 [cn.strip() for cn in column_names.split(',')]): |
57 if cname != '': | 218 if cname != '': |
58 col_idx.append(i) | 219 col_idx.append(i) |
59 col_names.append(cname) | 220 cnames.append(cname) |
60 col_types = [col_types[i] for i in col_idx] | 221 col_types = [col_types[i] for i in col_idx] |
222 col_names = cnames | |
61 else: | 223 else: |
62 col_names = ['c%d' % i for i in range(1, len(col_types) + 1)] | |
63 for i, cname in enumerate( | 224 for i, cname in enumerate( |
64 [cn.strip() for cn in column_names.split(',')]): | 225 [cn.strip() for cn in column_names.split(',')]): |
65 if cname and i < len(col_names): | 226 if cname and i < len(col_names): |
66 col_names[i] = cname | 227 col_names[i] = cname |
67 else: | |
68 col_names = ['c%d' % i for i in range(1, len(col_types) + 1)] | |
69 col_def = [] | 228 col_def = [] |
70 for i, col_name in enumerate(col_names): | 229 for i, col_name in enumerate(col_names): |
71 col_def.append('%s %s' % (col_names[i], col_types[i])) | 230 col_def.append('%s %s' % (col_names[i], col_types[i])) |
72 return col_names, col_types, col_def, col_idx | 231 return col_names, col_types, col_def, col_idx |
73 | 232 |
74 | 233 |
75 def create_table(conn, file_path, table_name, skip=0, comment_char='#', | 234 def create_table(conn, file_path, table_name, skip=0, comment_char='#', |
76 pkey_autoincr=None, column_names=None, | 235 pkey_autoincr=None, column_names=None, |
77 load_named_columns=False, filters=None, | 236 load_named_columns=False, firstlinenames=False, |
78 unique_indexes=[], indexes=[]): | 237 filters=None, unique_indexes=[], indexes=[]): |
79 col_names, col_types, col_def, col_idx = \ | 238 col_names, col_types, col_def, col_idx = \ |
80 get_column_def(file_path, table_name, skip=skip, | 239 get_column_def(file_path, table_name, skip=skip, |
81 comment_char=comment_char, column_names=column_names, | 240 comment_char=comment_char, column_names=column_names, |
82 load_named_columns=load_named_columns, filters=filters) | 241 load_named_columns=load_named_columns, |
242 firstlinenames=firstlinenames, | |
243 filters=filters) | |
83 col_func = [float if t == 'REAL' else int | 244 col_func = [float if t == 'REAL' else int |
84 if t == 'INTEGER' else str for t in col_types] | 245 if t == 'INTEGER' else str for t in col_types] |
85 table_def = 'CREATE TABLE %s (\n %s%s\n);' % ( | 246 table_def = 'CREATE TABLE %s (\n %s%s\n);' % ( |
86 table_name, | 247 table_name, |
87 '%s INTEGER PRIMARY KEY AUTOINCREMENT,' % | 248 '%s INTEGER PRIMARY KEY AUTOINCREMENT,' % |
96 try: | 257 try: |
97 c = conn.cursor() | 258 c = conn.cursor() |
98 c.execute(table_def) | 259 c.execute(table_def) |
99 conn.commit() | 260 conn.commit() |
100 c.close() | 261 c.close() |
101 for i, index in enumerate(unique_indexes): | 262 |
102 index_name = 'idx_uniq_%s_%d' % (table_name, i) | |
103 index_columns = index.split(',') | |
104 create_index(conn, table_name, index_name, index_columns, | |
105 unique=True) | |
106 for i, index in enumerate(indexes): | |
107 index_name = 'idx_%s_%d' % (table_name, i) | |
108 index_columns = index.split(',') | |
109 create_index(conn, table_name, index_name, index_columns) | |
110 c = conn.cursor() | 263 c = conn.cursor() |
111 tr = TabularReader(file_path, skip=skip, comment_char=comment_char, | 264 tr = TabularReader(file_path, skip=skip, comment_char=comment_char, |
112 col_idx=col_idx, filters=filters) | 265 col_idx=col_idx, filters=filters) |
113 for linenum, fields in enumerate(tr): | 266 for linenum, fields in enumerate(tr): |
267 if linenum == 0 and firstlinenames: | |
268 continue | |
114 data_lines += 1 | 269 data_lines += 1 |
115 try: | 270 try: |
116 vals = [col_func[i](x) | 271 vals = [col_func[i](x) |
117 if x else None for i, x in enumerate(fields)] | 272 if x else None for i, x in enumerate(fields)] |
118 c.execute(insert_stmt, vals) | 273 c.execute(insert_stmt, vals) |
119 except Exception as e: | 274 except Exception as e: |
120 print('Failed at line: %d err: %s' % (linenum, e), | 275 print('Failed at line: %d err: %s' % (linenum, e), |
121 file=sys.stderr) | 276 file=sys.stderr) |
122 conn.commit() | 277 conn.commit() |
123 c.close() | 278 c.close() |
279 for i, index in enumerate(unique_indexes): | |
280 index_name = 'idx_uniq_%s_%d' % (table_name, i) | |
281 index_columns = index.split(',') | |
282 create_index(conn, table_name, index_name, index_columns, | |
283 unique=True) | |
284 for i, index in enumerate(indexes): | |
285 index_name = 'idx_%s_%d' % (table_name, i) | |
286 index_columns = index.split(',') | |
287 create_index(conn, table_name, index_name, index_columns) | |
124 except Exception as e: | 288 except Exception as e: |
125 exit('Error: %s' % (e)) | 289 exit('Error: %s' % (e)) |
126 | 290 |
127 | 291 |
128 def create_index(conn, table_name, index_name, index_columns, unique=False): | 292 def create_index(conn, table_name, index_name, index_columns, unique=False): |
129 index_def = "CREATE %s INDEX %s on %s(%s)" % ( | 293 index_def = 'CREATE %s INDEX %s on %s(%s)' % ( |
130 'UNIQUE' if unique else '', index_name, | 294 'UNIQUE' if unique else '', index_name, |
131 table_name, ','.join(index_columns)) | 295 table_name, ','.join(index_columns)) |
132 c = conn.cursor() | 296 try: |
133 c.execute(index_def) | 297 c = conn.cursor() |
134 conn.commit() | 298 c.execute(index_def) |
135 c.close() | 299 conn.commit() |
300 c.close() | |
301 except Exception as e: | |
302 print('Failed: %s err: %s' % (index_def, e), file=sys.stderr) | |
303 raise(e) |