annotate load_db.py @ 21:357fe86f245d draft

Uploaded
author jjohnson
date Fri, 14 Jul 2017 17:34:22 -0400
parents ab27c4bd14b9
children bed5018e7ae3
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
20
ab27c4bd14b9 Uploaded
jjohnson
parents:
diff changeset
1 #!/usr/bin/env python
ab27c4bd14b9 Uploaded
jjohnson
parents:
diff changeset
2
ab27c4bd14b9 Uploaded
jjohnson
parents:
diff changeset
3 from __future__ import print_function
ab27c4bd14b9 Uploaded
jjohnson
parents:
diff changeset
4
ab27c4bd14b9 Uploaded
jjohnson
parents:
diff changeset
5 import sys
ab27c4bd14b9 Uploaded
jjohnson
parents:
diff changeset
6
ab27c4bd14b9 Uploaded
jjohnson
parents:
diff changeset
7 from filters import TabularReader
ab27c4bd14b9 Uploaded
jjohnson
parents:
diff changeset
8
ab27c4bd14b9 Uploaded
jjohnson
parents:
diff changeset
9
ab27c4bd14b9 Uploaded
jjohnson
parents:
diff changeset
10 def getValueType(val):
ab27c4bd14b9 Uploaded
jjohnson
parents:
diff changeset
11 if val or 0. == val:
ab27c4bd14b9 Uploaded
jjohnson
parents:
diff changeset
12 try:
ab27c4bd14b9 Uploaded
jjohnson
parents:
diff changeset
13 int(val)
ab27c4bd14b9 Uploaded
jjohnson
parents:
diff changeset
14 return 'INTEGER'
ab27c4bd14b9 Uploaded
jjohnson
parents:
diff changeset
15 except:
ab27c4bd14b9 Uploaded
jjohnson
parents:
diff changeset
16 try:
ab27c4bd14b9 Uploaded
jjohnson
parents:
diff changeset
17 float(val)
ab27c4bd14b9 Uploaded
jjohnson
parents:
diff changeset
18 return 'REAL'
ab27c4bd14b9 Uploaded
jjohnson
parents:
diff changeset
19 except:
ab27c4bd14b9 Uploaded
jjohnson
parents:
diff changeset
20 return 'TEXT'
ab27c4bd14b9 Uploaded
jjohnson
parents:
diff changeset
21 return None
ab27c4bd14b9 Uploaded
jjohnson
parents:
diff changeset
22
ab27c4bd14b9 Uploaded
jjohnson
parents:
diff changeset
23
ab27c4bd14b9 Uploaded
jjohnson
parents:
diff changeset
24 def get_column_def(file_path, table_name, skip=0, comment_char='#',
ab27c4bd14b9 Uploaded
jjohnson
parents:
diff changeset
25 column_names=None, max_lines=100, load_named_columns=False,
ab27c4bd14b9 Uploaded
jjohnson
parents:
diff changeset
26 filters=None):
ab27c4bd14b9 Uploaded
jjohnson
parents:
diff changeset
27 col_pref = ['TEXT', 'REAL', 'INTEGER', None]
ab27c4bd14b9 Uploaded
jjohnson
parents:
diff changeset
28 col_types = []
ab27c4bd14b9 Uploaded
jjohnson
parents:
diff changeset
29 col_idx = None
ab27c4bd14b9 Uploaded
jjohnson
parents:
diff changeset
30 try:
ab27c4bd14b9 Uploaded
jjohnson
parents:
diff changeset
31 tr = TabularReader(file_path, skip=skip, comment_char=comment_char,
ab27c4bd14b9 Uploaded
jjohnson
parents:
diff changeset
32 col_idx=None, filters=filters)
ab27c4bd14b9 Uploaded
jjohnson
parents:
diff changeset
33 for linenum, fields in enumerate(tr):
ab27c4bd14b9 Uploaded
jjohnson
parents:
diff changeset
34 if linenum > max_lines:
ab27c4bd14b9 Uploaded
jjohnson
parents:
diff changeset
35 break
ab27c4bd14b9 Uploaded
jjohnson
parents:
diff changeset
36 try:
ab27c4bd14b9 Uploaded
jjohnson
parents:
diff changeset
37 while len(col_types) < len(fields):
ab27c4bd14b9 Uploaded
jjohnson
parents:
diff changeset
38 col_types.append(None)
ab27c4bd14b9 Uploaded
jjohnson
parents:
diff changeset
39 for i, val in enumerate(fields):
ab27c4bd14b9 Uploaded
jjohnson
parents:
diff changeset
40 colType = getValueType(val)
ab27c4bd14b9 Uploaded
jjohnson
parents:
diff changeset
41 if col_pref.index(colType) < col_pref.index(col_types[i]):
ab27c4bd14b9 Uploaded
jjohnson
parents:
diff changeset
42 col_types[i] = colType
ab27c4bd14b9 Uploaded
jjohnson
parents:
diff changeset
43 except Exception as e:
ab27c4bd14b9 Uploaded
jjohnson
parents:
diff changeset
44 print('Failed at line: %d err: %s' % (linenum, e),
ab27c4bd14b9 Uploaded
jjohnson
parents:
diff changeset
45 file=sys.stderr)
ab27c4bd14b9 Uploaded
jjohnson
parents:
diff changeset
46 except Exception as e:
ab27c4bd14b9 Uploaded
jjohnson
parents:
diff changeset
47 print('Failed: %s' % (e), file=sys.stderr)
ab27c4bd14b9 Uploaded
jjohnson
parents:
diff changeset
48 for i, col_type in enumerate(col_types):
ab27c4bd14b9 Uploaded
jjohnson
parents:
diff changeset
49 if not col_type:
ab27c4bd14b9 Uploaded
jjohnson
parents:
diff changeset
50 col_types[i] = 'TEXT'
ab27c4bd14b9 Uploaded
jjohnson
parents:
diff changeset
51 if column_names:
ab27c4bd14b9 Uploaded
jjohnson
parents:
diff changeset
52 col_names = []
ab27c4bd14b9 Uploaded
jjohnson
parents:
diff changeset
53 if load_named_columns:
ab27c4bd14b9 Uploaded
jjohnson
parents:
diff changeset
54 col_idx = []
ab27c4bd14b9 Uploaded
jjohnson
parents:
diff changeset
55 for i, cname in enumerate(
ab27c4bd14b9 Uploaded
jjohnson
parents:
diff changeset
56 [cn.strip() for cn in column_names.split(',')]):
ab27c4bd14b9 Uploaded
jjohnson
parents:
diff changeset
57 if cname != '':
ab27c4bd14b9 Uploaded
jjohnson
parents:
diff changeset
58 col_idx.append(i)
ab27c4bd14b9 Uploaded
jjohnson
parents:
diff changeset
59 col_names.append(cname)
ab27c4bd14b9 Uploaded
jjohnson
parents:
diff changeset
60 col_types = [col_types[i] for i in col_idx]
ab27c4bd14b9 Uploaded
jjohnson
parents:
diff changeset
61 else:
ab27c4bd14b9 Uploaded
jjohnson
parents:
diff changeset
62 col_names = ['c%d' % i for i in range(1, len(col_types) + 1)]
ab27c4bd14b9 Uploaded
jjohnson
parents:
diff changeset
63 for i, cname in enumerate(
ab27c4bd14b9 Uploaded
jjohnson
parents:
diff changeset
64 [cn.strip() for cn in column_names.split(',')]):
ab27c4bd14b9 Uploaded
jjohnson
parents:
diff changeset
65 if cname and i < len(col_names):
ab27c4bd14b9 Uploaded
jjohnson
parents:
diff changeset
66 col_names[i] = cname
ab27c4bd14b9 Uploaded
jjohnson
parents:
diff changeset
67 else:
ab27c4bd14b9 Uploaded
jjohnson
parents:
diff changeset
68 col_names = ['c%d' % i for i in range(1, len(col_types) + 1)]
ab27c4bd14b9 Uploaded
jjohnson
parents:
diff changeset
69 col_def = []
ab27c4bd14b9 Uploaded
jjohnson
parents:
diff changeset
70 for i, col_name in enumerate(col_names):
ab27c4bd14b9 Uploaded
jjohnson
parents:
diff changeset
71 col_def.append('%s %s' % (col_names[i], col_types[i]))
ab27c4bd14b9 Uploaded
jjohnson
parents:
diff changeset
72 return col_names, col_types, col_def, col_idx
ab27c4bd14b9 Uploaded
jjohnson
parents:
diff changeset
73
ab27c4bd14b9 Uploaded
jjohnson
parents:
diff changeset
74
ab27c4bd14b9 Uploaded
jjohnson
parents:
diff changeset
75 def create_table(conn, file_path, table_name, skip=0, comment_char='#',
ab27c4bd14b9 Uploaded
jjohnson
parents:
diff changeset
76 pkey_autoincr=None, column_names=None,
ab27c4bd14b9 Uploaded
jjohnson
parents:
diff changeset
77 load_named_columns=False, filters=None,
ab27c4bd14b9 Uploaded
jjohnson
parents:
diff changeset
78 unique_indexes=[], indexes=[]):
ab27c4bd14b9 Uploaded
jjohnson
parents:
diff changeset
79 col_names, col_types, col_def, col_idx = \
ab27c4bd14b9 Uploaded
jjohnson
parents:
diff changeset
80 get_column_def(file_path, table_name, skip=skip,
ab27c4bd14b9 Uploaded
jjohnson
parents:
diff changeset
81 comment_char=comment_char, column_names=column_names,
ab27c4bd14b9 Uploaded
jjohnson
parents:
diff changeset
82 load_named_columns=load_named_columns, filters=filters)
ab27c4bd14b9 Uploaded
jjohnson
parents:
diff changeset
83 col_func = [float if t == 'REAL' else int
ab27c4bd14b9 Uploaded
jjohnson
parents:
diff changeset
84 if t == 'INTEGER' else str for t in col_types]
ab27c4bd14b9 Uploaded
jjohnson
parents:
diff changeset
85 table_def = 'CREATE TABLE %s (\n %s%s\n);' % (
ab27c4bd14b9 Uploaded
jjohnson
parents:
diff changeset
86 table_name,
ab27c4bd14b9 Uploaded
jjohnson
parents:
diff changeset
87 '%s INTEGER PRIMARY KEY AUTOINCREMENT,' %
ab27c4bd14b9 Uploaded
jjohnson
parents:
diff changeset
88 pkey_autoincr if pkey_autoincr else '',
ab27c4bd14b9 Uploaded
jjohnson
parents:
diff changeset
89 ', \n '.join(col_def))
ab27c4bd14b9 Uploaded
jjohnson
parents:
diff changeset
90 # print >> sys.stdout, table_def
ab27c4bd14b9 Uploaded
jjohnson
parents:
diff changeset
91 insert_stmt = 'INSERT INTO %s(%s) VALUES(%s)' % (
ab27c4bd14b9 Uploaded
jjohnson
parents:
diff changeset
92 table_name, ','.join(col_names),
ab27c4bd14b9 Uploaded
jjohnson
parents:
diff changeset
93 ','.join(["?" for x in col_names]))
ab27c4bd14b9 Uploaded
jjohnson
parents:
diff changeset
94 # print >> sys.stdout, insert_stmt
ab27c4bd14b9 Uploaded
jjohnson
parents:
diff changeset
95 data_lines = 0
ab27c4bd14b9 Uploaded
jjohnson
parents:
diff changeset
96 try:
ab27c4bd14b9 Uploaded
jjohnson
parents:
diff changeset
97 c = conn.cursor()
ab27c4bd14b9 Uploaded
jjohnson
parents:
diff changeset
98 c.execute(table_def)
ab27c4bd14b9 Uploaded
jjohnson
parents:
diff changeset
99 conn.commit()
ab27c4bd14b9 Uploaded
jjohnson
parents:
diff changeset
100 c.close()
ab27c4bd14b9 Uploaded
jjohnson
parents:
diff changeset
101 for i, index in enumerate(unique_indexes):
ab27c4bd14b9 Uploaded
jjohnson
parents:
diff changeset
102 index_name = 'idx_uniq_%s_%d' % (table_name, i)
ab27c4bd14b9 Uploaded
jjohnson
parents:
diff changeset
103 index_columns = index.split(',')
ab27c4bd14b9 Uploaded
jjohnson
parents:
diff changeset
104 create_index(conn, table_name, index_name, index_columns,
ab27c4bd14b9 Uploaded
jjohnson
parents:
diff changeset
105 unique=True)
ab27c4bd14b9 Uploaded
jjohnson
parents:
diff changeset
106 for i, index in enumerate(indexes):
ab27c4bd14b9 Uploaded
jjohnson
parents:
diff changeset
107 index_name = 'idx_%s_%d' % (table_name, i)
ab27c4bd14b9 Uploaded
jjohnson
parents:
diff changeset
108 index_columns = index.split(',')
ab27c4bd14b9 Uploaded
jjohnson
parents:
diff changeset
109 create_index(conn, table_name, index_name, index_columns)
ab27c4bd14b9 Uploaded
jjohnson
parents:
diff changeset
110 c = conn.cursor()
ab27c4bd14b9 Uploaded
jjohnson
parents:
diff changeset
111 tr = TabularReader(file_path, skip=skip, comment_char=comment_char,
ab27c4bd14b9 Uploaded
jjohnson
parents:
diff changeset
112 col_idx=col_idx, filters=filters)
ab27c4bd14b9 Uploaded
jjohnson
parents:
diff changeset
113 for linenum, fields in enumerate(tr):
ab27c4bd14b9 Uploaded
jjohnson
parents:
diff changeset
114 data_lines += 1
ab27c4bd14b9 Uploaded
jjohnson
parents:
diff changeset
115 try:
ab27c4bd14b9 Uploaded
jjohnson
parents:
diff changeset
116 vals = [col_func[i](x)
ab27c4bd14b9 Uploaded
jjohnson
parents:
diff changeset
117 if x else None for i, x in enumerate(fields)]
ab27c4bd14b9 Uploaded
jjohnson
parents:
diff changeset
118 c.execute(insert_stmt, vals)
ab27c4bd14b9 Uploaded
jjohnson
parents:
diff changeset
119 except Exception as e:
ab27c4bd14b9 Uploaded
jjohnson
parents:
diff changeset
120 print('Failed at line: %d err: %s' % (linenum, e),
ab27c4bd14b9 Uploaded
jjohnson
parents:
diff changeset
121 file=sys.stderr)
ab27c4bd14b9 Uploaded
jjohnson
parents:
diff changeset
122 conn.commit()
ab27c4bd14b9 Uploaded
jjohnson
parents:
diff changeset
123 c.close()
ab27c4bd14b9 Uploaded
jjohnson
parents:
diff changeset
124 except Exception as e:
ab27c4bd14b9 Uploaded
jjohnson
parents:
diff changeset
125 print('Failed: %s' % (e), file=sys.stderr)
ab27c4bd14b9 Uploaded
jjohnson
parents:
diff changeset
126 exit(1)
ab27c4bd14b9 Uploaded
jjohnson
parents:
diff changeset
127
ab27c4bd14b9 Uploaded
jjohnson
parents:
diff changeset
128
ab27c4bd14b9 Uploaded
jjohnson
parents:
diff changeset
129 def create_index(conn, table_name, index_name, index_columns, unique=False):
ab27c4bd14b9 Uploaded
jjohnson
parents:
diff changeset
130 index_def = "CREATE %s INDEX %s on %s(%s)" % (
ab27c4bd14b9 Uploaded
jjohnson
parents:
diff changeset
131 'UNIQUE' if unique else '', index_name,
ab27c4bd14b9 Uploaded
jjohnson
parents:
diff changeset
132 table_name, ','.join(index_columns))
ab27c4bd14b9 Uploaded
jjohnson
parents:
diff changeset
133 c = conn.cursor()
ab27c4bd14b9 Uploaded
jjohnson
parents:
diff changeset
134 c.execute(index_def)
ab27c4bd14b9 Uploaded
jjohnson
parents:
diff changeset
135 conn.commit()
ab27c4bd14b9 Uploaded
jjohnson
parents:
diff changeset
136 c.close()