Mercurial > repos > shellac > sam_consensus_v3
comparison env/lib/python3.9/site-packages/rdflib/tools/csv2rdf.py @ 0:4f3585e2f14b draft default tip
"planemo upload commit 60cee0fc7c0cda8592644e1aad72851dec82c959"
author | shellac |
---|---|
date | Mon, 22 Mar 2021 18:12:50 +0000 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:4f3585e2f14b |
---|---|
1 """ | |
2 A commandline tool for semi-automatically converting CSV to RDF | |
3 | |
4 try: ``csv2rdf --help`` | |
5 | |
6 """ | |
7 | |
8 from __future__ import print_function | |
9 | |
10 import sys | |
11 import re | |
12 import csv | |
13 import getopt | |
14 import fileinput | |
15 import codecs | |
16 import time | |
17 import datetime | |
18 import warnings | |
19 | |
20 | |
21 import rdflib | |
22 | |
23 from six.moves import configparser | |
24 from six.moves.urllib.parse import quote | |
25 from six import text_type | |
26 | |
27 from rdflib import RDF, RDFS | |
28 from rdflib.namespace import split_uri | |
29 | |
30 __all__ = ['CSV2RDF'] | |
31 | |
# Usage text printed by ``-h`` / ``--help``.  A backslash at the end of a line
# inside this (non-raw) literal joins it with the next line when printed.
HELP = """
csv2rdf.py \
-b <instance-base> \
-p <property-base> \
[-D <default>] \
[-c <classname>] \
[-i <identity column(s)>] \
[-l <label columns>] \
[-s <N>] [-o <output>] \
[-f configfile] \
[--col<N> <colspec>] \
[--prop<N> <property>] \
[-d <delim>] \
[-C] [files...]

Reads csv files from stdin or given files
if -d is given, use this delimiter
if -s is given, skips N lines at the start
Creates a URI from the columns given to -i, or automatically by numbering if
none is given
Outputs RDFS labels from the columns given to -l
if -c is given adds a type triple with the given classname
if -C is given, the class is defined as rdfs:Class
Outputs one RDF triple per column in each row.
Output is in n3 format.
Output is stdout, unless -o is specified

Long options also supported: \
--base, \
--propbase, \
--ident, \
--class, \
--label, \
--out, \
--defineclass

Long options --col0, --col1, ...
can be used to specify conversion for columns.
Conversions can be:
ignore, float(), int(), split(sep, [more]), uri(base, [class]), date(format)

Long options --prop0, --prop1, ...
can be used to use specific properties, rather than ones auto-generated
from the headers

-D sets the default conversion for columns not listed

-f says to read config from a .ini/config file - the file must contain one
section called csv2rdf, with keys like the long options, i.e.:

[csv2rdf]
out=output.n3
base=http://example.org/
col0=split(";")
col1=split(";", uri("http://example.org/things/",
    "http://xmlns.com/foaf/0.1/Person"))
col2=float()
col3=int()
col4=date("%Y-%b-%d %H:%M:%S")

"""

# Module-level registry filled by prefixuri(): maps each raw cell value that
# was turned into a URI to a ``(URIRef, class_)`` pair.  CSV2RDF.convert()
# reads it at the end to emit labels and types for the generated URIs.
uris = {}
96 | |
97 | |
98 def toProperty(label): | |
99 """ | |
100 CamelCase + lowercase inital a string | |
101 | |
102 | |
103 FIRST_NM => firstNm | |
104 | |
105 firstNm => firstNm | |
106 | |
107 """ | |
108 label = re.sub("[^\w]", " ", label) | |
109 label = re.sub("([a-z])([A-Z])", "\\1 \\2", label) | |
110 label = label.split(" ") | |
111 return "".join([label[0].lower()] + [x.capitalize() for x in label[1:]]) | |
112 | |
113 | |
def toPropertyLabel(label):
    """
    Lower-case the first character of ``label`` for use as a property label.

    When the second character is upper case the label is returned untouched.

    :param label: label text
    :return: the adjusted label
    """
    second_is_upper = label[1:2].isupper()
    if second_is_upper:
        return label
    head, tail = label[:1], label[1:]
    return head.lower() + tail
118 | |
119 | |
def index(l, i):
    """Pick the items of ``l`` found at the positions listed in ``i``.

    The result is a tuple, in the order the positions were given.

    >>> index([1,2,3],(0,2))
    (1, 3)
    """
    picked = (l[position] for position in i)
    return tuple(picked)
126 | |
127 | |
def csv_reader(csv_data, dialect=csv.excel, **kwargs):
    """
    Yield the rows of ``csv_data`` as lists of text cells.

    :param csv_data: iterable of lines, handed to ``csv.reader``
    :param dialect: csv dialect, ``csv.excel`` by default
    :param kwargs: extra formatting parameters passed on to ``csv.reader``
        (e.g. ``delimiter``)
    :return: a generator with one list of strings per row
    """
    for row in csv.reader(csv_data, dialect=dialect, **kwargs):
        # Only byte cells need decoding (UTF-8, undecodable bytes replaced).
        # Cells that are already text are passed through: calling the text
        # type with an encoding on a text value raises TypeError.
        yield [
            cell.decode("utf-8", "replace") if isinstance(cell, bytes) else cell
            for cell in row
        ]
135 | |
136 | |
def prefixuri(x, prefix, class_=None):
    """
    Build a URIRef for the cell value ``x`` and remember it in ``uris``.

    With a ``prefix`` the value has its spaces turned into underscores, is
    UTF-8 encoded and percent-quoted (nothing is treated as safe) and is
    appended to the prefix.  Without one, ``x`` itself is used as the URI.

    :param x: cell text
    :param prefix: base to prepend, or a false value to use ``x`` verbatim
    :param class_: optional class recorded alongside the URI
    :return: the ``rdflib.URIRef``
    """
    if prefix:
        # Replace on the text *before* encoding: bytes.replace() with str
        # arguments is a TypeError on Python 3.
        local = quote(x.replace(" ", "_").encode("utf8"), safe="")
        r = rdflib.URIRef(prefix + local)
    else:
        r = rdflib.URIRef(x)
    uris[x] = (r, class_)
    return r
146 | |
147 # meta-language for config | |
148 | |
149 | |
class NodeMaker(object):
    """Base of the column converters: calling it wraps a value in a Literal."""

    def __call__(self, x):
        """Return ``x`` as an ``rdflib.Literal``."""
        return rdflib.Literal(x)

    def range(self):
        """Return the term reported as range for this converter (rdfs:Literal)."""
        return rdflib.RDFS.Literal
156 | |
157 | |
class NodeUri(NodeMaker):
    """Converter that turns cell values into URIs via ``prefixuri``."""

    def __init__(self, prefix, class_):
        """
        :param prefix: base handed to ``prefixuri`` for every value
        :param class_: class URI for the generated resources, or a false
            value for none
        """
        self.prefix = prefix
        if class_:
            self.class_ = rdflib.URIRef(class_)
        else:
            self.class_ = None

    def __call__(self, x):
        """Return the URI for cell value ``x``."""
        return prefixuri(x, self.prefix, self.class_)

    def range(self):
        """Return the configured class, or rdfs:Resource when none was given."""
        # The generic resource class lives in the RDFS vocabulary; the RDF
        # vocabulary has no "Resource" term.
        return self.class_ or rdflib.RDFS.Resource
171 | |
172 | |
class NodeLiteral(NodeMaker):
    """Literal converter that carries an optional extra argument ``f``.

    Subclasses decide what ``f`` means; this class only stores it.
    """

    def __init__(self, f=None):
        """Remember ``f`` (``None`` when not given)."""
        self.f = f
176 | |
177 | |
class NodeFloat(NodeLiteral):
    """Converter producing float literals; ``range()`` reports xsd:double."""

    def range(self):
        """Return ``rdflib.XSD.double``."""
        return rdflib.XSD.double

    def __call__(self, x):
        """Convert ``x`` with ``float``, first passing it through ``f`` if set.

        :raises Exception: when ``f`` is set but not callable
        """
        preprocess = self.f
        if not preprocess:
            return rdflib.Literal(float(x))
        if not callable(preprocess):
            raise Exception("Function passed to float is not callable")
        return rdflib.Literal(float(preprocess(x)))
188 | |
189 | |
class NodeInt(NodeLiteral):
    """Converter producing integer literals; ``range()`` reports xsd:int."""

    def range(self):
        """Return ``rdflib.XSD.int``."""
        return rdflib.XSD.int

    def __call__(self, x):
        """Convert ``x`` with ``int``, first passing it through ``f`` if set.

        :raises Exception: when ``f`` is set but not callable
        """
        preprocess = self.f
        if not preprocess:
            return rdflib.Literal(int(x))
        if not callable(preprocess):
            raise Exception("Function passed to int is not callable")
        return rdflib.Literal(int(preprocess(x)))
200 | |
201 | |
class NodeBool(NodeLiteral):
    """Converter producing boolean literals; ``range()`` reports xsd:boolean."""

    def __call__(self, x):
        """Convert ``x`` with ``bool``, first passing it through ``f`` if set.

        NOTE(review): ``bool()`` of any non-empty string is ``True``, so raw
        text such as "false" or "0" converts to true unless ``f`` maps it to
        a falsy value first.

        :raises Exception: when ``f`` is set but not callable
        """
        if not self.f:
            return rdflib.Literal(bool(x))
        if callable(self.f):
            return rdflib.Literal(bool(self.f(x)))
        raise Exception("Function passed to bool is not callable")

    def range(self):
        """Return ``rdflib.XSD.boolean``."""
        # The XML Schema datatype is named "boolean"; "bool" is not defined.
        return rdflib.XSD.boolean
212 | |
213 | |
class NodeReplace(NodeMaker):
    """String substitution step: replaces ``a`` with ``b`` in a value.

    Calling it returns the result of ``str.replace`` (a plain string, not an
    rdflib node).
    """

    def __init__(self, a, b):
        """Store the text to look for (``a``) and its replacement (``b``)."""
        self.a, self.b = a, b

    def __call__(self, x):
        """Return ``x`` with every ``a`` replaced by ``b``."""
        old, new = self.a, self.b
        return x.replace(old, new)
221 | |
222 | |
class NodeDate(NodeLiteral):
    """Converter parsing values with ``strptime``; ``f`` is the format string."""

    def range(self):
        """Return ``rdflib.XSD.dateTime``."""
        return rdflib.XSD.dateTime

    def __call__(self, x):
        """Parse ``x`` using the format in ``self.f`` and wrap it in a Literal."""
        parsed = datetime.datetime.strptime(x, self.f)
        return rdflib.Literal(parsed)
229 | |
230 | |
class NodeSplit(NodeMaker):
    """Converter that splits a value and converts each non-empty piece."""

    def __init__(self, sep, f):
        """
        :param sep: separator passed to ``str.split``
        :param f: callable applied to every piece; a false value means
            ``rdflib.Literal`` is used (filled in on the first call)
        """
        self.sep, self.f = sep, f

    def __call__(self, x):
        """Return the list of converted, stripped, non-empty pieces of ``x``.

        :raises Exception: when ``f`` is set but not callable
        """
        # The default converter is stored on the instance on first use.
        self.f = self.f or rdflib.Literal
        if not callable(self.f):
            raise Exception("Function passed to split is not callable!")
        nodes = []
        for piece in x.split(self.sep):
            piece = piece.strip()
            if piece != "":
                nodes.append(self.f(piece))
        return nodes

    def range(self):
        """Return the inner converter's range when ``f`` is a NodeMaker,
        otherwise the NodeMaker default."""
        inner = self.f
        if inner and isinstance(inner, NodeMaker):
            return inner.range()
        return NodeMaker.range(self)
248 | |
249 | |
# Fallback maker whose range() is used for columns without an explicit
# conversion when property definitions are written out.
default_node_make = NodeMaker()


# --- factories made available to column specifications -------------------

def _config_ignore(*args, **kwargs):
    """Return the ``"ignore"`` marker, whatever arguments are passed."""
    return "ignore"


def _config_uri(prefix=None, class_=None):
    """Build a ``NodeUri`` for the given prefix and optional class."""
    return NodeUri(prefix, class_)


def _config_literal():
    """Build a plain ``NodeLiteral``."""
    return NodeLiteral()


def _config_float(f=None):
    """Build a ``NodeFloat``, optionally with a pre-processing callable."""
    return NodeFloat(f)


def _config_replace(a, b):
    """Build a ``NodeReplace`` substituting ``b`` for ``a``."""
    return NodeReplace(a, b)


def _config_int(f=None):
    """Build a ``NodeInt``, optionally with a pre-processing callable."""
    return NodeInt(f)


def _config_bool(f=None):
    """Build a ``NodeBool``, optionally with a pre-processing callable."""
    return NodeBool(f)


def _config_date(format_):
    """Build a ``NodeDate`` using ``format_`` as its strptime format."""
    return NodeDate(format_)


def _config_split(sep=None, f=None):
    """Build a ``NodeSplit`` for separator ``sep`` and piece converter ``f``."""
    return NodeSplit(sep, f)


# Names usable inside a column specification, mapped to their factories.
config_functions = dict(
    ignore=_config_ignore,
    uri=_config_uri,
    literal=_config_literal,
    float=_config_float,
    int=_config_int,
    date=_config_date,
    split=_config_split,
    replace=_config_replace,
    bool=_config_bool,
)
299 | |
300 | |
def column(v):
    """Return a function for column mapping

    ``v`` is a column specification such as ``float()`` or
    ``split(";", uri("http://example.org/"))``.  It is evaluated as a Python
    expression in which the names of ``config_functions`` are available.

    :param v: the specification text
    :return: whatever the expression evaluates to; a bare ``ignore`` yields
        the ``"ignore"`` marker, the same as ``ignore()``
    """
    # SECURITY: ``v`` is evaluated with eval(); it comes straight from the
    # command line or the config file and must never be untrusted input.
    # A copy is used as globals because eval() inserts ``__builtins__`` into
    # the mapping it receives, which would pollute the shared table.
    spec = eval(v, dict(config_functions))
    if spec is config_functions["ignore"]:
        # "ignore" written without parentheses names the factory itself;
        # call it so callers get the marker they compare against.
        spec = spec()
    return spec
305 | |
306 | |
class CSV2RDF(object):
    """
    Convert the rows of a CSV reader into N3 triples written to ``OUT``.

    The upper-case attributes are the settings; ``main()`` fills them from
    the command line and/or a config file before calling ``convert``.
    """

    def __init__(self):
        """Set every option to its default; output goes to stdout as UTF-8."""
        self.CLASS = None         # URI used as rdf:type of every row
        self.BASE = None          # namespace for the row URIs
        self.PROPBASE = None      # namespace for properties made from headers
        self.IDENT = 'auto'       # 'auto' (row number) or column index(es)
        self.LABEL = None         # column index(es) joined into rdfs:label
        self.DEFINECLASS = False  # also write class/property definitions
        self.SKIP = 0             # rows to drop before the header row
        self.DELIM = ","
        self.DEFAULT = None       # conversion for columns not in COLUMNS

        self.COLUMNS = {}         # column index -> conversion
        self.PROPS = {}           # column index -> property URI override

        # A codecs writer emits bytes, so on Python 3 it has to wrap the
        # binary buffer behind sys.stdout rather than the text stream.
        stream = getattr(sys.stdout, "buffer", sys.stdout)
        self.OUT = codecs.getwriter("utf-8")(stream, errors='replace')

        self.triples = 0          # number of triples written so far

    def triple(self, s, p, o):
        """Write one triple in N3 syntax to ``OUT`` and count it."""
        self.OUT.write("%s %s %s .\n" % (s.n3(), p.n3(), o.n3()))
        self.triples += 1

    def convert(self, csvreader):
        """
        Read ``csvreader`` to the end and write the resulting triples.

        The first row after the ``SKIP`` skipped rows is the header row.
        ``OUT`` is closed when done; progress and totals go to stderr.

        :param csvreader: iterator yielding one list of text cells per row
        """
        start = time.time()

        if self.OUT:
            sys.stderr.write("Output to %s\n" % self.OUT.name)

        # A single column index is accepted for IDENT and LABEL; index()
        # needs a sequence.  (LABEL is tested against None because column 0
        # is a valid, but falsy, index.)
        if self.IDENT != "auto" and not isinstance(self.IDENT, (tuple, list)):
            self.IDENT = (self.IDENT,)
        if self.LABEL is not None \
                and not isinstance(self.LABEL, (tuple, list)):
            self.LABEL = (self.LABEL,)

        if not self.BASE:
            warnings.warn("No base given, using http://example.org/instances/")
            self.BASE = rdflib.Namespace("http://example.org/instances/")

        if not self.PROPBASE:
            # the message names the namespace that is actually used below
            warnings.warn(
                "No property base given, using http://example.org/props/")
            self.PROPBASE = rdflib.Namespace("http://example.org/props/")

        # skip lines at the start
        for _ in range(self.SKIP):
            next(csvreader)

        # read header line (next() works on Python 2 and 3 iterators alike)
        header_labels = list(next(csvreader))
        headers = dict(
            enumerate([self.PROPBASE[toProperty(x)] for x in header_labels]))
        # override header properties if some are given
        for k, v in self.PROPS.items():
            headers[k] = v
            header_labels[k] = split_uri(v)[1]

        if self.DEFINECLASS:
            # output class/property definitions
            self.triple(self.CLASS, RDF.type, RDFS.Class)
            for i in range(len(headers)):
                h, label = headers[i], header_labels[i]
                if h == "" or label == "":
                    continue
                if self.COLUMNS.get(i, self.DEFAULT) == 'ignore':
                    continue
                # unlisted columns use the -D default when there is one
                maker = self.COLUMNS.get(
                    i, self.DEFAULT or default_node_make)
                self.triple(h, RDF.type, RDF.Property)
                self.triple(
                    h, RDFS.label, rdflib.Literal(toPropertyLabel(label)))
                self.triple(h, RDFS.domain, self.CLASS)
                self.triple(h, RDFS.range, maker.range())

        rows = 0
        for row in csvreader:
            try:
                if self.IDENT == 'auto':
                    uri = self.BASE["%d" % rows]
                else:
                    # replace on text, then encode: mixing bytes and str in
                    # replace() is a TypeError on Python 3
                    uri = self.BASE["_".join(
                        quote(x.replace(" ", "_").encode("utf8"), safe="")
                        for x in index(row, self.IDENT))]

                if self.LABEL:
                    self.triple(uri, RDFS.label, rdflib.Literal(
                        " ".join(index(row, self.LABEL))))

                if self.CLASS:
                    # type triple
                    self.triple(uri, RDF.type, self.CLASS)

                for i, x in enumerate(row):
                    x = x.strip()
                    if x == '':
                        continue
                    # listed conversion, else the -D default, else Literal
                    converter = self.COLUMNS.get(
                        i, self.DEFAULT or rdflib.Literal)
                    if converter == 'ignore':
                        continue
                    try:
                        o = converter(x)
                        if isinstance(o, list):
                            for _o in o:
                                self.triple(uri, headers[i], _o)
                        else:
                            self.triple(uri, headers[i], o)

                    except Exception as e:
                        # str(e) rather than e.message, which only exists
                        # on Python 2 exceptions
                        warnings.warn(
                            "Could not process value for column " +
                            "%d:%s in row %d, ignoring: %s " % (
                                i, headers.get(i), rows, e))

                rows += 1
                if rows % 100000 == 0:
                    sys.stderr.write(
                        "%d rows, %d triples, elapsed %.2fs.\n" % (
                            rows, self.triples, time.time() - start))
            except:  # noqa: E722 - re-raised below, nothing is swallowed
                sys.stderr.write("Error processing line: %d\n" % rows)
                raise

        # output types/labels for generated URIs
        classes = set()
        for text, (u, c) in uris.items():
            self.triple(u, RDFS.label, rdflib.Literal(text))
            if c:
                c = rdflib.URIRef(c)
                classes.add(c)
                self.triple(u, RDF.type, c)

        for c in classes:
            self.triple(c, RDF.type, RDFS.Class)

        self.OUT.close()
        sys.stderr.write(
            "Converted %d rows into %d triples.\n" % (rows, self.triples))
        sys.stderr.write("Took %.2f seconds.\n" % (time.time() - start))
441 | |
442 | |
def main():
    """
    Command line entry point: parse options, then convert the input files.

    Settings are read from the ``[csv2rdf]`` section of the file given with
    ``-f`` first; command line options are applied afterwards and win.
    ``-h``/``--help`` prints HELP and exits with status -1.
    """
    csv2rdf = CSV2RDF()

    argv = sys.argv[1:]
    # getopt only accepts long options it has been told about, and the
    # --col<N>/--prop<N> family is open-ended: register the ones present.
    numbered = sorted(set(
        m.group(1) + "="
        for m in (re.match(r"--((?:col|prop)\d+)(?:=|$)", a) for a in argv)
        if m))

    opts, files = getopt.getopt(
        argv,
        "hc:b:p:i:o:Cf:l:s:d:D:",
        # note the comma after "default=": without it the two adjacent
        # literals fuse into the single option "default=ident="
        ["out=", "base=", "delim=", "propbase=", "class=", "default=",
         "ident=", "label=", "skip=", "defineclass", "help"] + numbered)
    opts = dict(opts)

    def given(*names):
        """Return the value of the last of ``names`` found in opts, or None."""
        value = None
        for name in names:
            if name in opts:
                value = opts[name]
        return value

    if "-h" in opts or "--help" in opts:
        print(HELP)
        sys.exit(-1)

    # SECURITY: column specs (via column()) and the ident/label values below
    # are evaluated with eval().  They come from the command line or the
    # config file and must never be untrusted input.

    if "-f" in opts:
        config = configparser.ConfigParser()
        # read_file() replaces the deprecated readfp(); fall back where the
        # parser class does not offer it.
        read_file = getattr(config, "read_file", None) or config.readfp
        with open(opts["-f"]) as fh:
            read_file(fh)
        # raw=True: values such as date("%Y-%b-%d") contain "%" and must not
        # go through configparser's interpolation.
        for k, v in config.items("csv2rdf", raw=True):
            if k == "out":
                csv2rdf.OUT = codecs.open(v, "w", "utf-8")
            elif k == "base":
                csv2rdf.BASE = rdflib.Namespace(v)
            elif k == "propbase":
                csv2rdf.PROPBASE = rdflib.Namespace(v)
            elif k == "class":
                csv2rdf.CLASS = rdflib.URIRef(v)
            elif k == "defineclass":
                # bool() of any non-empty text, "False" included, is True
                csv2rdf.DEFINECLASS = v.strip().lower() not in (
                    "", "0", "false", "no", "off")
            elif k == "ident":
                csv2rdf.IDENT = eval(v)
            elif k == "label":
                csv2rdf.LABEL = eval(v)
            elif k == "delim":
                csv2rdf.DELIM = v
            elif k == "skip":
                csv2rdf.SKIP = int(v)
            elif k == "default":
                csv2rdf.DEFAULT = column(v)
            else:
                m = re.match(r"(col|prop)(\d+)$", k)
                if m and m.group(1) == "col":
                    csv2rdf.COLUMNS[int(m.group(2))] = column(v)
                elif m:
                    csv2rdf.PROPS[int(m.group(2))] = rdflib.URIRef(v)

    # Command line options; where both spellings are given the long one wins.
    out = given("-o", "--out")
    if out is not None:
        csv2rdf.OUT = codecs.open(out, "w", "utf-8")

    base = given("-b", "--base")
    if base is not None:
        csv2rdf.BASE = rdflib.Namespace(base)

    delim = given("-d", "--delim")
    if delim is not None:
        csv2rdf.DELIM = delim

    default = given("-D", "--default")
    if default is not None:
        csv2rdf.DEFAULT = column(default)

    propbase = given("-p", "--propbase")
    if propbase is not None:
        csv2rdf.PROPBASE = rdflib.Namespace(propbase)

    label = given("-l", "--label")
    if label is not None:
        csv2rdf.LABEL = eval(label)

    ident = given("-i", "--ident")
    if ident is not None:
        csv2rdf.IDENT = eval(ident)

    skip = given("-s", "--skip")
    if skip is not None:
        csv2rdf.SKIP = int(skip)

    class_ = given("-c", "--class")
    if class_ is not None:
        csv2rdf.CLASS = rdflib.URIRef(class_)

    for k, v in opts.items():
        # Match the number explicitly: a plain startswith("--prop") test
        # also catches --propbase and then fails on int("base").
        m = re.match(r"--(col|prop)(\d+)$", k)
        if not m:
            continue
        if m.group(1) == "col":
            csv2rdf.COLUMNS[int(m.group(2))] = column(v)
        else:
            csv2rdf.PROPS[int(m.group(2))] = rdflib.URIRef(v)

    if csv2rdf.CLASS and ("-C" in opts or "--defineclass" in opts):
        csv2rdf.DEFINECLASS = True

    csv2rdf.convert(
        csv_reader(fileinput.input(files), delimiter=csv2rdf.DELIM))


if __name__ == '__main__':
    main()