comparison env/lib/python3.9/site-packages/rdflib/tools/csv2rdf.py @ 0:4f3585e2f14b draft default tip

"planemo upload commit 60cee0fc7c0cda8592644e1aad72851dec82c959"
author shellac
date Mon, 22 Mar 2021 18:12:50 +0000
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:4f3585e2f14b
1 """
2 A commandline tool for semi-automatically converting CSV to RDF
3
4 try: ``csv2rdf --help``
5
6 """
7
8 from __future__ import print_function
9
10 import sys
11 import re
12 import csv
13 import getopt
14 import fileinput
15 import codecs
16 import time
17 import datetime
18 import warnings
19
20
21 import rdflib
22
23 from six.moves import configparser
24 from six.moves.urllib.parse import quote
25 from six import text_type
26
27 from rdflib import RDF, RDFS
28 from rdflib.namespace import split_uri
29
30 __all__ = ['CSV2RDF']
31
# Usage text printed by ``-h`` / ``--help``.  A backslash at the end of a line
# inside the literal joins it with the next line, so the synopsis and the
# long-option list each render as a single line.
HELP = """
csv2rdf.py \
    -b <instance-base> \
    -p <property-base> \
    [-D <default>] \
    [-c <classname>] \
    [-i <identity column(s)>] \
    [-l <label columns>] \
    [-s <N>] [-o <output>] \
    [-f configfile] \
    [--col<N> <colspec>] \
    [--prop<N> <property>] \
    [-d <delim>] \
    [-C] [files...]

Reads csv files from stdin or given files
if -d is given, use this delimiter
if -s is given, skips N lines at the start
Creates a URI from the columns given to -i, or automatically by numbering if
none is given
Outputs RDFS labels from the columns given to -l
if -c is given adds a type triple with the given classname
if -C is given, the class is defined as rdfs:Class
Outputs one RDF triple per column in each row.
Output is in n3 format.
Output is stdout, unless -o is specified

Long options also supported: \
    --base, \
    --propbase, \
    --ident, \
    --class, \
    --label, \
    --out, \
    --delim, \
    --skip, \
    --default, \
    --defineclass

Long options --col0, --col1, ...
can be used to specify conversion for columns.
Conversions can be:
    ignore, float(), int(), split(sep, [more]), uri(base, [class]), date(format)

Long options --prop0, --prop1, ...
can be used to use specific properties, rather than ones auto-generated
from the headers

-D sets the default conversion for columns not listed

-f says to read config from a .ini/config file - the file must contain one
section called csv2rdf, with keys like the long options, i.e.:

[csv2rdf]
out=output.n3
base=http://example.org/
col0=split(";")
col1=split(";", uri("http://example.org/things/",
        "http://xmlns.com/foaf/0.1/Person"))
col2=float()
col3=int()
col4=date("%Y-%b-%d %H:%M:%S")

"""
93
94 # bah - ugly global
95 uris = {}
96
97
98 def toProperty(label):
99 """
100 CamelCase + lowercase inital a string
101
102
103 FIRST_NM => firstNm
104
105 firstNm => firstNm
106
107 """
108 label = re.sub("[^\w]", " ", label)
109 label = re.sub("([a-z])([A-Z])", "\\1 \\2", label)
110 label = label.split(" ")
111 return "".join([label[0].lower()] + [x.capitalize() for x in label[1:]])
112
113
def toPropertyLabel(label):
    """Lower-case the first character of *label* unless the second one is
    upper case (so that labels starting with an acronym are left alone)."""
    second = label[1:2]
    if second.isupper():
        return label
    return label[:1].lower() + label[1:]
118
119
def index(l, i):
    """Pick the items of ``l`` found at the positions listed in ``i``.

    The result is a tuple, in the order given by ``i``.
    >>> index([1,2,3],(0,2))
    (1, 3)
    """
    picked = []
    for position in i:
        picked.append(l[position])
    return tuple(picked)
126
127
def csv_reader(csv_data, dialect=csv.excel, **kwargs):
    """Yield the rows of *csv_data* as lists of text cells.

    :param csv_data: iterable of lines, handed to :func:`csv.reader`
    :param dialect: csv dialect, defaults to :class:`csv.excel`
    :param kwargs: extra formatting parameters for :func:`csv.reader`
        (e.g. ``delimiter``)
    """
    reader = csv.reader(csv_data, dialect=dialect, **kwargs)
    for row in reader:
        # Only byte strings need decoding.  On Python 3 the csv module already
        # yields text, and calling str(cell, 'utf-8') on text raises
        # "TypeError: decoding str is not supported".
        yield [
            cell.decode("utf-8", "replace") if isinstance(cell, bytes) else cell
            for cell in row
        ]
135
136
def prefixuri(x, prefix, class_=None):
    """Build a URIRef for the cell value *x* and record it in ``uris``.

    With a truthy *prefix* the value has its spaces turned into underscores,
    is UTF-8 encoded and percent-quoted (no safe characters), and is appended
    to the prefix.  Without a prefix the value itself is used as the URI.

    The resulting ``(URIRef, class_)`` pair is stored in the module-level
    ``uris`` dict under the key *x*.

    :return: the generated :class:`rdflib.URIRef`
    """
    if prefix:
        # Replace the spaces while the value is still text: on Python 3,
        # bytes.replace() does not accept str arguments.
        local = quote(x.replace(" ", "_").encode("utf8"), safe="")
        r = rdflib.URIRef(prefix + local)
    else:
        r = rdflib.URIRef(x)
    uris[x] = (r, class_)
    return r
146
147 # meta-language for config
148
149
class NodeMaker(object):
    """Base column converter: wraps a cell value in a plain rdflib Literal."""

    def __call__(self, x):
        """Convert the cell value *x* into an RDF node."""
        return rdflib.Literal(x)

    def range(self):
        """Return the value used as rdfs:range for columns of this kind."""
        return rdflib.RDFS.Literal
156
157
class NodeUri(NodeMaker):
    """Column converter that turns cell values into URIs via ``prefixuri``.

    :param prefix: string put in front of the quoted cell value; when falsy
        the cell value itself is used as the URI
    :param class_: optional class URI; stored as a URIRef and passed to
        ``prefixuri`` alongside each generated URI
    """

    def __init__(self, prefix, class_):
        self.prefix = prefix
        if class_:
            self.class_ = rdflib.URIRef(class_)
        else:
            self.class_ = None

    def __call__(self, x):
        return prefixuri(x, self.prefix, self.class_)

    def range(self):
        # The generic "any resource" class is rdfs:Resource; the RDF
        # vocabulary has no term named Resource.
        return self.class_ or rdflib.RDFS.Resource
171
172
class NodeLiteral(NodeMaker):
    """Literal-producing converter that stores one optional argument ``f``.

    Subclasses decide what ``f`` means.
    """

    def __init__(self, f=None):
        # Kept verbatim; no validation happens at construction time.
        self.f = f
176
177
class NodeFloat(NodeLiteral):
    """Converter producing float literals, optionally pre-processing the cell
    with the callable stored in ``self.f``."""

    def __call__(self, x):
        pre = self.f
        if pre:
            if not callable(pre):
                raise Exception("Function passed to float is not callable")
            x = pre(x)
        return rdflib.Literal(float(x))

    def range(self):
        return rdflib.XSD.double
188
189
class NodeInt(NodeLiteral):
    """Converter producing integer literals, optionally pre-processing the
    cell with the callable stored in ``self.f``."""

    def __call__(self, x):
        pre = self.f
        if pre:
            if not callable(pre):
                raise Exception("Function passed to int is not callable")
            x = pre(x)
        return rdflib.Literal(int(x))

    def range(self):
        return rdflib.XSD.int
200
201
class NodeBool(NodeLiteral):
    """Converter producing boolean literals, optionally pre-processing the
    cell with the callable stored in ``self.f``.

    NOTE(review): the value is passed through ``bool()``, so any non-empty
    string (including ``"false"`` or ``"0"``) becomes true unless a
    pre-processing function maps it to a falsy value first.
    """

    def __call__(self, x):
        if not self.f:
            return rdflib.Literal(bool(x))
        if callable(self.f):
            return rdflib.Literal(bool(self.f(x)))
        raise Exception("Function passed to bool is not callable")

    def range(self):
        # The XML Schema datatype is named "boolean"; there is no xsd:bool.
        return rdflib.XSD.boolean
212
213
class NodeReplace(NodeMaker):
    """Callable that substitutes ``a`` with ``b`` in a cell value.

    The result of calling it is the value returned by ``x.replace`` itself,
    not an rdflib node.
    """

    def __init__(self, a, b):
        self.a, self.b = a, b

    def __call__(self, x):
        old, new = self.a, self.b
        return x.replace(old, new)
221
222
class NodeDate(NodeLiteral):
    """Converter that parses cells with ``strptime``; ``self.f`` holds the
    format string."""

    def __call__(self, x):
        parsed = datetime.datetime.strptime(x, self.f)
        return rdflib.Literal(parsed)

    def range(self):
        return rdflib.XSD.dateTime
229
230
class NodeSplit(NodeMaker):
    """Converter that splits a cell on ``sep`` and converts every non-empty,
    stripped piece with ``f``."""

    def __init__(self, sep, f):
        self.sep, self.f = sep, f

    def __call__(self, x):
        # A missing converter is replaced (and remembered) on first use.
        if not self.f:
            self.f = rdflib.Literal
        if not callable(self.f):
            raise Exception("Function passed to split is not callable!")
        nodes = []
        for piece in x.split(self.sep):
            if piece.strip() != "":
                nodes.append(self.f(piece.strip()))
        return nodes

    def range(self):
        # Delegate to the inner converter only when it is a NodeMaker;
        # anything else falls back to the base-class range.
        inner = self.f
        if not (inner and isinstance(inner, NodeMaker)):
            return NodeMaker.range(self)
        return inner.range()
248
249
250 default_node_make = NodeMaker()
251
252
253 def _config_ignore(*args, **kwargs):
254 return "ignore"
255
256
def _config_uri(prefix=None, class_=None):
    """Config function ``uri(prefix, class_)``: build a NodeUri converter."""
    maker = NodeUri(prefix, class_)
    return maker
259
260
def _config_literal():
    """Config function ``literal()``: build a NodeLiteral converter."""
    maker = NodeLiteral()
    return maker
263
264
def _config_float(f=None):
    """Config function ``float(f)``: build a NodeFloat converter."""
    maker = NodeFloat(f)
    return maker
267
268
def _config_replace(a, b):
    """Config function ``replace(a, b)``: build a NodeReplace callable."""
    maker = NodeReplace(a, b)
    return maker
271
272
def _config_int(f=None):
    """Config function ``int(f)``: build a NodeInt converter."""
    maker = NodeInt(f)
    return maker
275
276
def _config_bool(f=None):
    """Config function ``bool(f)``: build a NodeBool converter."""
    maker = NodeBool(f)
    return maker
279
280
def _config_date(format_):
    """Config function ``date(format)``: build a NodeDate converter that
    keeps *format_* as its strptime format."""
    maker = NodeDate(format_)
    return maker
283
284
def _config_split(sep=None, f=None):
    """Config function ``split(sep, f)``: build a NodeSplit converter."""
    maker = NodeSplit(sep, f)
    return maker
287
288
# Names available inside a column specification; ``column`` evaluates the
# specification text with this mapping as its global namespace.
config_functions = dict(
    ignore=_config_ignore,
    uri=_config_uri,
    literal=_config_literal,
    float=_config_float,
    int=_config_int,
    date=_config_date,
    split=_config_split,
    replace=_config_replace,
    bool=_config_bool,
)
299
300
def column(v):
    """Return a function for column mapping

    *v* is a column specification such as ``split(";")`` or ``int()``.  It is
    evaluated as a Python expression in which the names of
    ``config_functions`` are available.
    """
    # SECURITY NOTE: eval() runs arbitrary Python taken from the command line
    # or the config file, so only specifications from a trusted source may be
    # passed in.
    # A copy is used as the namespace because eval() inserts a
    # "__builtins__" entry into the globals dict it is given; the shared
    # module-level table stays untouched this way.
    return eval(v, dict(config_functions))
305
306
class CSV2RDF(object):
    """Converter that reads rows from a CSV reader and writes triples.

    The upper-case attributes are the settings; ``main`` fills them in from
    the command line and/or a config file before calling :meth:`convert`.
    """

    def __init__(self):

        self.CLASS = None  # class URI added as rdf:type of every row
        self.BASE = None  # namespace for row URIs
        self.PROPBASE = None  # namespace for properties built from headers
        self.IDENT = 'auto'  # 'auto' (row number) or column index/indexes
        self.LABEL = None  # column index/indexes joined into rdfs:label
        self.DEFINECLASS = False  # also emit class/property definitions
        self.SKIP = 0  # number of rows skipped before the header row
        self.DELIM = ","
        self.DEFAULT = None  # conversion for columns not in COLUMNS

        self.COLUMNS = {}  # column index -> converter or "ignore"
        self.PROPS = {}  # column index -> property URI override

        # The codec writer emits encoded bytes, so on Python 3 it has to wrap
        # the binary buffer underneath sys.stdout; Python 2's sys.stdout has
        # no such attribute and is wrapped directly.
        self.OUT = codecs.getwriter("utf-8")(
            getattr(sys.stdout, "buffer", sys.stdout), errors='replace')

        self.triples = 0  # number of triples written so far

    def triple(self, s, p, o):
        """Write one ``s p o .`` line (n3 form of each term) to ``self.OUT``
        and count it."""
        self.OUT.write("%s %s %s .\n" % (s.n3(), p.n3(), o.n3()))
        self.triples += 1

    def convert(self, csvreader):
        """Convert every row of *csvreader* and close the output.

        *csvreader* is an iterator of rows (lists of text cells).  After
        ``self.SKIP`` rows the next row is taken as the header; every later
        row yields one triple per non-empty, non-ignored cell.  Progress and
        a summary are written to stderr.
        """

        start = time.time()

        if self.OUT:
            sys.stderr.write("Output to %s\n" % self.OUT.name)

        if self.IDENT != "auto" and not isinstance(self.IDENT, tuple):
            self.IDENT = (self.IDENT,)

        # A single label column given as a bare integer is normalised the same
        # way; otherwise column 0 would be treated as "no label" and any other
        # integer could not be iterated.
        if isinstance(self.LABEL, int):
            self.LABEL = (self.LABEL,)

        if not self.BASE:
            warnings.warn("No base given, using http://example.org/instances/")
            self.BASE = rdflib.Namespace("http://example.org/instances/")

        if not self.PROPBASE:
            # The message names the namespace that is actually used below.
            warnings.warn(
                "No property base given, using http://example.org/props/")
            self.PROPBASE = rdflib.Namespace("http://example.org/props/")

        # skip lines at the start
        for _ in range(self.SKIP):
            next(csvreader)

        # read header line (next() works on Python 2 and 3, .next() does not)
        header_labels = list(next(csvreader))
        headers = dict(
            enumerate([self.PROPBASE[toProperty(x)] for x in header_labels]))
        # override header properties if some are given
        for k, v in self.PROPS.items():
            headers[k] = v
            header_labels[k] = split_uri(v)[1]

        if self.DEFINECLASS:
            # output class/property definitions
            self.triple(self.CLASS, RDF.type, RDFS.Class)
            for i in range(len(headers)):
                h, label = headers[i], header_labels[i]
                if h == "" or label == "":
                    continue
                # -D / DEFAULT applies to every column without its own entry.
                maker = self.COLUMNS.get(
                    i, self.DEFAULT or default_node_make)
                if maker == 'ignore':
                    continue
                self.triple(h, RDF.type, RDF.Property)
                self.triple(
                    h, RDFS.label, rdflib.Literal(toPropertyLabel(label)))
                self.triple(h, RDFS.domain, self.CLASS)
                self.triple(h, RDFS.range, maker.range())

        rows = 0
        for row in csvreader:
            try:
                if self.IDENT == 'auto':
                    uri = self.BASE["%d" % rows]
                else:
                    # Spaces are replaced before encoding: bytes.replace()
                    # rejects str arguments on Python 3.
                    uri = self.BASE["_".join([
                        quote(x.replace(" ", "_").encode("utf8"), safe="")
                        for x in index(row, self.IDENT)])]

                if self.LABEL:
                    self.triple(uri, RDFS.label, rdflib.Literal(
                        " ".join(index(row, self.LABEL))))

                if self.CLASS:
                    # type triple
                    self.triple(uri, RDF.type, self.CLASS)

                for i, x in enumerate(row):
                    x = x.strip()
                    if x == '':
                        continue
                    # Per-column converter, else the -D default, else a plain
                    # literal.
                    maker = self.COLUMNS.get(
                        i, self.DEFAULT or rdflib.Literal)
                    if maker == 'ignore':
                        continue
                    try:
                        o = maker(x)
                        if isinstance(o, list):
                            for _o in o:
                                self.triple(uri, headers[i], _o)
                        else:
                            self.triple(uri, headers[i], o)

                    except Exception as e:
                        # Exceptions have no ``.message`` on Python 3; format
                        # the exception itself.  ``.get`` keeps the warning
                        # from failing when the row is longer than the header.
                        warnings.warn(
                            "Could not process value for column " +
                            "%d:%s in row %d, ignoring: %s " % (
                                i, headers.get(i), rows, e))

                rows += 1
                if rows % 100000 == 0:
                    sys.stderr.write(
                        "%d rows, %d triples, elapsed %.2fs.\n" % (
                            rows, self.triples, time.time() - start))
            except Exception:
                sys.stderr.write("Error processing line: %d\n" % rows)
                raise

        # output types/labels for generated URIs
        classes = set()
        for label, (u, c) in uris.items():
            self.triple(u, RDFS.label, rdflib.Literal(label))
            if c:
                c = rdflib.URIRef(c)
                classes.add(c)
                self.triple(u, RDF.type, c)

        for c in classes:
            self.triple(c, RDF.type, RDFS.Class)

        self.OUT.close()
        sys.stderr.write(
            "Converted %d rows into %d triples.\n" % (rows, self.triples))
        sys.stderr.write("Took %.2f seconds.\n" % (time.time() - start))
441
442
def main():
    """Command-line entry point: parse options, configure a CSV2RDF instance
    and convert the given files (or stdin).

    Settings from a ``-f`` config file are applied first; command-line
    options are applied afterwards and therefore override them.
    """
    csv2rdf = CSV2RDF()
    argv = sys.argv[1:]

    long_options = [
        "out=", "base=", "delim=", "propbase=", "class=", "default=",
        "ident=", "label=", "skip=", "defineclass", "help"]
    # getopt only accepts long options it has been told about, so every
    # --col<N> / --prop<N> found on the command line is registered first.
    numbered = re.compile(r"--((?:col|prop)\d+)(?:=|$)")
    for arg in argv:
        found = numbered.match(arg)
        if found and found.group(1) + "=" not in long_options:
            long_options.append(found.group(1) + "=")

    opts, files = getopt.getopt(argv, "hc:b:p:i:o:Cf:l:s:d:D:", long_options)
    opts = dict(opts)

    if "-h" in opts or "--help" in opts:
        print(HELP)
        sys.exit(-1)

    # SECURITY NOTE: the ident/label values below go through eval(), and
    # column specifications go through column(), which also evaluates them as
    # Python.  Config files and command lines must come from a trusted source.

    if "-f" in opts:
        config = configparser.ConfigParser()
        with open(opts["-f"]) as fh:
            # read_file() replaces the deprecated readfp(); the latter is kept
            # as a fallback for parsers that predate read_file().
            read_file = getattr(config, "read_file", None) or config.readfp
            read_file(fh)
        # raw=True: values such as date("%Y-%b-%d") contain '%' and must not
        # be run through configparser's interpolation.
        for k, v in config.items("csv2rdf", raw=True):
            if k == "out":
                csv2rdf.OUT = codecs.open(v, "w", "utf-8")
            elif k == "base":
                csv2rdf.BASE = rdflib.Namespace(v)
            elif k == "propbase":
                csv2rdf.PROPBASE = rdflib.Namespace(v)
            elif k == "class":
                csv2rdf.CLASS = rdflib.URIRef(v)
            elif k == "defineclass":
                # bool("false") is True, so the usual "off" spellings are
                # recognised explicitly; anything else still enables it.
                csv2rdf.DEFINECLASS = v.strip().lower() not in (
                    "", "0", "false", "no", "off")
            elif k == "ident":
                csv2rdf.IDENT = eval(v)
            elif k == "label":
                csv2rdf.LABEL = eval(v)
            elif k == "delim":
                csv2rdf.DELIM = v
            elif k == "skip":
                csv2rdf.SKIP = int(v)
            elif k == "default":
                csv2rdf.DEFAULT = column(v)
            elif k.startswith("col"):
                csv2rdf.COLUMNS[int(k[3:])] = column(v)
            elif k.startswith("prop"):
                csv2rdf.PROPS[int(k[4:])] = rdflib.URIRef(v)

    def option(short, long_):
        # Value of an option given in either form, or None when absent.  The
        # long form wins when both are present.
        return opts.get(long_, opts.get(short))

    value = option("-o", "--out")
    if value is not None:
        csv2rdf.OUT = codecs.open(value, "w", "utf-8")

    value = option("-b", "--base")
    if value is not None:
        csv2rdf.BASE = rdflib.Namespace(value)

    value = option("-d", "--delim")
    if value is not None:
        csv2rdf.DELIM = value

    value = option("-D", "--default")
    if value is not None:
        csv2rdf.DEFAULT = column(value)

    value = option("-p", "--propbase")
    if value is not None:
        csv2rdf.PROPBASE = rdflib.Namespace(value)

    value = option("-l", "--label")
    if value is not None:
        csv2rdf.LABEL = eval(value)

    value = option("-i", "--ident")
    if value is not None:
        csv2rdf.IDENT = eval(value)

    value = option("-s", "--skip")
    if value is not None:
        csv2rdf.SKIP = int(value)

    value = option("-c", "--class")
    if value is not None:
        csv2rdf.CLASS = rdflib.URIRef(value)

    # Only names of the exact form --col<N> / --prop<N> are numbered options;
    # a plain prefix test would also catch --propbase.
    for k, v in opts.items():
        found = re.match(r"--(col|prop)(\d+)$", k)
        if not found:
            continue
        position = int(found.group(2))
        if found.group(1) == "col":
            csv2rdf.COLUMNS[position] = column(v)
        else:
            csv2rdf.PROPS[position] = rdflib.URIRef(v)

    if csv2rdf.CLASS and ("-C" in opts or "--defineclass" in opts):
        csv2rdf.DEFINECLASS = True

    csv2rdf.convert(
        csv_reader(fileinput.input(files), delimiter=csv2rdf.DELIM))
542
543
544 if __name__ == '__main__':
545 main()