Mercurial > repos > shellac > sam_consensus_v3
comparison env/lib/python3.9/site-packages/rdflib/tools/csv2rdf.py @ 0:4f3585e2f14b draft default tip
"planemo upload commit 60cee0fc7c0cda8592644e1aad72851dec82c959"
| author | shellac | 
|---|---|
| date | Mon, 22 Mar 2021 18:12:50 +0000 | 
| parents | |
| children | 
   comparison
  equal
  deleted
  inserted
  replaced
| -1:000000000000 | 0:4f3585e2f14b | 
|---|---|
| 1 """ | |
| 2 A commandline tool for semi-automatically converting CSV to RDF | |
| 3 | |
| 4 try: ``csv2rdf --help`` | |
| 5 | |
| 6 """ | |
| 7 | |
| 8 from __future__ import print_function | |
| 9 | |
| 10 import sys | |
| 11 import re | |
| 12 import csv | |
| 13 import getopt | |
| 14 import fileinput | |
| 15 import codecs | |
| 16 import time | |
| 17 import datetime | |
| 18 import warnings | |
| 19 | |
| 20 | |
| 21 import rdflib | |
| 22 | |
| 23 from six.moves import configparser | |
| 24 from six.moves.urllib.parse import quote | |
| 25 from six import text_type | |
| 26 | |
| 27 from rdflib import RDF, RDFS | |
| 28 from rdflib.namespace import split_uri | |
| 29 | |
| 30 __all__ = ['CSV2RDF'] | |
| 31 | |
# Usage text printed by ``csv2rdf --help``.  A backslash at the end of a line
# is a string-literal line continuation, so those lines are joined in the
# printed output.
HELP = """
csv2rdf.py \
    -b <instance-base> \
    -p <property-base> \
    [-D <default>] \
    [-c <classname>] \
    [-i <identity column(s)>] \
    [-l <label columns>] \
    [-s <N>] [-o <output>] \
    [-f configfile] \
    [--col<N> <colspec>] \
    [--prop<N> <property>] \
    [-d <delim>] \
    [-C] [files...]

Reads csv files from stdin or given files
if -d is given, use this delimiter
if -s is given, skips N lines at the start
Creates a URI from the columns given to -i, or automatically by numbering if
none is given
Outputs RDFS labels from the columns given to -l
if -c is given adds a type triple with the given classname
if -C is given, the class is defined as rdfs:Class
Outputs one RDF triple per column in each row.
Output is in n3 format.
Output is stdout, unless -o is specified

Long options also supported: \
    --base, \
    --propbase, \
    --ident, \
    --class, \
    --label, \
    --out, \
    --delim, \
    --skip, \
    --defineclass, \
    --help

Long options --col0, --col1, ...
can be used to specify conversion for columns.
Conversions can be:
    ignore, float(), int(), split(sep, [more]), uri(base, [class]), date(format)

Long options --prop0, --prop1, ...
can be used to use specific properties, rather than ones auto-generated
from the headers

-D sets the default conversion for columns not listed

-f says to read config from a .ini/config file - the file must contain one
section called csv2rdf, with keys like the long options, i.e.:

[csv2rdf]
out=output.n3
base=http://example.org/
col0=split(";")
col1=split(";", uri("http://example.org/things/",
                    "http://xmlns.com/foaf/0.1/Person"))
col2=float()
col3=int()
col4=date("%Y-%b-%d %H:%M:%S")

"""

# bah - ugly global
# Maps each raw cell value turned into a URI by prefixuri() to a
# (URIRef, class) pair; CSV2RDF.convert() reads it after the rows are done to
# emit labels and types for the generated URIs.
uris = {}
| 96 | |
| 97 | |
| 98 def toProperty(label): | |
| 99 """ | |
| 100 CamelCase + lowercase inital a string | |
| 101 | |
| 102 | |
| 103 FIRST_NM => firstNm | |
| 104 | |
| 105 firstNm => firstNm | |
| 106 | |
| 107 """ | |
| 108 label = re.sub("[^\w]", " ", label) | |
| 109 label = re.sub("([a-z])([A-Z])", "\\1 \\2", label) | |
| 110 label = label.split(" ") | |
| 111 return "".join([label[0].lower()] + [x.capitalize() for x in label[1:]]) | |
| 112 | |
| 113 | |
def toPropertyLabel(label):
    """Lower-case the first character of label.

    The label is returned untouched when its second character is upper-case.
    """
    if label[1:2].isupper():
        return label
    head, tail = label[:1], label[1:]
    return head.lower() + tail
| 118 | |
| 119 | |
def index(l, i):
    """Pick the items of l found at the positions listed in i, as a tuple.

    >>> index([1,2,3],(0,2))
    (1, 3)
    """
    return tuple(l[position] for position in i)
| 126 | |
| 127 | |
def csv_reader(csv_data, dialect=csv.excel, **kwargs):
    """Yield each row of csv_data as a list of text cells.

    csv_data, dialect and any extra keyword arguments (e.g. ``delimiter``)
    are passed straight to ``csv.reader``.

    Byte-string cells are decoded as UTF-8, with undecodable bytes replaced.
    Cells that are already text are passed through unchanged; trying to
    decode them a second time raises TypeError on Python 3.
    """
    for row in csv.reader(csv_data, dialect=dialect, **kwargs):
        yield [
            cell.decode("utf-8", "replace") if isinstance(cell, bytes)
            else cell
            for cell in row]
| 135 | |
| 136 | |
def prefixuri(x, prefix, class_=None):
    """Build a URIRef for the cell value x and record it in ``uris``.

    With a truthy prefix, spaces in x are replaced by underscores, the
    result is percent-quoted with no safe characters and appended to the
    prefix.  Without a prefix, x itself is used as the URI.

    The (URIRef, class_) pair is stored in the module-level ``uris`` dict
    under the key x, and the URIRef is returned.
    """
    if prefix:
        # Replace spaces while x is still text: bytes.replace() with str
        # arguments raises TypeError on Python 3.
        r = rdflib.URIRef(
            prefix + quote(x.replace(" ", "_").encode("utf8"), safe=""))
    else:
        r = rdflib.URIRef(x)
    uris[x] = (r, class_)
    return r
| 146 | |
| 147 # meta-language for config | |
| 148 | |
| 149 | |
class NodeMaker(object):
    """Base column converter: turns a cell value into an rdflib.Literal."""

    def __call__(self, x):
        """Return x wrapped in an rdflib.Literal."""
        return rdflib.Literal(x)

    def range(self):
        """Return the term describing the values this converter produces."""
        return rdflib.RDFS.Literal
| 156 | |
| 157 | |
class NodeUri(NodeMaker):
    """Converter that turns a cell value into a URI via prefixuri()."""

    def __init__(self, prefix, class_):
        """Keep the URI prefix and, when class_ is truthy, its URIRef."""
        self.prefix = prefix
        self.class_ = rdflib.URIRef(class_) if class_ else None

    def range(self):
        """Return the configured class, or rdf:Resource when there is none."""
        return self.class_ or rdflib.RDF.Resource

    def __call__(self, x):
        """Return the URIRef that prefixuri() builds for x."""
        return prefixuri(x, self.prefix, self.class_)
| 171 | |
| 172 | |
class NodeLiteral(NodeMaker):
    """Literal converter that carries one optional argument, f."""

    def __init__(self, f=None):
        # Stored as given; the subclasses decide how f is used.
        self.f = f
| 176 | |
| 177 | |
class NodeFloat(NodeLiteral):
    """Converter producing float literals, optionally pre-processed by f."""

    def range(self):
        """Return xsd:double."""
        return rdflib.XSD.double

    def __call__(self, x):
        """Return float(x), or float(f(x)) when f is set, as a Literal."""
        if self.f:
            if not callable(self.f):
                raise Exception("Function passed to float is not callable")
            x = self.f(x)
        return rdflib.Literal(float(x))
| 188 | |
| 189 | |
class NodeInt(NodeLiteral):
    """Converter producing int literals, optionally pre-processed by f."""

    def range(self):
        """Return xsd:int."""
        return rdflib.XSD.int

    def __call__(self, x):
        """Return int(x), or int(f(x)) when f is set, as a Literal."""
        if self.f:
            if not callable(self.f):
                raise Exception("Function passed to int is not callable")
            x = self.f(x)
        return rdflib.Literal(int(x))
| 200 | |
| 201 | |
class NodeBool(NodeLiteral):
    """Converter producing boolean literals, optionally pre-processed by f.

    Text values are read as words: "", "0", "false" and "no" (ignoring case
    and surrounding whitespace) are False, any other text is True.  Plain
    bool() would make every non-empty cell True, including "false" and "0".
    Non-text values returned by f fall back to bool().
    """

    # Spellings of a text value that are read as False.
    _FALSE_WORDS = frozenset(["", "0", "false", "no"])

    def __call__(self, x):
        """Return the truth value of x (or of f(x) when f is set)."""
        if self.f:
            if not callable(self.f):
                raise Exception("Function passed to bool is not callable")
            x = self.f(x)
        return rdflib.Literal(self._truth(x))

    @classmethod
    def _truth(cls, value):
        """Map a text word, or any other object, to True/False."""
        if isinstance(value, text_type):
            return value.strip().lower() not in cls._FALSE_WORDS
        return bool(value)

    def range(self):
        """Return xsd:boolean (XML Schema has no datatype named "bool")."""
        return rdflib.XSD.boolean
| 212 | |
| 213 | |
class NodeReplace(NodeMaker):
    """Helper that substitutes one substring for another in a cell value."""

    def __init__(self, a, b):
        # a is the text searched for, b is what it is replaced with
        self.a, self.b = a, b

    def __call__(self, x):
        """Return x with each occurrence of a replaced by b."""
        replaced = x.replace(self.a, self.b)
        return replaced
| 221 | |
| 222 | |
class NodeDate(NodeLiteral):
    """Converter producing datetime literals; f is the strptime format."""

    def range(self):
        """Return xsd:dateTime."""
        return rdflib.XSD.dateTime

    def __call__(self, x):
        """Parse x with the stored format and wrap it in a Literal."""
        parsed = datetime.datetime.strptime(x, self.f)
        return rdflib.Literal(parsed)
| 229 | |
| 230 | |
class NodeSplit(NodeMaker):
    """Converter that splits a cell on sep and converts every piece with f."""

    def __init__(self, sep, f):
        self.sep, self.f = sep, f

    def __call__(self, x):
        """Return the converted, non-empty, stripped pieces of x as a list."""
        if not self.f:
            # no converter given: fall back to plain literals from now on
            self.f = rdflib.Literal
        if not callable(self.f):
            raise Exception("Function passed to split is not callable!")
        nodes = []
        for piece in x.split(self.sep):
            piece = piece.strip()
            if piece != "":
                nodes.append(self.f(piece))
        return nodes

    def range(self):
        """Return the range of f when f is a NodeMaker, else the base range."""
        delegate = self.f
        if not (delegate and isinstance(delegate, NodeMaker)):
            return NodeMaker.range(self)
        return delegate.range()
| 248 | |
| 249 | |
# Shared plain-literal converter; CSV2RDF.convert() asks it for the range of
# columns that have no converter of their own.
default_node_make = NodeMaker()
| 251 | |
| 252 | |
| 253 def _config_ignore(*args, **kwargs): | |
| 254 return "ignore" | |
| 255 | |
| 256 | |
def _config_uri(prefix=None, class_=None):
    """Config function ``uri``: build a NodeUri from prefix and class_."""
    return NodeUri(prefix=prefix, class_=class_)
| 259 | |
| 260 | |
def _config_literal():
    """Config function ``literal``: build a NodeLiteral with no argument."""
    node = NodeLiteral()
    return node
| 263 | |
| 264 | |
def _config_float(f=None):
    """Config function ``float``: build a NodeFloat around the optional f."""
    return NodeFloat(f=f)
| 267 | |
| 268 | |
def _config_replace(a, b):
    """Config function ``replace``: build a NodeReplace swapping a for b."""
    return NodeReplace(a=a, b=b)
| 271 | |
| 272 | |
def _config_int(f=None):
    """Config function ``int``: build a NodeInt around the optional f."""
    return NodeInt(f=f)
| 275 | |
| 276 | |
def _config_bool(f=None):
    """Config function ``bool``: build a NodeBool around the optional f."""
    return NodeBool(f=f)
| 279 | |
| 280 | |
def _config_date(format_):
    """Config function ``date``: build a NodeDate holding format_."""
    return NodeDate(f=format_)
| 283 | |
| 284 | |
def _config_split(sep=None, f=None):
    """Config function ``split``: build a NodeSplit from sep and f."""
    return NodeSplit(sep=sep, f=f)
| 287 | |
| 288 | |
# Names that a column specification may call; column() evaluates the
# specification with this mapping as its globals.
config_functions = dict(
    ignore=_config_ignore,
    uri=_config_uri,
    literal=_config_literal,
    float=_config_float,
    int=_config_int,
    date=_config_date,
    split=_config_split,
    replace=_config_replace,
    bool=_config_bool,
)
| 299 | |
| 300 | |
def column(v):
    """Return a function for column mapping

    v is a column specification such as ``split(";")`` or ``float()``.  It is
    evaluated as a Python expression in which the names from
    ``config_functions`` are available, and the result is returned.
    """
    # SECURITY: eval() runs arbitrary Python, so only pass specifications
    # from a trusted command line or config file.
    # A copy is passed because eval() inserts "__builtins__" into the globals
    # dict it is given, which would otherwise pollute the shared mapping.
    return eval(v, dict(config_functions))
| 305 | |
| 306 | |
class CSV2RDF(object):
    """Convert rows from a CSV reader into N3 triples written to ``OUT``.

    The upper-case attributes are the settings that main() fills in from
    the command line or config file.  Run the conversion with convert().
    """

    def __init__(self):

        self.CLASS = None          # class URI added as rdf:type of each row
        self.BASE = None           # namespace for the row (instance) URIs
        self.PROPBASE = None       # namespace for generated properties
        self.IDENT = 'auto'        # 'auto' or the column(s) forming the URI
        self.LABEL = None          # column(s) joined into an rdfs:label
        self.DEFINECLASS = False   # also emit class/property definitions
        self.SKIP = 0              # lines to skip before the header line
        self.DELIM = ","
        self.DEFAULT = None        # converter for columns not in COLUMNS

        self.COLUMNS = {}          # column number -> converter or 'ignore'
        self.PROPS = {}            # column number -> property URI override

        # The UTF-8 writer emits bytes.  On Python 3 sys.stdout is a text
        # stream, so wrap its binary buffer; Python 2 has no such attribute
        # and the stream itself is wrapped.
        self.OUT = codecs.getwriter("utf-8")(
            getattr(sys.stdout, "buffer", sys.stdout), errors='replace')

        self.triples = 0

    @staticmethod
    def _as_columns(spec):
        """Normalise a column spec (number, list or tuple) to a tuple."""
        if isinstance(spec, tuple):
            return spec
        if isinstance(spec, list):
            return tuple(spec)
        return (spec,)

    def triple(self, s, p, o):
        """Write one triple in N3 syntax to OUT and count it."""
        self.OUT.write("%s %s %s .\n" % (s.n3(), p.n3(), o.n3()))
        self.triples += 1

    def convert(self, csvreader):
        """Read csvreader to the end and write the resulting triples.

        csvreader is an iterator of rows, each a list of text cells.  After
        SKIP rows are discarded, the next row is the header.  Every later
        row becomes one resource with one triple per non-empty,
        non-ignored cell.  OUT is closed at the end, and progress and
        totals are reported on stderr.
        """

        start = time.time()

        if self.OUT:
            sys.stderr.write("Output to %s\n" % self.OUT.name)

        if self.IDENT != "auto":
            self.IDENT = self._as_columns(self.IDENT)

        # Normalised like IDENT, so a single column number works; a bare 0
        # used to be skipped because it is falsy.
        if self.LABEL is not None:
            self.LABEL = self._as_columns(self.LABEL)

        if not self.BASE:
            warnings.warn("No base given, using http://example.org/instances/")
            self.BASE = rdflib.Namespace("http://example.org/instances/")

        if not self.PROPBASE:
            warnings.warn(
                "No property base given, using http://example.org/props/")
            self.PROPBASE = rdflib.Namespace("http://example.org/props/")

        # skip lines at the start
        for x in range(self.SKIP):
            next(csvreader)

        # read header line
        header_labels = list(next(csvreader))
        headers = dict(
            enumerate([self.PROPBASE[toProperty(x)] for x in header_labels]))
        # override header properties if some are given
        for k, v in self.PROPS.items():
            headers[k] = v
            header_labels[k] = split_uri(v)[1]

        if self.DEFINECLASS:
            # output class/property definitions
            self.triple(self.CLASS, RDF.type, RDFS.Class)
            for i in range(len(headers)):
                h, l = headers[i], header_labels[i]
                if h == "" or l == "":
                    continue
                maker = self.COLUMNS.get(i, self.DEFAULT)
                if maker == 'ignore':
                    continue
                self.triple(h, RDF.type, RDF.Property)
                self.triple(h, RDFS.label, rdflib.Literal(toPropertyLabel(l)))
                self.triple(h, RDFS.domain, self.CLASS)
                self.triple(h, RDFS.range,
                            (maker or default_node_make).range())

        rows = 0
        for row in csvreader:
            try:
                if self.IDENT == 'auto':
                    uri = self.BASE["%d" % rows]
                else:
                    # Replace spaces before encoding: bytes.replace() with
                    # str arguments raises TypeError on Python 3.
                    uri = self.BASE["_".join(
                        [quote(x.replace(" ", "_").encode("utf8"), safe="")
                         for x in index(row, self.IDENT)])]

                if self.LABEL:
                    self.triple(uri, RDFS.label, rdflib.Literal(
                        " ".join(index(row, self.LABEL))))

                if self.CLASS:
                    # type triple
                    self.triple(uri, RDF.type, self.CLASS)

                for i, x in enumerate(row):
                    x = x.strip()
                    if x == '':
                        continue
                    # DEFAULT is the converter for every column not listed
                    # in COLUMNS; plain literals are the last resort.
                    maker = self.COLUMNS.get(i, self.DEFAULT)
                    if maker == 'ignore':
                        continue
                    if maker is None:
                        maker = rdflib.Literal
                    try:
                        o = maker(x)
                        if isinstance(o, list):
                            for _o in o:
                                self.triple(uri, headers[i], _o)
                        else:
                            self.triple(uri, headers[i], o)

                    except Exception as e:
                        # headers.get(): a row longer than the header has
                        # no entry, and failing again here would hide the
                        # original problem.
                        warnings.warn(
                            "Could not process value for column " +
                            "%d:%s in row %d, ignoring: %s " % (
                                i, headers.get(i), rows, e))

                rows += 1
                if rows % 100000 == 0:
                    sys.stderr.write(
                        "%d rows, %d triples, elapsed %.2fs.\n" % (
                            rows, self.triples, time.time() - start))
            except Exception:
                sys.stderr.write("Error processing line: %d\n" % rows)
                raise

        # output types/labels for generated URIs
        classes = set()
        for l, x in uris.items():
            u, c = x
            self.triple(u, RDFS.label, rdflib.Literal(l))
            if c:
                c = rdflib.URIRef(c)
                classes.add(c)
                self.triple(u, RDF.type, c)

        for c in classes:
            self.triple(c, RDF.type, RDFS.Class)

        self.OUT.close()
        sys.stderr.write(
            "Converted %d rows into %d triples.\n" % (rows, self.triples))
        sys.stderr.write("Took %.2f seconds.\n" % (time.time() - start))
| 441 | |
| 442 | |
def main():
    """Command-line entry point: configure a CSV2RDF and run the conversion.

    Settings come first from the config file given with -f (section
    ``csv2rdf``), then from command-line options, which override the file.
    When an option is given in both its short and long form, the long form
    wins.  The remaining arguments are the input files; with none, stdin is
    read.
    """
    csv2rdf = CSV2RDF()

    argv = sys.argv[1:]
    long_options = [
        "out=", "base=", "delim=", "propbase=", "class=", "default=",
        "ident=", "label=", "skip=", "defineclass", "help"]
    # --col<N> / --prop<N> are open-ended, so getopt only accepts them when
    # each one used on this command line is registered explicitly.
    for arg in argv:
        numbered = re.match(r"--((?:col|prop)\d+)(?:=|$)", arg)
        if numbered and numbered.group(1) + "=" not in long_options:
            long_options.append(numbered.group(1) + "=")

    opts, files = getopt.getopt(argv, "hc:b:p:i:o:Cf:l:s:d:D:", long_options)
    opts = dict(opts)

    if "-h" in opts or "--help" in opts:
        print(HELP)
        sys.exit(-1)

    def option(short, long_):
        """Return the value given for an option, or None if it is absent."""
        return opts.get(long_, opts.get(short))

    if "-f" in opts:
        config = configparser.ConfigParser()
        # read_file() replaces the deprecated readfp(); fall back for
        # parsers that only have the old name.
        read_file = getattr(config, "read_file", None) or config.readfp
        with open(opts["-f"]) as config_file:
            read_file(config_file)
        # raw=True: values such as date("%Y-%b-%d") contain "%" and must not
        # go through configparser's interpolation.
        for k, v in config.items("csv2rdf", raw=True):
            if k == "out":
                csv2rdf.OUT = codecs.open(v, "w", "utf-8")
            elif k == "base":
                csv2rdf.BASE = rdflib.Namespace(v)
            elif k == "propbase":
                csv2rdf.PROPBASE = rdflib.Namespace(v)
            elif k == "class":
                csv2rdf.CLASS = rdflib.URIRef(v)
            elif k == "defineclass":
                # bool("false") is True, so read the word instead
                csv2rdf.DEFINECLASS = v.strip().lower() not in (
                    "", "0", "false", "no", "off")
            elif k == "ident":
                # SECURITY: eval() of config text - trusted files only
                csv2rdf.IDENT = eval(v)
            elif k == "label":
                # SECURITY: eval() of config text - trusted files only
                csv2rdf.LABEL = eval(v)
            elif k == "delim":
                csv2rdf.DELIM = v
            elif k == "skip":
                csv2rdf.SKIP = int(v)
            elif k == "default":
                csv2rdf.DEFAULT = column(v)
            elif re.match(r"col\d+$", k):
                csv2rdf.COLUMNS[int(k[3:])] = column(v)
            elif re.match(r"prop\d+$", k):
                csv2rdf.PROPS[int(k[4:])] = rdflib.URIRef(v)

    value = option("-o", "--out")
    if value is not None:
        csv2rdf.OUT = codecs.open(value, "w", "utf-8")

    value = option("-b", "--base")
    if value is not None:
        csv2rdf.BASE = rdflib.Namespace(value)

    value = option("-d", "--delim")
    if value is not None:
        csv2rdf.DELIM = value

    value = option("-D", "--default")
    if value is not None:
        csv2rdf.DEFAULT = column(value)

    value = option("-p", "--propbase")
    if value is not None:
        csv2rdf.PROPBASE = rdflib.Namespace(value)

    # SECURITY: -l and -i are evaluated as Python expressions (e.g. "0" or
    # "(0, 1)"); only use them with trusted input.
    value = option("-l", "--label")
    if value is not None:
        csv2rdf.LABEL = eval(value)

    value = option("-i", "--ident")
    if value is not None:
        csv2rdf.IDENT = eval(value)

    value = option("-s", "--skip")
    if value is not None:
        csv2rdf.SKIP = int(value)

    value = option("-c", "--class")
    if value is not None:
        csv2rdf.CLASS = rdflib.URIRef(value)

    for k, v in opts.items():
        # Match the numbered options exactly: a plain prefix test would also
        # catch --propbase and fail on int("base").
        numbered = re.match(r"--(col|prop)(\d+)$", k)
        if not numbered:
            continue
        kind, number = numbered.group(1), int(numbered.group(2))
        if kind == "col":
            csv2rdf.COLUMNS[number] = column(v)
        else:
            csv2rdf.PROPS[number] = rdflib.URIRef(v)

    if csv2rdf.CLASS and ("-C" in opts or "--defineclass" in opts):
        csv2rdf.DEFINECLASS = True

    csv2rdf.convert(
        csv_reader(fileinput.input(files), delimiter=csv2rdf.DELIM))
| 543 | |
# Run the converter when this file is executed as a script.
if __name__ == "__main__":
    main()
