Mercurial > repos > shellac > guppy_basecaller
diff env/lib/python3.7/site-packages/rdflib/tools/csv2rdf.py @ 5:9b1c78e6ba9c draft default tip
"planemo upload commit 6c0a8142489327ece472c84e558c47da711a9142"
| author | shellac |
|---|---|
| date | Mon, 01 Jun 2020 08:59:25 -0400 |
| parents | 79f47841a781 |
| children |
line wrap: on
line diff
"""
A commandline tool for semi-automatically converting CSV to RDF

try: ``csv2rdf --help``

"""


import sys
import re
import csv
import getopt
import configparser
import fileinput
import codecs
import time
import datetime
import warnings
import urllib.request
import urllib.error
import urllib.parse

import rdflib

from rdflib import RDF, RDFS
from rdflib.namespace import split_uri

__all__ = ['CSV2RDF']

HELP = """
csv2rdf.py \
    -b <instance-base> \
    -p <property-base> \
    [-c <classname>] \
    [-i <identity column(s)>] \
    [-l <label columns>] \
    [-s <N>] [-o <output>] \
    [-f configfile] \
    [--col<N> <colspec>] \
    [--prop<N> <property>] \
    [-d <delim>] \
    [-C] [files...]

Reads csv files from stdin or given files
if -d is given, use this delimiter
if -s is given, skips N lines at the start
Creates a URI from the columns given to -i, or automatically by numbering if
none is given
Outputs RDFS labels from the columns given to -l
if -c is given adds a type triple with the given classname
if -C is given, the class is defined as rdfs:Class
Outputs one RDF triple per column in each row.
Output is in n3 format.
Output is stdout, unless -o is specified

Long options also supported: \
    --base, \
    --propbase, \
    --ident, \
    --class, \
    --label, \
    --out, \
    --defineclass

Long options --col0, --col1, ...
can be used to specify conversion for columns.
Conversions can be:
    float(), int(), split(sep, [more]), uri(base, [class]), date(format)

Long options --prop0, --prop1, ...
can be used to use specific properties, rather than ones auto-generated
from the headers

-f says to read config from a .ini/config file - the file must contain one
section called csv2rdf, with keys like the long options, i.e.:

[csv2rdf]
out=output.n3
base=http://example.org/
col0=split(";")
col1=split(";", uri("http://example.org/things/",
    "http://xmlns.com/foaf/0.1/Person"))
col2=float()
col3=int()
col4=date("%Y-%b-%d %H:%M:%S")

"""

# Module-level registry of every value turned into a URI by prefixuri():
# maps the original cell text to (URIRef, class-or-None).  It is never
# cleared, so it accumulates across CSV2RDF.convert() calls.
uris = {}


def toProperty(label):
    """
    CamelCase + lowercase initial a string

    Runs of non-word characters become word boundaries, as do
    lower-to-upper case transitions.  Underscores count as word
    characters and are therefore kept.

    FIRST NM => firstNm

    firstNm => firstNm

    FIRST_NM => first_nm

    """
    label = re.sub(r"[^\w]", " ", label)
    label = re.sub("([a-z])([A-Z])", "\\1 \\2", label)
    words = label.split(" ")
    return "".join([words[0].lower()] + [w.capitalize() for w in words[1:]])


def toPropertyLabel(label):
    """Lowercase the first character unless the second one is uppercase
    (which suggests an acronym such as ``URL``)."""
    if not label[1:2].isupper():
        return label[0:1].lower() + label[1:]
    return label


def index(l, i):
    """return a set of indexes from a list
    >>> index([1,2,3],(0,2))
    (1, 3)
    """
    return tuple(l[x] for x in i)


def csv_reader(csv_data, dialect=csv.excel, **kwargs):
    """Yield the rows of ``csv_data`` as lists of ``str``.

    ``csv_data`` is any iterable of text lines accepted by ``csv.reader``;
    extra keyword arguments (e.g. ``delimiter``) are passed through.
    """
    # csv.reader already produces str cells on Python 3; the former
    # ``str(cell, 'utf-8')`` decode step raised TypeError on every cell.
    for row in csv.reader(csv_data, dialect=dialect, **kwargs):
        yield row


def prefixuri(x, prefix, class_=None):
    """Build a URIRef for the cell value ``x`` and record it in ``uris``.

    With a ``prefix`` the value is made URL-safe (spaces become ``_``, the
    rest is percent-encoded as UTF-8) and appended to the prefix; without
    one, ``x`` is used as the URI verbatim.
    """
    if prefix:
        # quote() encodes str as UTF-8 itself; encoding first would mix
        # bytes and str in .replace() and raise TypeError.
        r = rdflib.URIRef(
            prefix + urllib.parse.quote(x.replace(" ", "_"), safe=""))
    else:
        r = rdflib.URIRef(x)
    uris[x] = (r, class_)
    return r

# meta-language for config


class NodeMaker(object):
    """Base column converter: turns a cell into a plain literal."""

    def range(self):
        """Return the rdfs:range to declare for this column's property."""
        return rdflib.RDFS.Literal

    def __call__(self, x):
        return rdflib.Literal(x)


class NodeUri(NodeMaker):
    """Converter producing URIs (via ``prefixuri``), optionally typed."""

    def __init__(self, prefix, class_):
        self.prefix = prefix
        if class_:
            self.class_ = rdflib.URIRef(class_)
        else:
            self.class_ = None

    def __call__(self, x):
        return prefixuri(x, self.prefix, self.class_)

    def range(self):
        # The generic resource class is rdfs:Resource; the RDF namespace
        # has no ``Resource`` term.
        return self.class_ or rdflib.RDFS.Resource


class NodeLiteral(NodeMaker):
    """Literal converter holding an optional helper ``f`` (a
    pre-processing callable, or a format string for ``NodeDate``)."""

    def __init__(self, f=None):
        self.f = f


class NodeFloat(NodeLiteral):
    """Converter producing float literals, optionally pre-processed by f."""

    def __call__(self, x):
        if not self.f:
            return rdflib.Literal(float(x))
        if callable(self.f):
            return rdflib.Literal(float(self.f(x)))
        raise Exception("Function passed to float is not callable")

    def range(self):
        return rdflib.XSD.double


class NodeInt(NodeLiteral):
    """Converter producing int literals, optionally pre-processed by f."""

    def __call__(self, x):
        if not self.f:
            return rdflib.Literal(int(x))
        if callable(self.f):
            return rdflib.Literal(int(self.f(x)))
        raise Exception("Function passed to int is not callable")

    def range(self):
        return rdflib.XSD.int


class NodeReplace(NodeMaker):
    """String substitution step; returns a plain ``str``, so it is meant
    to be nested inside another converter, e.g. ``float(replace(",", "."))``.
    """

    def __init__(self, a, b):
        self.a = a
        self.b = b

    def __call__(self, x):
        return x.replace(self.a, self.b)


class NodeDate(NodeLiteral):
    """Converter parsing the cell with ``strptime`` using format ``f``."""

    def __call__(self, x):
        return rdflib.Literal(datetime.datetime.strptime(x, self.f))

    def range(self):
        return rdflib.XSD.dateTime


class NodeSplit(NodeMaker):
    """Converter splitting a cell on ``sep`` and converting each non-empty
    piece with ``f`` (plain literals when ``f`` is not given)."""

    def __init__(self, sep, f):
        self.sep = sep
        self.f = f

    def __call__(self, x):
        if not self.f:
            self.f = rdflib.Literal
        if not callable(self.f):
            raise Exception("Function passed to split is not callable!")
        pieces = (y.strip() for y in x.split(self.sep))
        return [self.f(piece) for piece in pieces if piece != ""]

    def range(self):
        if self.f and isinstance(self.f, NodeMaker):
            return self.f.range()
        return NodeMaker.range(self)

default_node_make = NodeMaker()


def _config_ignore(*args, **kwargs):
    return "ignore"


def _is_ignored(spec):
    """True when a column spec means "skip this column".

    ``ignore`` in a colspec evaluates to the function itself, ``ignore()``
    to the string it returns; both spellings are accepted.
    """
    return spec is _config_ignore or spec == "ignore"


def _config_uri(prefix=None, class_=None):
    return NodeUri(prefix, class_)


def _config_literal():
    # Return an instance: the bare class would be *constructed* per cell
    # instead of converting the cell into a literal.
    return NodeLiteral()


def _config_float(f=None):
    return NodeFloat(f)


def _config_replace(a, b):
    return NodeReplace(a, b)


def _config_int(f=None):
    return NodeInt(f)


def _config_date(format_):
    return NodeDate(format_)


def _config_split(sep=None, f=None):
    return NodeSplit(sep, f)

config_functions = {"ignore": _config_ignore,
                    "uri": _config_uri,
                    "literal": _config_literal,
                    "float": _config_float,
                    "int": _config_int,
                    "date": _config_date,
                    "split": _config_split,
                    "replace": _config_replace
                    }


def column(v):
    """Return a function for column mapping"""
    # SECURITY: ``v`` is evaluated as Python with the config functions as
    # globals (builtins remain reachable).  Only use colspecs from trusted
    # command lines / config files.
    return eval(v, config_functions)


class CSV2RDF(object):
    """Converts rows from a CSV reader into N3 triples written to ``OUT``.

    Configure by assigning the upper-case attributes, then call
    ``convert()`` with an iterator of rows (lists of str).
    """

    def __init__(self):

        self.CLASS = None        # URIRef used for the per-row type triple
        self.BASE = None         # Namespace for generated instance URIs
        self.PROPBASE = None     # Namespace for auto-generated properties
        self.IDENT = 'auto'      # 'auto' or column index/indexes for URIs
        self.LABEL = None        # column index/indexes for rdfs:label
        self.DEFINECLASS = False
        self.SKIP = 0            # number of leading lines to discard
        self.DELIM = ","

        self.COLUMNS = {}        # column index -> converter
        self.PROPS = {}          # column index -> property URIRef

        # sys.stdout is a text stream on Python 3; wrapping it in a codecs
        # byte writer makes every write() fail with TypeError.
        self.OUT = sys.stdout

        self.triples = 0

    def triple(self, s, p, o):
        """Write one triple in N3 syntax and count it."""
        self.OUT.write("%s %s %s .\n" % (s.n3(), p.n3(), o.n3()))
        self.triples += 1

    def convert(self, csvreader):
        """Consume ``csvreader`` and write the resulting triples to OUT.

        After ``SKIP`` discarded rows, the first row is the header and
        supplies the property names; every following row becomes one
        subject.  Progress and a summary are reported on stderr.
        """

        start = time.time()

        if self.OUT:
            # not every file-like object (e.g. StringIO) has a name
            sys.stderr.write(
                "Output to %s\n" % getattr(self.OUT, "name", self.OUT))

        if self.IDENT != "auto" and not isinstance(self.IDENT, (tuple, list)):
            self.IDENT = (self.IDENT,)

        # A single label column may be given as a bare int; wrap it so it
        # can be indexed, and so that column 0 is not treated as "unset".
        if self.LABEL is not None and \
                not isinstance(self.LABEL, (tuple, list)):
            self.LABEL = (self.LABEL,)

        if not self.BASE:
            warnings.warn("No base given, using http://example.org/instances/")
            self.BASE = rdflib.Namespace("http://example.org/instances/")

        if not self.PROPBASE:
            warnings.warn(
                "No property base given, using http://example.org/props/")
            self.PROPBASE = rdflib.Namespace("http://example.org/props/")

        # skip lines at the start
        for _ in range(self.SKIP):
            next(csvreader)

        # read header line
        header_labels = list(next(csvreader))
        headers = dict(
            enumerate([self.PROPBASE[toProperty(x)] for x in header_labels]))
        # override header properties if some are given
        for k, v in self.PROPS.items():
            headers[k] = v
            header_labels[k] = split_uri(v)[1]

        if self.DEFINECLASS and not self.CLASS:
            warnings.warn("defineclass requested but no class given, ignoring")
        elif self.DEFINECLASS:
            # output class/property definitions
            self.triple(self.CLASS, RDF.type, RDFS.Class)
            for i in range(len(headers)):
                h, label = headers[i], header_labels[i]
                if h == "" or label == "":
                    continue
                if _is_ignored(self.COLUMNS.get(i)):
                    continue
                self.triple(h, RDF.type, RDF.Property)
                self.triple(
                    h, RDFS.label, rdflib.Literal(toPropertyLabel(label)))
                self.triple(h, RDFS.domain, self.CLASS)
                self.triple(h, RDFS.range,
                            self.COLUMNS.get(i, default_node_make).range())

        rows = 0
        for row in csvreader:
            try:
                if self.IDENT == 'auto':
                    uri = self.BASE["%d" % rows]
                else:
                    uri = self.BASE["_".join(
                        urllib.parse.quote(x.replace(" ", "_"), safe="")
                        for x in index(row, self.IDENT))]

                if self.LABEL:
                    self.triple(uri, RDFS.label, rdflib.Literal(
                        " ".join(index(row, self.LABEL))))

                if self.CLASS:
                    # type triple
                    self.triple(uri, RDF.type, self.CLASS)

                for i, x in enumerate(row):
                    x = x.strip()
                    if x == '':
                        continue
                    if _is_ignored(self.COLUMNS.get(i)):
                        continue
                    try:
                        o = self.COLUMNS.get(i, rdflib.Literal)(x)
                        if isinstance(o, list):
                            for _o in o:
                                self.triple(uri, headers[i], _o)
                        else:
                            self.triple(uri, headers[i], o)

                    except Exception as e:
                        # Exceptions have no .message on Python 3;
                        # headers.get() keeps the report itself from
                        # failing on a row longer than the header.
                        warnings.warn(
                            "Could not process value for column " +
                            "%d:%s in row %d, ignoring: %s " % (
                                i, headers.get(i), rows, e))

                rows += 1
                if rows % 100000 == 0:
                    sys.stderr.write(
                        "%d rows, %d triples, elapsed %.2fs.\n" % (
                            rows, self.triples, time.time() - start))
            except Exception:
                sys.stderr.write("Error processing line: %d\n" % rows)
                raise

        # output types/labels for generated URIs
        classes = set()
        for text, (u, c) in uris.items():
            self.triple(u, RDFS.label, rdflib.Literal(text))
            if c:
                c = rdflib.URIRef(c)
                classes.add(c)
                self.triple(u, RDF.type, c)

        for c in classes:
            self.triple(c, RDF.type, RDFS.Class)

        # Never close the process-wide stdout; just flush it.
        if self.OUT is sys.stdout or self.OUT is sys.__stdout__:
            self.OUT.flush()
        else:
            self.OUT.close()
        sys.stderr.write(
            "Converted %d rows into %d triples.\n" % (rows, self.triples))
        sys.stderr.write("Took %.2f seconds.\n" % (time.time() - start))


def main():
    """Command line entry point; see ``HELP`` for the options.

    Settings from a ``-f`` config file are applied first and can then be
    overridden by command line options.
    """
    csv2rdf = CSV2RDF()

    args = sys.argv[1:]
    long_opts = ["out=", "base=", "delim=", "propbase=", "class=",
                 "ident=", "label=", "skip=", "defineclass", "help"]
    # getopt only accepts long options it was told about, so register the
    # open-ended --col<N> / --prop<N> families that appear on this
    # command line.
    for arg in args:
        m = re.match(r"--((?:col|prop)\d+)(?:=|$)", arg)
        if m and m.group(1) + "=" not in long_opts:
            long_opts.append(m.group(1) + "=")

    opts, files = getopt.getopt(args, "hc:b:p:i:o:Cf:l:s:d:", long_opts)
    opts = dict(opts)

    if "-h" in opts or "--help" in opts:
        print(HELP)
        sys.exit(-1)

    if "-f" in opts:
        config = configparser.ConfigParser()
        with open(opts["-f"]) as config_file:
            config.read_file(config_file)
        try:
            items = config.items("csv2rdf")
        except configparser.InterpolationError:
            # Values such as date("%Y-%m-%d") are not valid interpolation
            # syntax; fall back to the raw text for such files.
            items = config.items("csv2rdf", raw=True)
        for k, v in items:
            if k == "out":
                csv2rdf.OUT = open(v, "w", encoding="utf-8")
            elif k == "base":
                csv2rdf.BASE = rdflib.Namespace(v)
            elif k == "propbase":
                csv2rdf.PROPBASE = rdflib.Namespace(v)
            elif k == "class":
                csv2rdf.CLASS = rdflib.URIRef(v)
            elif k == "defineclass":
                # any value enables it, except explicit "off" spellings
                csv2rdf.DEFINECLASS = v.strip().lower() not in (
                    "", "0", "false", "no", "off")
            elif k == "ident":
                # SECURITY: evaluated as Python; trusted config only.
                csv2rdf.IDENT = eval(v)
            elif k == "label":
                # SECURITY: evaluated as Python; trusted config only.
                csv2rdf.LABEL = eval(v)
            elif k == "delim":
                csv2rdf.DELIM = v
            elif k == "skip":
                csv2rdf.SKIP = int(v)
            elif k.startswith("col"):
                csv2rdf.COLUMNS[int(k[3:])] = column(v)
            elif k.startswith("prop"):
                csv2rdf.PROPS[int(k[4:])] = rdflib.URIRef(v)

    if "-o" in opts:
        csv2rdf.OUT = open(opts["-o"], "w", encoding="utf-8")
    if "--out" in opts:
        csv2rdf.OUT = open(opts["--out"], "w", encoding="utf-8")

    if "-b" in opts:
        csv2rdf.BASE = rdflib.Namespace(opts["-b"])
    if "--base" in opts:
        csv2rdf.BASE = rdflib.Namespace(opts["--base"])

    if "-d" in opts:
        csv2rdf.DELIM = opts["-d"]
    if "--delim" in opts:
        csv2rdf.DELIM = opts["--delim"]

    if "-p" in opts:
        csv2rdf.PROPBASE = rdflib.Namespace(opts["-p"])
    if "--propbase" in opts:
        csv2rdf.PROPBASE = rdflib.Namespace(opts["--propbase"])

    # SECURITY: -l/-i values are evaluated as Python expressions (e.g.
    # "(0,1)"); only pass trusted input.
    if "-l" in opts:
        csv2rdf.LABEL = eval(opts["-l"])
    if "--label" in opts:
        csv2rdf.LABEL = eval(opts["--label"])

    if "-i" in opts:
        csv2rdf.IDENT = eval(opts["-i"])
    if "--ident" in opts:
        csv2rdf.IDENT = eval(opts["--ident"])

    if "-s" in opts:
        csv2rdf.SKIP = int(opts["-s"])
    if "--skip" in opts:
        csv2rdf.SKIP = int(opts["--skip"])

    if "-c" in opts:
        csv2rdf.CLASS = rdflib.URIRef(opts["-c"])
    if "--class" in opts:
        csv2rdf.CLASS = rdflib.URIRef(opts["--class"])

    # Match the numeric suffix explicitly: a plain startswith("--prop")
    # also catches --propbase and then fails on int("base").
    for k, v in opts.items():
        col = re.match(r"--col(\d+)$", k)
        prop = re.match(r"--prop(\d+)$", k)
        if col:
            csv2rdf.COLUMNS[int(col.group(1))] = column(v)
        elif prop:
            csv2rdf.PROPS[int(prop.group(1))] = rdflib.URIRef(v)

    if csv2rdf.CLASS and ("-C" in opts or "--defineclass" in opts):
        csv2rdf.DEFINECLASS = True

    csv2rdf.convert(
        csv_reader(fileinput.input(files), delimiter=csv2rdf.DELIM))


if __name__ == '__main__':
    main()
