diff env/lib/python3.7/site-packages/rdflib/tools/csv2rdf.py @ 5:9b1c78e6ba9c draft default tip

"planemo upload commit 6c0a8142489327ece472c84e558c47da711a9142"
author shellac
date Mon, 01 Jun 2020 08:59:25 -0400
parents 79f47841a781
children
line wrap: on
line diff
--- a/env/lib/python3.7/site-packages/rdflib/tools/csv2rdf.py	Thu May 14 16:47:39 2020 -0400
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,511 +0,0 @@
-"""
-A commandline tool for semi-automatically converting CSV to RDF
-
-try: ``csv2rdf --help``
-
-"""
-
-
-import sys
-import re
-import csv
-import getopt
-import configparser
-import fileinput
-import codecs
-import time
-import datetime
-import warnings
-import urllib.request, urllib.error, urllib.parse
-
-import rdflib
-
-from rdflib import RDF, RDFS
-from rdflib.namespace import split_uri
-
-__all__ = ['CSV2RDF']
-
-# Usage text printed by ``main`` for -h/--help.  A backslash at the end of a
-# line is a string-literal line continuation, so every backslash-joined run
-# below is emitted as one long line.
-HELP = """
-csv2rdf.py \
-    -b <instance-base> \
-    -p <property-base> \
-    [-c <classname>] \
-    [-i <identity column(s)>] \
-    [-l <label columns>] \
-    [-s <N>] [-o <output>] \
-    [-f configfile] \
-    [--col<N> <colspec>] \
-    [--prop<N> <property>] \
-    [-d <delim>] \
-    [-C] [files...]
-
-Reads csv files from stdin or given files
-if -d is given, use this delimiter
-if -s is given, skips N lines at the start
-Creates a URI from the columns given to -i, or automatically by numbering if
-none is given
-Outputs RDFS labels from the columns given to -l
-if -c is given adds a type triple with the given classname
-if -C is given, the class is defined as rdfs:Class
-Outputs one RDF triple per column in each row.
-Output is in n3 format.
-Output is stdout, unless -o is specified
-
-Long options also supported: \
-    --base, \
-    --propbase, \
-    --ident, \
-    --class, \
-    --label, \
-    --skip, \
-    --delim, \
-    --out, \
-    --defineclass, \
-    --help
-
-Long options --col0, --col1, ...
-can be used to specify conversion for columns.
-Conversions can be:
-    float(), int(), split(sep, [more]), uri(base, [class]), date(format)
-
-Long options --prop0, --prop1, ...
-can be used to use specific properties, rather than ones auto-generated
-from the headers
-
--f says to read config from a .ini/config file - the file must contain one
-section called csv2rdf, with keys like the long options, i.e.:
-
-[csv2rdf]
-out=output.n3
-base=http://example.org/
-col0=split(";")
-col1=split(";", uri("http://example.org/things/",
-                    "http://xmlns.com/foaf/0.1/Person"))
-col2=float()
-col3=int()
-col4=date("%Y-%b-%d %H:%M:%S")
-
-"""
-
-# bah - ugly global
-# Maps each raw cell value handed to ``prefixuri`` to a ``(URIRef, class_)``
-# pair.  ``CSV2RDF.convert`` walks it after the rows to emit a label (and a
-# type, when a class was given) for every generated URI.  Being module-level,
-# its entries persist for the life of the process.
-uris = {}
-
-
-def toProperty(label):
-    """
-    Turn a column header into a lower-camel-case property name.
-
-    Every non-word character becomes a word break, as does every
-    lowercase-to-uppercase boundary.  The first word is lowercased and each
-    following word is capitalised.  The underscore is a word character, so
-    it is kept and does not start a new word.
-
-    FIRST NM => firstNm
-
-    firstNm => firstNm
-
-    FIRST_NM => first_nm
-
-    """
-    # Raw strings: a backslash-w in a plain literal is an invalid escape
-    # sequence and only works by accident.
-    label = re.sub(r"[^\w]", " ", label)
-    label = re.sub(r"([a-z])([A-Z])", r"\1 \2", label)
-    label = label.split(" ")
-    return "".join([label[0].lower()] + [x.capitalize() for x in label[1:]])
-
-
-def toPropertyLabel(label):
-    """
-    Lowercase the first character of ``label`` unless the second character
-    is uppercase, in which case the label is returned untouched.
-    """
-    second = label[1:2]
-    if second.isupper():
-        return label
-    head, tail = label[0:1], label[1:]
-    return head.lower() + tail
-
-
-def index(l, i):
-    """pick the items of a list found at the given positions, as a tuple
-    >>> index([1,2,3],(0,2))
-    (1, 3)
-    """
-    picked = []
-    for position in i:
-        picked.append(l[position])
-    return tuple(picked)
-
-
-def csv_reader(csv_data, dialect=csv.excel, **kwargs):
-    """
-    Yield the rows of ``csv_data`` as lists of strings.
-
-    ``csv_data`` is any iterable of text lines accepted by ``csv.reader``;
-    ``dialect`` and the extra keyword arguments (for instance
-    ``delimiter``) are passed straight through to it.
-    """
-    reader = csv.reader(csv_data, dialect=dialect, **kwargs)
-    for row in reader:
-        # The cells are already text.  Decoding them again with
-        # str(cell, 'utf-8') raises TypeError for a str argument.
-        yield row
-
-
-def prefixuri(x, prefix, class_=None):
-    """
-    Build the URIRef for cell value ``x`` and record it in ``uris``.
-
-    With a non-empty ``prefix`` the value has its spaces replaced by
-    underscores, is percent-encoded with no character left safe, and is
-    appended to the prefix.  Without a prefix the value itself is used as
-    the URI.  The ``(URIRef, class_)`` pair is stored under ``uris[x]``.
-
-    Returns the URIRef.
-    """
-    if prefix:
-        # quote() accepts text and percent-encodes its UTF-8 bytes itself.
-        # Encoding first and then calling replace() with str arguments
-        # raises TypeError on bytes.
-        r = rdflib.URIRef(
-            prefix + urllib.parse.quote(x.replace(" ", "_"), safe=""))
-    else:
-        r = rdflib.URIRef(x)
-    uris[x] = (r, class_)
-    return r
-
-# meta-language for config
-
-
-class NodeMaker(object):
-    """Base column converter: turns a cell into a plain literal."""
-
-    def __call__(self, x):
-        """Wrap the cell value ``x`` in an ``rdflib.Literal``."""
-        node = rdflib.Literal(x)
-        return node
-
-    def range(self):
-        """Return the term reported as the range of the column's property."""
-        return rdflib.RDFS.Literal
-
-
-class NodeUri(NodeMaker):
-    """
-    Column converter that turns a cell into a URI via ``prefixuri``.
-
-    ``prefix`` is prepended to the encoded cell value; ``class_``, when
-    given, is stored as a URIRef and passed to ``prefixuri`` with every
-    generated URI.
-    """
-
-    def __init__(self, prefix, class_):
-        self.prefix = prefix
-        if class_:
-            self.class_ = rdflib.URIRef(class_)
-        else:
-            self.class_ = None
-
-    def __call__(self, x):
-        """Return the URIRef built from cell value ``x``."""
-        return prefixuri(x, self.prefix, self.class_)
-
-    def range(self):
-        """Return the configured class, or rdfs:Resource when there is none."""
-        # rdfs:Resource is the RDF Schema class of everything; the RDF
-        # vocabulary itself defines no "Resource" term.
-        return self.class_ or rdflib.RDFS.Resource
-
-
-class NodeLiteral(NodeMaker):
-    # Base for the literal converters.  Stores one optional argument ``f``
-    # whose meaning is left to the subclass: NodeFloat and NodeInt call it on
-    # the cell before converting, NodeDate uses it as the strptime format.
-    def __init__(self, f=None):
-        self.f = f
-
-
-class NodeFloat(NodeLiteral):
-    """Converter producing float literals, optionally pre-processing the cell."""
-
-    def __call__(self, x):
-        """Return ``x`` (or ``self.f(x)`` when ``f`` is set) as a float literal."""
-        prepare = self.f
-        if prepare and not callable(prepare):
-            raise Exception("Function passed to float is not callable")
-        raw = prepare(x) if prepare else x
-        return rdflib.Literal(float(raw))
-
-    def range(self):
-        """Return xsd:double."""
-        return rdflib.XSD.double
-
-
-class NodeInt(NodeLiteral):
-    """Converter producing integer literals, optionally pre-processing the cell."""
-
-    def __call__(self, x):
-        """Return ``x`` (or ``self.f(x)`` when ``f`` is set) as an int literal."""
-        prepare = self.f
-        if prepare and not callable(prepare):
-            raise Exception("Function passed to int is not callable")
-        raw = prepare(x) if prepare else x
-        return rdflib.Literal(int(raw))
-
-    def range(self):
-        """Return xsd:int."""
-        return rdflib.XSD.int
-
-
-class NodeReplace(NodeMaker):
-    """
-    Text substitution step: replaces every ``a`` in the cell with ``b``.
-
-    The result is a plain string rather than an RDF node.
-    NOTE(review): presumably meant to be nested inside float()/int(), which
-    call their argument on the cell first - confirm before using it alone.
-    """
-
-    def __init__(self, a, b):
-        self.a, self.b = a, b
-
-    def __call__(self, x):
-        old, new = self.a, self.b
-        return x.replace(old, new)
-
-
-class NodeDate(NodeLiteral):
-    """Converter producing date-time literals; ``f`` is the strptime format."""
-
-    def __call__(self, x):
-        """Parse ``x`` with the stored format and wrap the datetime in a literal."""
-        parsed = datetime.datetime.strptime(x, self.f)
-        return rdflib.Literal(parsed)
-
-    def range(self):
-        """Return xsd:dateTime."""
-        return rdflib.XSD.dateTime
-
-
-class NodeSplit(NodeMaker):
-    """
-    Converter that splits a cell and converts each piece on its own.
-
-    ``sep`` is handed to ``str.split``.  ``f`` converts every stripped,
-    non-empty piece and defaults to ``rdflib.Literal`` when not given.
-    """
-
-    def __init__(self, sep, f):
-        self.sep, self.f = sep, f
-
-    def __call__(self, x):
-        """Return the list of nodes made from the pieces of ``x``."""
-        if not self.f:
-            # Store the default so that later calls find it already set.
-            self.f = rdflib.Literal
-        convert = self.f
-        if not callable(convert):
-            raise Exception("Function passed to split is not callable!")
-        nodes = []
-        for piece in x.split(self.sep):
-            piece = piece.strip()
-            if piece != "":
-                nodes.append(convert(piece))
-        return nodes
-
-    def range(self):
-        """Return the inner converter's range, or the NodeMaker default."""
-        inner = self.f
-        if inner and isinstance(inner, NodeMaker):
-            return inner.range()
-        return NodeMaker.range(self)
-
-# Fallback converter: ``CSV2RDF.convert`` asks it for the range of columns
-# that have no explicit mapping in COLUMNS.
-default_node_make = NodeMaker()
-
-
-def _config_ignore(*args, **kwargs):
-    """
-    Marker for columns to leave out; accepts and discards any arguments.
-
-    ``CSV2RDF.convert`` skips a column whose mapping is this function object
-    itself, so the returned string is not what triggers the skip.
-    """
-    marker = "ignore"
-    return marker
-
-
-def _config_uri(prefix=None, class_=None):
-    """Colspec ``uri(prefix, class_)``: build a NodeUri converter."""
-    maker = NodeUri(prefix=prefix, class_=class_)
-    return maker
-
-
-def _config_literal():
-    """
-    Colspec ``literal()``: build a converter that emits plain literals.
-
-    Returns a NodeLiteral *instance*.  Returning the class itself would make
-    ``convert`` instantiate it with the cell value instead of converting
-    the cell, and would leave ``range()`` without an instance to run on.
-    """
-    return NodeLiteral()
-
-
-def _config_float(f=None):
-    """Colspec ``float(f)``: build a NodeFloat converter."""
-    maker = NodeFloat(f=f)
-    return maker
-
-
-def _config_replace(a, b):
-    """Colspec ``replace(a, b)``: build a NodeReplace step."""
-    maker = NodeReplace(a=a, b=b)
-    return maker
-
-
-def _config_int(f=None):
-    """Colspec ``int(f)``: build a NodeInt converter."""
-    maker = NodeInt(f=f)
-    return maker
-
-
-def _config_date(format_):
-    """Colspec ``date(format)``: build a NodeDate using that strptime format."""
-    maker = NodeDate(f=format_)
-    return maker
-
-
-def _config_split(sep=None, f=None):
-    """Colspec ``split(sep, f)``: build a NodeSplit converter."""
-    maker = NodeSplit(sep=sep, f=f)
-    return maker
-
-# Names available to the expressions evaluated by ``column``: each key is
-# what a colspec may reference, each value the factory behind it.
-config_functions = {"ignore": _config_ignore,
-                    "uri": _config_uri,
-                    "literal": _config_literal,
-                    "float": _config_float,
-                    "int": _config_int,
-                    "date": _config_date,
-                    "split": _config_split,
-                    "replace": _config_replace
-                    }
-
-
-def column(v):
-    """Return a function for column mapping
-
-    ``v`` is a colspec expression such as ``split(";", uri("http://x/"))``;
-    it is evaluated with the entries of ``config_functions`` as its global
-    names and the resulting object is returned.
-    """
-    # SECURITY: eval() runs arbitrary Python.  ``v`` comes from the command
-    # line or a config file, so only use this with input you trust.
-    #
-    # Evaluate against a copy: eval() inserts "__builtins__" into the
-    # globals mapping it is given, which would otherwise end up in the
-    # shared config_functions dict.
-    return eval(v, dict(config_functions))
-
-
-class CSV2RDF(object):
-    """
-    Convert the rows of a CSV reader into triples written to ``OUT``.
-
-    Configure an instance through its upper-case attributes, then call
-    ``convert`` with an iterator of rows (lists of strings).  After ``SKIP``
-    skipped rows, the next row is read as the column headers.
-    """
-
-    def __init__(self):
-
-        # URIRef used as the type of every row, or None for no type triple
-        self.CLASS = None
-        # namespace for row URIs; convert() substitutes an example.org one
-        self.BASE = None
-        # namespace for generated properties; same fallback rule
-        self.PROPBASE = None
-        # 'auto' numbers the rows, otherwise the column index(es) whose
-        # values make up the row URI
-        self.IDENT = 'auto'
-        # column index(es) joined into an rdfs:label, or None
-        self.LABEL = None
-        # when true, class and property definitions are written first
-        self.DEFINECLASS = False
-        # number of rows to drop before the header row
-        self.SKIP = 0
-        self.DELIM = ","
-
-        # column index -> converter (see ``column``)
-        self.COLUMNS = {}
-        # column index -> property URIRef overriding the header-derived one
-        self.PROPS = {}
-
-        # A codecs writer produces bytes, so it has to sit on the binary
-        # layer of stdout; a text stream rejects bytes with TypeError.
-        binary_stdout = getattr(sys.stdout, "buffer", None)
-        if binary_stdout is None:
-            # stdout was replaced by a text-only stream: write str directly
-            self.OUT = sys.stdout
-        else:
-            self.OUT = codecs.getwriter("utf-8")(
-                binary_stdout, errors='replace')
-
-        self.triples = 0
-
-    @staticmethod
-    def _as_columns(spec):
-        """Return a column spec (one index or several) as a tuple of indexes."""
-        if isinstance(spec, (list, tuple)):
-            return tuple(spec)
-        return (spec,)
-
-    def triple(self, s, p, o):
-        """Write one ``s p o .`` line to OUT and count it."""
-        self.OUT.write("%s %s %s .\n" % (s.n3(), p.n3(), o.n3()))
-        self.triples += 1
-
-    def convert(self, csvreader):
-        """
-        Read every row from ``csvreader`` and write the resulting triples.
-
-        Progress and totals are reported on stderr, and OUT is closed at the
-        end.  A cell that cannot be converted is reported with a warning and
-        skipped.  Any other error while handling a row is reported with the
-        row number and re-raised.
-        """
-
-        start = time.time()
-
-        if self.OUT:
-            sys.stderr.write(
-                "Output to %s\n" % getattr(self.OUT, "name", self.OUT))
-
-        if self.IDENT != "auto":
-            self.IDENT = self._as_columns(self.IDENT)
-
-        # Same normalisation for LABEL: a bare index such as 0 would be
-        # falsy (no labels at all) and is not iterable for index().
-        if self.LABEL is not None:
-            self.LABEL = self._as_columns(self.LABEL)
-
-        if not self.BASE:
-            warnings.warn("No base given, using http://example.org/instances/")
-            self.BASE = rdflib.Namespace("http://example.org/instances/")
-
-        if not self.PROPBASE:
-            # the message names the namespace that is actually used
-            warnings.warn(
-                "No property base given, using http://example.org/props/")
-            self.PROPBASE = rdflib.Namespace("http://example.org/props/")
-
-        # skip lines at the start
-        for x in range(self.SKIP):
-            next(csvreader)
-
-        # read header line
-        header_labels = list(next(csvreader))
-        headers = dict(
-            enumerate([self.PROPBASE[toProperty(x)] for x in header_labels]))
-        # override header properties if some are given
-        for k, v in self.PROPS.items():
-            headers[k] = v
-            header_labels[k] = split_uri(v)[1]
-
-        if self.DEFINECLASS:
-            # output class/property definitions
-            self.triple(self.CLASS, RDF.type, RDFS.Class)
-            for i, l in enumerate(header_labels):
-                h = headers[i]
-                if h == "" or l == "":
-                    continue
-                if self.COLUMNS.get(i) == _config_ignore:
-                    continue
-                self.triple(h, RDF.type, RDF.Property)
-                self.triple(h, RDFS.label, rdflib.Literal(toPropertyLabel(l)))
-                self.triple(h, RDFS.domain, self.CLASS)
-                self.triple(h, RDFS.range,
-                            self.COLUMNS.get(i, default_node_make).range())
-
-        rows = 0
-        for l in csvreader:
-            try:
-                if self.IDENT == 'auto':
-                    uri = self.BASE["%d" % rows]
-                else:
-                    # quote() percent-encodes the UTF-8 form of the text
-                    # itself; no manual encode() beforehand
-                    uri = self.BASE["_".join(
-                        urllib.parse.quote(x.replace(" ", "_"), safe="")
-                        for x in index(l, self.IDENT))]
-
-                if self.LABEL:
-                    self.triple(uri, RDFS.label, rdflib.Literal(
-                        " ".join(index(l, self.LABEL))))
-
-                if self.CLASS:
-                    # type triple
-                    self.triple(uri, RDF.type, self.CLASS)
-
-                for i, x in enumerate(l):
-                    x = x.strip()
-                    if x == '':
-                        continue
-                    if self.COLUMNS.get(i) == _config_ignore:
-                        continue
-                    try:
-                        o = self.COLUMNS.get(i, rdflib.Literal)(x)
-                        if isinstance(o, list):
-                            for _o in o:
-                                self.triple(uri, headers[i], _o)
-                        else:
-                            self.triple(uri, headers[i], o)
-
-                    except Exception as e:
-                        # headers.get: a row longer than the header row must
-                        # not raise again while the warning is being built.
-                        # Exceptions have no .message attribute; format the
-                        # exception itself.
-                        warnings.warn(
-                            "Could not process value for column "
-                            "%d:%s in row %d, ignoring: %s " % (
-                                i, headers.get(i), rows, e))
-
-                rows += 1
-                if rows % 100000 == 0:
-                    sys.stderr.write(
-                        "%d rows, %d triples, elapsed %.2fs.\n" % (
-                            rows, self.triples, time.time() - start))
-            except Exception:
-                sys.stderr.write("Error processing line: %d\n" % rows)
-                raise
-
-        # output types/labels for generated URIs
-        classes = set()
-        for l, x in uris.items():
-            u, c = x
-            self.triple(u, RDFS.label, rdflib.Literal(l))
-            if c:
-                c = rdflib.URIRef(c)
-                classes.add(c)
-                self.triple(u, RDF.type, c)
-
-        for c in classes:
-            self.triple(c, RDF.type, RDFS.Class)
-
-        self.OUT.close()
-        sys.stderr.write(
-            "Converted %d rows into %d triples.\n" % (rows, self.triples))
-        sys.stderr.write("Took %.2f seconds.\n" % (time.time() - start))
-
-
-def main():
-    """
-    Command-line entry point.
-
-    Builds a CSV2RDF, applies the settings of a ``-f`` config file first and
-    the command-line options after it (so options win, and a long option
-    wins over its short form), then converts the given files, or stdin when
-    none are given.  ``-h``/``--help`` prints HELP and exits with status -1.
-    """
-    csv2rdf = CSV2RDF()
-
-    longopts = ["out=", "base=", "delim=", "propbase=", "class=",
-                "ident=", "label=", "skip=", "defineclass", "help"]
-    # --col<N> / --prop<N> are open-ended, so getopt cannot know them up
-    # front: register the ones that appear on this command line.
-    for arg in sys.argv[1:]:
-        numbered = re.match(r"--((?:col|prop)\d+)(?:=|$)", arg)
-        if numbered and numbered.group(1) + "=" not in longopts:
-            longopts.append(numbered.group(1) + "=")
-
-    opts, files = getopt.getopt(
-        sys.argv[1:], "hc:b:p:i:o:Cf:l:s:d:", longopts)
-    opts = dict(opts)
-
-    if "-h" in opts or "--help" in opts:
-        print(HELP)
-        sys.exit(-1)
-
-    def given(short, long_):
-        """Return an option's value (long form preferred) or None if absent."""
-        return opts.get(long_, opts.get(short))
-
-    # SECURITY: the ident/label values below (from the config file and from
-    # the command line) are passed to eval(), which runs arbitrary Python.
-    # Only use this tool with arguments and config files you trust.
-
-    if "-f" in opts:
-        config = configparser.ConfigParser()
-        # read_file() replaces the deprecated readfp(); the with-block
-        # closes the file again.
-        with open(opts["-f"]) as config_file:
-            config.read_file(config_file)
-        try:
-            items = config.items("csv2rdf")
-        except configparser.InterpolationError:
-            # A bare "%" (e.g. in a date format) is not valid interpolation
-            # syntax; fall back to the values exactly as written.
-            items = config.items("csv2rdf", raw=True)
-        for k, v in items:
-            if k == "out":
-                csv2rdf.OUT = codecs.open(v, "w", "utf-8")
-            elif k == "base":
-                csv2rdf.BASE = rdflib.Namespace(v)
-            elif k == "propbase":
-                csv2rdf.PROPBASE = rdflib.Namespace(v)
-            elif k == "class":
-                csv2rdf.CLASS = rdflib.URIRef(v)
-            elif k == "defineclass":
-                # bool("false") is True, so test the text instead
-                csv2rdf.DEFINECLASS = v.strip().lower() not in (
-                    "", "0", "false", "no", "off")
-            elif k == "ident":
-                csv2rdf.IDENT = eval(v)
-            elif k == "label":
-                csv2rdf.LABEL = eval(v)
-            elif k == "delim":
-                csv2rdf.DELIM = v
-            elif k == "skip":
-                csv2rdf.SKIP = int(v)
-            elif k.startswith("col"):
-                csv2rdf.COLUMNS[int(k[3:])] = column(v)
-            elif k.startswith("prop"):
-                csv2rdf.PROPS[int(k[4:])] = rdflib.URIRef(v)
-
-    out = given("-o", "--out")
-    if out is not None:
-        csv2rdf.OUT = codecs.open(out, "w", "utf-8")
-
-    base = given("-b", "--base")
-    if base is not None:
-        csv2rdf.BASE = rdflib.Namespace(base)
-
-    delim = given("-d", "--delim")
-    if delim is not None:
-        csv2rdf.DELIM = delim
-
-    propbase = given("-p", "--propbase")
-    if propbase is not None:
-        csv2rdf.PROPBASE = rdflib.Namespace(propbase)
-
-    label = given("-l", "--label")
-    if label is not None:
-        csv2rdf.LABEL = eval(label)
-
-    ident = given("-i", "--ident")
-    if ident is not None:
-        csv2rdf.IDENT = eval(ident)
-
-    skip = given("-s", "--skip")
-    if skip is not None:
-        csv2rdf.SKIP = int(skip)
-
-    class_ = given("-c", "--class")
-    if class_ is not None:
-        csv2rdf.CLASS = rdflib.URIRef(class_)
-
-    for k, v in opts.items():
-        if k.startswith("--col"):
-            csv2rdf.COLUMNS[int(k[5:])] = column(v)
-        elif k.startswith("--prop"):
-            csv2rdf.PROPS[int(k[6:])] = rdflib.URIRef(v)
-
-    if csv2rdf.CLASS and ("-C" in opts or "--defineclass" in opts):
-        csv2rdf.DEFINECLASS = True
-
-    csv2rdf.convert(
-        csv_reader(fileinput.input(files), delimiter=csv2rdf.DELIM))
-
-
-# Run the converter only when this file is executed as a script.
-if __name__ == '__main__':
-    main()