comparison env/lib/python3.9/site-packages/rdflib/compare.py @ 0:4f3585e2f14b draft default tip

"planemo upload commit 60cee0fc7c0cda8592644e1aad72851dec82c959"
author shellac
date Mon, 22 Mar 2021 18:12:50 +0000
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:4f3585e2f14b
1 # -*- coding: utf-8 -*-
2 """
3 A collection of utilities for canonicalizing and inspecting graphs.
4
5 Among other things, they solve of the problem of deterministic bnode
6 comparisons.
7
8 Warning: the time to canonicalize bnodes may increase exponentially on
9 degenerate larger graphs. Use with care!
10
11 Example of comparing two graphs::
12
13 >>> g1 = Graph().parse(format='n3', data='''
14 ... @prefix : <http://example.org/ns#> .
15 ... <http://example.org> :rel
16 ... <http://example.org/same>,
17 ... [ :label "Same" ],
18 ... <http://example.org/a>,
19 ... [ :label "A" ] .
20 ... ''')
21 >>> g2 = Graph().parse(format='n3', data='''
22 ... @prefix : <http://example.org/ns#> .
23 ... <http://example.org> :rel
24 ... <http://example.org/same>,
25 ... [ :label "Same" ],
26 ... <http://example.org/b>,
27 ... [ :label "B" ] .
28 ... ''')
29 >>>
30 >>> iso1 = to_isomorphic(g1)
31 >>> iso2 = to_isomorphic(g2)
32
33 These are not isomorphic::
34
35 >>> iso1 == iso2
36 False
37
38 Diff the two graphs::
39
40 >>> in_both, in_first, in_second = graph_diff(iso1, iso2)
41
42 Present in both::
43
44 >>> def dump_nt_sorted(g):
45 ... for l in sorted(g.serialize(format='nt').splitlines()):
46 ... if l: print(l.decode('ascii'))
47
48 >>> dump_nt_sorted(in_both) #doctest: +SKIP
49 <http://example.org>
50 <http://example.org/ns#rel> <http://example.org/same> .
51 <http://example.org>
52 <http://example.org/ns#rel> _:cbcaabaaba17fecbc304a64f8edee4335e .
53 _:cbcaabaaba17fecbc304a64f8edee4335e
54 <http://example.org/ns#label> "Same" .
55
56 Only in first::
57
58 >>> dump_nt_sorted(in_first) #doctest: +SKIP
59 <http://example.org>
60 <http://example.org/ns#rel> <http://example.org/a> .
61 <http://example.org>
62 <http://example.org/ns#rel> _:cb124e4c6da0579f810c0ffe4eff485bd9 .
63 _:cb124e4c6da0579f810c0ffe4eff485bd9
64 <http://example.org/ns#label> "A" .
65
66 Only in second::
67
68 >>> dump_nt_sorted(in_second) #doctest: +SKIP
69 <http://example.org>
70 <http://example.org/ns#rel> <http://example.org/b> .
71 <http://example.org>
72 <http://example.org/ns#rel> _:cb558f30e21ddfc05ca53108348338ade8 .
73 _:cb558f30e21ddfc05ca53108348338ade8
74 <http://example.org/ns#label> "B" .
75 """
76 from __future__ import absolute_import
77 from __future__ import division
78 from __future__ import print_function
79
80
81 # TODO:
82 # - Doesn't handle quads.
83 # - Add warning and/or safety mechanism before working on large graphs?
84 # - use this in existing Graph.isomorphic?
85
86 __all__ = ['IsomorphicGraph', 'to_isomorphic', 'isomorphic',
87 'to_canonical_graph', 'graph_diff', 'similar']
88
89 from rdflib.graph import Graph, ConjunctiveGraph, ReadOnlyGraphAggregate
90 from rdflib.term import BNode, Node
91 from hashlib import sha256
92
93 from datetime import datetime
94 from collections import defaultdict
95
96 from six import text_type
97
98
99 def _total_seconds(td):
100 result = td.days * 24 * 60 * 60
101 result += td.seconds
102 result += td.microseconds / 1000000.0
103 return result
104
105
106 class _runtime(object):
107 def __init__(self, label):
108 self.label = label
109
110 def __call__(self, f):
111 if self.label is None:
112 self.label = f.__name__ + "_runtime"
113
114 def wrapped_f(*args, **kwargs):
115 start = datetime.now()
116 result = f(*args, **kwargs)
117 if 'stats' in kwargs and kwargs['stats'] is not None:
118 stats = kwargs['stats']
119 stats[self.label] = _total_seconds(datetime.now() - start)
120 return result
121 return wrapped_f
122
123
124 class _call_count(object):
125 def __init__(self, label):
126 self.label = label
127
128 def __call__(self, f):
129 if self.label is None:
130 self.label = f.__name__ + "_runtime"
131
132 def wrapped_f(*args, **kwargs):
133 if 'stats' in kwargs and kwargs['stats'] is not None:
134 stats = kwargs['stats']
135 if self.label not in stats:
136 stats[self.label] = 0
137 stats[self.label] += 1
138 return f(*args, **kwargs)
139 return wrapped_f
140
141
142 class IsomorphicGraph(ConjunctiveGraph):
143 """An implementation of the RGDA1 graph digest algorithm.
144
145 An implementation of RGDA1 (publication below),
146 a combination of Sayers & Karp's graph digest algorithm using
147 sum and SHA-256 <http://www.hpl.hp.com/techreports/2003/HPL-2003-235R1.pdf>
148 and traces <http://pallini.di.uniroma1.it>, an average case
149 polynomial time algorithm for graph canonicalization.
150
151 McCusker, J. P. (2015). WebSig: A Digital Signature Framework for the Web.
152 Rensselaer Polytechnic Institute, Troy, NY.
153 http://gradworks.umi.com/3727015.pdf
154 """
155
156 def __init__(self, **kwargs):
157 super(IsomorphicGraph, self).__init__(**kwargs)
158
159 def __eq__(self, other):
160 """Graph isomorphism testing."""
161 if not isinstance(other, IsomorphicGraph):
162 return False
163 elif len(self) != len(other):
164 return False
165 return self.internal_hash() == other.internal_hash()
166
167 def __ne__(self, other):
168 """Negative graph isomorphism testing."""
169 return not self.__eq__(other)
170
171 def __hash__(self):
172 return super(IsomorphicGraph, self).__hash__()
173
174 def graph_digest(self, stats=None):
175 """Synonym for IsomorphicGraph.internal_hash."""
176 return self.internal_hash(stats=stats)
177
178 def internal_hash(self, stats=None):
179 """
180 This is defined instead of __hash__ to avoid a circular recursion
181 scenario with the Memory store for rdflib which requires a hash lookup
182 in order to return a generator of triples.
183 """
184 return _TripleCanonicalizer(self).to_hash(stats=stats)
185
186
187 class Color:
188 def __init__(self, nodes, hashfunc, color=(), hash_cache=None):
189 if hash_cache is None:
190 hash_cache = {}
191 self._hash_cache = hash_cache
192 self.color = color
193 self.nodes = nodes
194 self.hashfunc = hashfunc
195 self._hash_color = None
196
197 def __str__(self):
198 nodes, color = self.key()
199 return "Color %s (%s nodes)" % (color, nodes)
200
201 def key(self):
202 return (len(self.nodes), self.hash_color())
203
204 def hash_color(self, color=None):
205 if color is None:
206 color = self.color
207 if color in self._hash_cache:
208 return self._hash_cache[color]
209
210 def stringify(x):
211 if isinstance(x, Node):
212 return x.n3()
213 else:
214 return text_type(x)
215 if isinstance(color, Node):
216 return stringify(color)
217 value = 0
218 for triple in color:
219 value += self.hashfunc(' '.join([stringify(x) for x in triple]))
220 val = u"%x" % value
221 self._hash_cache[color] = val
222 return val
223
224 def distinguish(self, W, graph):
225 colors = {}
226 for n in self.nodes:
227 new_color = list(self.color)
228 for node in W.nodes:
229 new_color += [
230 (1, p, W.hash_color())
231 for s, p, o in graph.triples((n, None, node))]
232 new_color += [
233 (W.hash_color(), p, 3)
234 for s, p, o in graph.triples((node, None, n))]
235 new_color = tuple(new_color)
236 new_hash_color = self.hash_color(new_color)
237
238 if new_hash_color not in colors:
239 c = Color(
240 [], self.hashfunc, new_color,
241 hash_cache=self._hash_cache)
242 colors[new_hash_color] = c
243 colors[new_hash_color].nodes.append(n)
244 return colors.values()
245
246 def discrete(self):
247 return len(self.nodes) == 1
248
249 def copy(self):
250 return Color(
251 self.nodes[:], self.hashfunc, self.color,
252 hash_cache=self._hash_cache)
253
254
255 class _TripleCanonicalizer(object):
256
257 def __init__(self, graph, hashfunc=sha256):
258 self.graph = graph
259
260 def _hashfunc(s):
261 h = hashfunc()
262 h.update(text_type(s).encode("utf8"))
263 return int(h.hexdigest(), 16)
264 self._hash_cache = {}
265 self.hashfunc = _hashfunc
266
267 def _discrete(self, coloring):
268 return len([c for c in coloring if not c.discrete()]) == 0
269
270 def _initial_color(self):
271 """Finds an initial color for the graph.
272
273 Finds an initial color fo the graph by finding all blank nodes and
274 non-blank nodes that are adjacent. Nodes that are not adjacent to blank
275 nodes are not included, as they are a) already colored (by URI or literal)
276 and b) do not factor into the color of any blank node.
277 """
278 bnodes = set()
279 others = set()
280 self._neighbors = defaultdict(set)
281 for s, p, o in self.graph:
282 nodes = set([s, p, o])
283 b = set([x for x in nodes if isinstance(x, BNode)])
284 if len(b) > 0:
285 others |= nodes - b
286 bnodes |= b
287 if isinstance(s, BNode):
288 self._neighbors[s].add(o)
289 if isinstance(o, BNode):
290 self._neighbors[o].add(s)
291 if isinstance(p, BNode):
292 self._neighbors[p].add(s)
293 self._neighbors[p].add(p)
294 if len(bnodes) > 0:
295 return [
296 Color(list(bnodes), self.hashfunc, hash_cache=self._hash_cache)
297 ] + [
298 Color([x], self.hashfunc, x, hash_cache=self._hash_cache)
299 for x in others
300 ]
301 else:
302 return []
303
304 def _individuate(self, color, individual):
305 new_color = list(color.color)
306 new_color.append((len(color.nodes),))
307
308 color.nodes.remove(individual)
309 c = Color([individual], self.hashfunc, tuple(new_color),
310 hash_cache=self._hash_cache)
311 return c
312
313 def _get_candidates(self, coloring):
314 candidates = [c for c in coloring if not c.discrete()]
315 for c in [c for c in coloring if not c.discrete()]:
316 for node in c.nodes:
317 yield node, c
318
319 def _refine(self, coloring, sequence):
320 sequence = sorted(sequence, key=lambda x: x.key(), reverse=True)
321 coloring = coloring[:]
322 while len(sequence) > 0 and not self._discrete(coloring):
323 W = sequence.pop()
324 for c in coloring[:]:
325 if len(c.nodes) > 1 or isinstance(c.nodes[0], BNode):
326 colors = sorted(c.distinguish(W, self.graph),
327 key=lambda x: x.key(),
328 reverse=True)
329 coloring.remove(c)
330 coloring.extend(colors)
331 try:
332 si = sequence.index(c)
333 sequence = sequence[:si] + colors + sequence[si+1:]
334 except ValueError:
335 sequence = colors[1:] + sequence
336 combined_colors = []
337 combined_color_map = dict()
338 for color in coloring:
339 color_hash = color.hash_color()
340 # This is a hash collision, and be combined into a single color for individuation.
341 if color_hash in combined_color_map:
342 combined_color_map[color_hash].nodes.extend(color.nodes)
343 else:
344 combined_colors.append(color)
345 combined_color_map[color_hash] = color
346 return combined_colors
347
348 @_runtime("to_hash_runtime")
349 def to_hash(self, stats=None):
350 result = 0
351 for triple in self.canonical_triples(stats=stats):
352 result += self.hashfunc(' '.join([x.n3() for x in triple]))
353 if stats is not None:
354 stats['graph_digest'] = "%x" % result
355 return result
356
357 def _experimental_path(self, coloring):
358 coloring = [c.copy() for c in coloring]
359 while not self._discrete(coloring):
360 color = [x for x in coloring if not x.discrete()][0]
361 node = color.nodes[0]
362 new_color = self._individuate(color, node)
363 coloring.append(new_color)
364 coloring = self._refine(coloring, [new_color])
365 return coloring
366
367 def _create_generator(self, colorings, groupings=None):
368 if not groupings:
369 groupings = defaultdict(set)
370 for group in zip(*colorings):
371 g = set([c.nodes[0] for c in group])
372 for n in group:
373 g |= groupings[n]
374 for n in g:
375 groupings[n] = g
376 return groupings
377
378 @_call_count("individuations")
379 def _traces(self, coloring, stats=None, depth=[0]):
380 if stats is not None and 'prunings' not in stats:
381 stats['prunings'] = 0
382 depth[0] += 1
383 candidates = self._get_candidates(coloring)
384 best = []
385 best_score = None
386 best_experimental = None
387 best_experimental_score = None
388 last_coloring = None
389 generator = defaultdict(set)
390 visited = set()
391 for candidate, color in candidates:
392 if candidate in generator:
393 v = generator[candidate] & visited
394 if len(v) > 0:
395 visited.add(candidate)
396 continue
397 visited.add(candidate)
398 coloring_copy = []
399 color_copy = None
400 for c in coloring:
401 c_copy = c.copy()
402 coloring_copy.append(c_copy)
403 if c == color:
404 color_copy = c_copy
405 new_color = self._individuate(color_copy, candidate)
406 coloring_copy.append(new_color)
407 refined_coloring = self._refine(coloring_copy, [new_color])
408 color_score = tuple([c.key() for c in refined_coloring])
409 experimental = self._experimental_path(coloring_copy)
410 experimental_score = set([c.key() for c in experimental])
411 if last_coloring:
412 generator = self._create_generator(
413 [last_coloring, experimental],
414 generator)
415 last_coloring = experimental
416 if best_score is None or best_score < color_score:
417 best = [refined_coloring]
418 best_score = color_score
419 best_experimental = experimental
420 best_experimental_score = experimental_score
421 elif best_score > color_score:
422 # prune this branch.
423 if stats is not None:
424 stats['prunings'] += 1
425 elif experimental_score != best_experimental_score:
426 best.append(refined_coloring)
427 else:
428 # prune this branch.
429 if stats is not None:
430 stats['prunings'] += 1
431 discrete = [x for x in best if self._discrete(x)]
432 if len(discrete) == 0:
433 best_score = None
434 best_depth = None
435 for coloring in best:
436 d = [depth[0]]
437 new_color = self._traces(coloring, stats=stats, depth=d)
438 color_score = tuple([c.key() for c in refined_coloring])
439 if best_score is None or color_score > best_score:
440 discrete = [new_color]
441 best_score = color_score
442 best_depth = d[0]
443 depth[0] = best_depth
444 return discrete[0]
445
446 def canonical_triples(self, stats=None):
447 if stats is not None:
448 start_canonicalization = datetime.now()
449 if stats is not None:
450 start_coloring = datetime.now()
451 coloring = self._initial_color()
452 if stats is not None:
453 stats['triple_count'] = len(self.graph)
454 stats['adjacent_nodes'] = max(0, len(coloring) - 1)
455 coloring = self._refine(coloring, coloring[:])
456 if stats is not None:
457 stats['initial_coloring_runtime'] = _total_seconds(datetime.now() - start_coloring)
458 stats['initial_color_count'] = len(coloring)
459
460 if not self._discrete(coloring):
461 depth = [0]
462 coloring = self._traces(coloring, stats=stats, depth=depth)
463 if stats is not None:
464 stats['tree_depth'] = depth[0]
465 elif stats is not None:
466 stats['individuations'] = 0
467 stats['tree_depth'] = 0
468 if stats is not None:
469 stats['color_count'] = len(coloring)
470
471 bnode_labels = dict([(c.nodes[0], c.hash_color()) for c in coloring])
472 if stats is not None:
473 stats["canonicalize_triples_runtime"] = _total_seconds(datetime.now() - start_coloring)
474 for triple in self.graph:
475 result = tuple(self._canonicalize_bnodes(triple, bnode_labels))
476 yield result
477
478 def _canonicalize_bnodes(self, triple, labels):
479 for term in triple:
480 if isinstance(term, BNode):
481 yield BNode(value="cb%s" % labels[term])
482 else:
483 yield term
484
485
486 def to_isomorphic(graph):
487 if isinstance(graph, IsomorphicGraph):
488 return graph
489 result = IsomorphicGraph()
490 if hasattr(graph, "identifier"):
491 result = IsomorphicGraph(identifier=graph.identifier)
492 result += graph
493 return result
494
495
496 def isomorphic(graph1, graph2):
497 """Compare graph for equality.
498
499 Uses an algorithm to compute unique hashes which takes bnodes into account.
500
501 Examples::
502
503 >>> g1 = Graph().parse(format='n3', data='''
504 ... @prefix : <http://example.org/ns#> .
505 ... <http://example.org> :rel <http://example.org/a> .
506 ... <http://example.org> :rel <http://example.org/b> .
507 ... <http://example.org> :rel [ :label "A bnode." ] .
508 ... ''')
509 >>> g2 = Graph().parse(format='n3', data='''
510 ... @prefix ns: <http://example.org/ns#> .
511 ... <http://example.org> ns:rel [ ns:label "A bnode." ] .
512 ... <http://example.org> ns:rel <http://example.org/b>,
513 ... <http://example.org/a> .
514 ... ''')
515 >>> isomorphic(g1, g2)
516 True
517
518 >>> g3 = Graph().parse(format='n3', data='''
519 ... @prefix : <http://example.org/ns#> .
520 ... <http://example.org> :rel <http://example.org/a> .
521 ... <http://example.org> :rel <http://example.org/b> .
522 ... <http://example.org> :rel <http://example.org/c> .
523 ... ''')
524 >>> isomorphic(g1, g3)
525 False
526 """
527 gd1 = _TripleCanonicalizer(graph1).to_hash()
528 gd2 = _TripleCanonicalizer(graph2).to_hash()
529 return gd1 == gd2
530
531
532 def to_canonical_graph(g1, stats=None):
533 """Creates a canonical, read-only graph.
534
535 Creates a canonical, read-only graph where all bnode id:s are based on
536 deterministical SHA-256 checksums, correlated with the graph contents.
537 """
538 graph = Graph()
539 graph += _TripleCanonicalizer(g1).canonical_triples(stats=stats)
540 return ReadOnlyGraphAggregate([graph])
541
542
543 def graph_diff(g1, g2):
544 """Returns three sets of triples: "in both", "in first" and "in second"."""
545 # bnodes have deterministic values in canonical graphs:
546 cg1 = to_canonical_graph(g1)
547 cg2 = to_canonical_graph(g2)
548 in_both = cg1 * cg2
549 in_first = cg1 - cg2
550 in_second = cg2 - cg1
551 return (in_both, in_first, in_second)
552
553
554 _MOCK_BNODE = BNode()
555
556
557 def similar(g1, g2):
558 """Checks if the two graphs are "similar".
559
560 Checks if the two graphs are "similar", by comparing sorted triples where
561 all bnodes have been replaced by a singular mock bnode (the
562 ``_MOCK_BNODE``).
563
564 This is a much cheaper, but less reliable, alternative to the comparison
565 algorithm in ``isomorphic``.
566 """
567 return all(t1 == t2 for (t1, t2) in _squashed_graphs_triples(g1, g2))
568
569
570 def _squashed_graphs_triples(g1, g2):
571 for (t1, t2) in zip(sorted(_squash_graph(g1)), sorted(_squash_graph(g2))):
572 yield t1, t2
573
574
575 def _squash_graph(graph):
576 return (_squash_bnodes(triple) for triple in graph)
577
578
579 def _squash_bnodes(triple):
580 return tuple((isinstance(t, BNode) and _MOCK_BNODE) or t for t in triple)