From 5c806a23a89a92687e82c03958b4a5d178b3c83a Mon Sep 17 00:00:00 2001 From: Joern Hees Date: Mon, 13 Jul 2015 15:09:12 +0200 Subject: [PATCH 1/3] test for #494 (canonicalization sometimes collapses BNodes) --- test/test_canonicalization.py | 107 ++++++++++++++++++++++++++++++++-- 1 file changed, 103 insertions(+), 4 deletions(-) diff --git a/test/test_canonicalization.py b/test/test_canonicalization.py index 2745f4903..30fb0bb1e 100644 --- a/test/test_canonicalization.py +++ b/test/test_canonicalization.py @@ -37,10 +37,10 @@ def negative_graph_match_test(): True ], [ unicode('''@prefix : . - :linear_two_step_symmetry_start :related [ :related [ :related :linear_two_step_symmatry_end]], + :linear_two_step_symmetry_start :related [ :related [ :related :linear_two_step_symmatry_end]], [ :related [ :related :linear_two_step_symmatry_end]].'''), unicode('''@prefix : . - :linear_two_step_symmetry_start :related [ :related [ :related :linear_two_step_symmatry_end]], + :linear_two_step_symmetry_start :related [ :related [ :related :linear_two_step_symmatry_end]], [ :related [ :related :linear_two_step_symmatry_end]].'''), True ], @@ -68,7 +68,7 @@ def negative_graph_match_test(): ].'''), False ], - # This test fails because the algorithm purposefully breaks the symmetry of symetric + # This test fails because the algorithm purposefully breaks the symmetry of symetric [ unicode('''@prefix : . _:a :rel [ :rel [ @@ -148,4 +148,103 @@ def fn(rdf1, rdf2, identical): print digest2 assert (digest1 == digest2) == identical for inputs in testInputs: - yield fn, inputs[0], inputs[1], inputs[2] \ No newline at end of file + yield fn, inputs[0], inputs[1], inputs[2] + +def test_issue494_collapsing_bnodes(): + """Test for https://github.com/RDFLib/rdflib/issues/494 collapsing BNodes""" + g = Graph() + g += [ + (BNode('Na1a8fbcf755f41c1b5728f326be50994'), + RDF['object'], + URIRef(u'source')), + (BNode('Na1a8fbcf755f41c1b5728f326be50994'), + RDF['predicate'], + BNode('vcb3')), + (BNode('Na1a8fbcf755f41c1b5728f326be50994'), + RDF['subject'], + BNode('vcb2')), + (BNode('Na1a8fbcf755f41c1b5728f326be50994'), + RDF['type'], + RDF['Statement']), + (BNode('Na713b02f320d409c806ff0190db324f4'), + RDF['object'], + URIRef(u'target')), + (BNode('Na713b02f320d409c806ff0190db324f4'), + RDF['predicate'], + BNode('vcb0')), + (BNode('Na713b02f320d409c806ff0190db324f4'), + RDF['subject'], + URIRef(u'source')), + (BNode('Na713b02f320d409c806ff0190db324f4'), + RDF['type'], + RDF['Statement']), + (BNode('Ndb804ba690a64b3dbb9063c68d5e3550'), + RDF['object'], + BNode('vr0KcS4')), + (BNode('Ndb804ba690a64b3dbb9063c68d5e3550'), + RDF['predicate'], + BNode('vrby3JV')), + (BNode('Ndb804ba690a64b3dbb9063c68d5e3550'), + RDF['subject'], + URIRef(u'source')), + (BNode('Ndb804ba690a64b3dbb9063c68d5e3550'), + RDF['type'], + RDF['Statement']), + (BNode('Ndfc47fb1cd2d4382bcb8d5eb7835a636'), + RDF['object'], + URIRef(u'source')), + (BNode('Ndfc47fb1cd2d4382bcb8d5eb7835a636'), + RDF['predicate'], + BNode('vcb5')), + (BNode('Ndfc47fb1cd2d4382bcb8d5eb7835a636'), + RDF['subject'], + URIRef(u'target')), + (BNode('Ndfc47fb1cd2d4382bcb8d5eb7835a636'), + RDF['type'], + RDF['Statement']), + (BNode('Nec6864ef180843838aa9805bac835c98'), + RDF['object'], + URIRef(u'source')), + (BNode('Nec6864ef180843838aa9805bac835c98'), + RDF['predicate'], + BNode('vcb4')), + (BNode('Nec6864ef180843838aa9805bac835c98'), + RDF['subject'], + URIRef(u'source')), + (BNode('Nec6864ef180843838aa9805bac835c98'), + RDF['type'], + RDF['Statement']), + ] + + print 'graph length: %d, nodes: %d' % (len(g), len(g.all_nodes())) + print 'triple_bnode degrees:' + for triple_bnode in g.subjects(RDF['type'], RDF['Statement']): + print len(list(g.triples([triple_bnode, None, None]))) + print 'all node degrees:' + g_node_degs = sorted([ + len(list(g.triples([node, None, None]))) + for node in g.all_nodes() + ], reverse=True) + print g_node_degs + + cg = to_canonical_graph(g) + print 'graph length: %d, nodes: %d' % (len(cg), len(cg.all_nodes())) + print 'triple_bnode degrees:' + for triple_bnode in cg.subjects(RDF['type'], RDF['Statement']): + print len(list(cg.triples([triple_bnode, None, None]))) + print 'all node degrees:' + cg_node_degs = sorted([ + len(list(cg.triples([node, None, None]))) + for node in cg.all_nodes() + ], reverse=True) + print cg_node_degs + + assert len(g) == len(cg), \ + 'canonicalization changed number of triples in graph' + assert len(g.all_nodes()) == len(cg.all_nodes()), \ + 'canonicalization changed number of nodes in graph' + assert len(list(g.subjects(RDF['type'], RDF['Statement']))) == \ + len(list(cg.subjects(RDF['type'], RDF['Statement']))), \ + 'canonicalization changed number of statements' + assert g_node_degs == cg_node_degs, \ + 'canonicalization changed node degrees' From da3e0d8c8da1ef6be4c946317157faf2b6928933 Mon Sep 17 00:00:00 2001 From: Jim McCusker Date: Sun, 9 Aug 2015 18:39:08 -0400 Subject: [PATCH 2/3] Fixed bnode collision bug. --- rdflib/compare.py | 9 ++++++--- test/test_canonicalization.py | 2 ++ 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/rdflib/compare.py b/rdflib/compare.py index 0f93639c5..6cb23f85b 100644 --- a/rdflib/compare.py +++ b/rdflib/compare.py @@ -204,7 +204,9 @@ def stringify(x): return unicode(x) if isinstance(color, Node): return stringify(color) - value = sum(map(self.hashfunc, ' '.join([stringify(x) for x in color]))) + value = 0 + for triple in color: + value += self.hashfunc(' '.join([stringify(x) for x in triple])) val = u"%x" % value self._hash_cache[color] = val return val @@ -290,7 +292,7 @@ def _initial_color(self): def _individuate(self, color, individual): new_color = list(color.color) - new_color.append((len(color.nodes))) + new_color.append(tuple([len(color.nodes)])) color.nodes.remove(individual) c = Color([individual], self.hashfunc, tuple(new_color), @@ -320,6 +322,7 @@ def _refine(self, coloring, sequence): sequence = sequence[:si] + colors + sequence[si+1:] except ValueError: sequence = colors[1:] + sequence + return coloring @_runtime("to_hash_runtime") @@ -407,7 +410,6 @@ def _traces(self, coloring, stats=None, depth=[0]): stats['prunings'] += 1 discrete = [x for x in best if self._discrete(x)] if len(discrete) == 0: - very_best = None best_score = None best_depth = None for coloring in best: @@ -434,6 +436,7 @@ def canonical_triples(self, stats=None): if stats is not None: stats['initial_coloring_runtime'] = _total_seconds(datetime.now() - start_coloring) stats['initial_color_count'] = len(coloring) + if not self._discrete(coloring): depth = [0] coloring = self._traces(coloring, stats=stats, depth=depth) diff --git a/test/test_canonicalization.py b/test/test_canonicalization.py index 30fb0bb1e..87b5eeaa9 100644 --- a/test/test_canonicalization.py +++ b/test/test_canonicalization.py @@ -144,7 +144,9 @@ def negative_graph_match_test(): def fn(rdf1, rdf2, identical): digest1 = get_digest_value(rdf1,"text/turtle") digest2 = get_digest_value(rdf2,"text/turtle") + print rdf1 print digest1 + print rdf2 print digest2 assert (digest1 == digest2) == identical for inputs in testInputs: From b8df01f9bc53ccd01d61953108c30afae0f4b36e Mon Sep 17 00:00:00 2001 From: Joern Hees Date: Mon, 10 Aug 2015 19:41:32 +0200 Subject: [PATCH 3/3] minor: tuple literal --- rdflib/compare.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rdflib/compare.py b/rdflib/compare.py index 6cb23f85b..06ba97505 100644 --- a/rdflib/compare.py +++ b/rdflib/compare.py @@ -292,7 +292,7 @@ def _initial_color(self): def _individuate(self, color, individual): new_color = list(color.color) - new_color.append(tuple([len(color.nodes)])) + new_color.append((len(color.nodes),)) color.nodes.remove(individual) c = Color([individual], self.hashfunc, tuple(new_color),