From 94cdca85780a5b3616b91683773f75b85e4fe2ea Mon Sep 17 00:00:00 2001
From: Edmond Chuc
Date: Wed, 3 Sep 2025 16:38:33 +1000
Subject: [PATCH 1/2] feat: canonicalization with longturtle serializer now optional

Fixes https://github.com/RDFLib/rdflib/issues/3196
---
 rdflib/plugins/serializers/longturtle.py | 45 ++++++++++++++-----
 .../test_serializer_longturtle.py        |  2 +-
 .../test_serializer_longturtle_sort.py   |  2 +-
 3 files changed, 35 insertions(+), 14 deletions(-)

diff --git a/rdflib/plugins/serializers/longturtle.py b/rdflib/plugins/serializers/longturtle.py
index 95b94efd7..1315181b1 100644
--- a/rdflib/plugins/serializers/longturtle.py
+++ b/rdflib/plugins/serializers/longturtle.py
@@ -39,23 +39,20 @@
 
 
 class LongTurtleSerializer(RecursiveSerializer):
+    """LongTurtle, a Turtle serialization format.
+
+    When the optional parameter `canon` is set to True, the graph is canonicalized
+    before serialization. This normalizes blank node identifiers and allows for
+    deterministic serialization of the graph. Useful when consistent outputs are required.
+    """
+
     short_name = "longturtle"
     indentString = "    "
 
     def __init__(self, store):
         self._ns_rewrite = {}
-        namespace_manager = store.namespace_manager
-        store = to_canonical_graph(store)
-        content = store.serialize(format="application/n-triples")
-        lines = content.split("\n")
-        lines.sort()
-        graph = Graph()
-        graph.parse(
-            data="\n".join(lines), format="application/n-triples", skolemize=True
-        )
-        graph = graph.de_skolemize()
-        graph.namespace_manager = namespace_manager
-        super(LongTurtleSerializer, self).__init__(graph)
+        self._canon = False
+        super(LongTurtleSerializer, self).__init__(store)
         self.keywords = {RDF.type: "a"}
         self.reset()
         self.stream = None
@@ -85,11 +82,34 @@ def addNamespace(self, prefix, namespace):
         super(LongTurtleSerializer, self).addNamespace(prefix, namespace)
         return prefix
 
+    def canonize(self):
+        """Apply canonicalization to the store.
+
+        This normalizes blank node identifiers and allows for deterministic
+        serialization of the graph.
+        """
+        if not self._canon:
+            return
+
+        namespace_manager = self.store.namespace_manager
+        store = to_canonical_graph(self.store)
+        content = store.serialize(format="application/n-triples")
+        lines = content.split("\n")
+        lines.sort()
+        graph = Graph()
+        graph.parse(
+            data="\n".join(lines), format="application/n-triples", skolemize=True
+        )
+        graph = graph.de_skolemize()
+        graph.namespace_manager = namespace_manager
+        self.store = graph
+
     def reset(self):
         super(LongTurtleSerializer, self).reset()
         self._shortNames = {}
         self._started = False
         self._ns_rewrite = {}
+        self.canonize()
 
     def serialize(
         self,
@@ -99,6 +119,7 @@ def serialize(
         spacious: Optional[bool] = None,
         **kwargs: Any,
     ) -> None:
+        self._canon = kwargs.get("canon", False)
         self.reset()
         self.stream = stream
         # if base is given here, use, if not and a base is set for the graph use that
diff --git a/test/test_serializers/test_serializer_longturtle.py b/test/test_serializers/test_serializer_longturtle.py
index c1761b6da..65821784e 100644
--- a/test/test_serializers/test_serializer_longturtle.py
+++ b/test/test_serializers/test_serializer_longturtle.py
@@ -167,7 +167,7 @@ def test_longturtle():
     g.bind("sdo", SDO)
 
     # run the long turtle serializer
-    output = g.serialize(format="longturtle")
+    output = g.serialize(format="longturtle", canon=True)
 
     # fix the target
     current_dir = Path.cwd()  # Get the current directory
diff --git a/test/test_serializers/test_serializer_longturtle_sort.py b/test/test_serializers/test_serializer_longturtle_sort.py
index fa192253d..044660e3e 100644
--- a/test/test_serializers/test_serializer_longturtle_sort.py
+++ b/test/test_serializers/test_serializer_longturtle_sort.py
@@ -62,7 +62,7 @@ def test_sort_semiblank_graph() -> None:
         graph.add((outer_node, EX.has, inner_node))
         graph.add((inner_node, RDFS.seeAlso, nested))
 
-        graph_text = graph.serialize(format="longturtle", sort=True)
+        graph_text = graph.serialize(format="longturtle", canon=True)
 
         if first_graph_text == "":
             first_graph_text = graph_text

From 88bf615c6aa28941ba77ec6bacff52c78da1da60 Mon Sep 17 00:00:00 2001
From: Edmond Chuc
Date: Wed, 3 Sep 2025 20:48:01 +1000
Subject: [PATCH 2/2] docs: fix docs build error by removing py obj reference to canon

---
 rdflib/plugins/serializers/longturtle.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/rdflib/plugins/serializers/longturtle.py b/rdflib/plugins/serializers/longturtle.py
index 1315181b1..2aaed36e6 100644
--- a/rdflib/plugins/serializers/longturtle.py
+++ b/rdflib/plugins/serializers/longturtle.py
@@ -41,7 +41,7 @@
 class LongTurtleSerializer(RecursiveSerializer):
     """LongTurtle, a Turtle serialization format.
 
-    When the optional parameter `canon` is set to True, the graph is canonicalized
+    When the optional parameter ``canon`` is set to :py:obj:`True`, the graph is canonicalized
     before serialization. This normalizes blank node identifiers and allows for
     deterministic serialization of the graph. Useful when consistent outputs are required.
     """