crystal-lang · straight-shoota · Jul 2, 2025 · Jun 23, 2025 · Jun 27, 2025
diff --git a/src/xml.cr b/src/xml.cr
@@ -54,7 +54,7 @@ module XML
   # Parses an XML document from *string* with *options* into an `XML::Node`.
   #
   # See `ParserOptions.default` for default options.
-  def self.parse(string : String, options : ParserOptions = ParserOptions.default) : Node
+  def self.parse(string : String, options : ParserOptions = ParserOptions.default) : Document
     raise XML::Error.new("Document is empty", 0) if string.empty?
     ctxt = LibXML.xmlNewParserCtxt
     from_ptr(ctxt) do
@@ -65,7 +65,7 @@ module XML
   # Parses an XML document from *io* with *options* into an `XML::Node`.
   #
   # See `ParserOptions.default` for default options.
-  def self.parse(io : IO, options : ParserOptions = ParserOptions.default) : Node
+  def self.parse(io : IO, options : ParserOptions = ParserOptions.default) : Document
     ctxt = LibXML.xmlNewParserCtxt
     from_ptr(ctxt) do
       LibXML.xmlCtxtReadIO(ctxt, ->read_callback, ->close_callback, Box(IO).box(io), nil, nil, options)
@@ -75,7 +75,7 @@ module XML
   # Parses an HTML document from *string* with *options* into an `XML::Node`.
   #
   # See `HTMLParserOptions.default` for default options.
-  def self.parse_html(string : String, options : HTMLParserOptions = HTMLParserOptions.default) : Node
+  def self.parse_html(string : String, options : HTMLParserOptions = HTMLParserOptions.default) : Document
     raise XML::Error.new("Document is empty", 0) if string.empty?
     ctxt = LibXML.htmlNewParserCtxt
     from_ptr(ctxt) do
@@ -86,7 +86,7 @@ module XML
   # Parses an HTML document from *io* with *options* into an `XML::Node`.
   #
   # See `HTMLParserOptions.default` for default options.
-  def self.parse_html(io : IO, options : HTMLParserOptions = HTMLParserOptions.default) : Node
+  def self.parse_html(io : IO, options : HTMLParserOptions = HTMLParserOptions.default) : Document
     ctxt = LibXML.htmlNewParserCtxt
     from_ptr(ctxt) do
       LibXML.htmlCtxtReadIO(ctxt, ->read_callback, ->close_callback, Box(IO).box(io), nil, "utf-8", options)
@@ -119,7 +119,7 @@ module XML
       {% end %}
     raise Error.new(LibXML.xmlGetLastError) unless doc
 
-    Node.new(doc, errors)
+    Document.new(doc, errors)
   end
 
   {% unless LibXML.has_method?(:xmlSaveSetIndentString) %}

diff --git a/src/xml/document.cr b/src/xml/document.cr
@@ -0,0 +1,94 @@
+require "weak_ref"
+
+class XML::Document < XML::Node # XML::Node subclass wrapping a libxml doc (LibXML::Doc*)
+  # :nodoc:
+  #
+  # The constructors allocate an XML::Node for a given libxml node only once,
+  # so that, for example, a document is never finalized twice.
+  #
+  # For documents we store the reference in the libxml struct (_private),
+  # because a document's XML::Node lives as long as its libxml doc. However,
+  # references to subtree XML::Node instances can be lost, so using _private
+  # for them would leave dangling pointers. We therefore keep a cache of weak
+  # references to all nodes in the document, so lost references can still be
+  # collected and, at worst, an XML::Node is reinstantiated when needed.
+  #
+  # NOTE: when an XML::Node is moved to another document, it and any
+  # instantiated descendant XML::Node shall be removed from the original
+  # document's cache, and must be added to the new document's cache.
+  protected getter cache : Hash(LibXML::Node*, WeakRef(Node))
+
+  # :nodoc:
+  #
+  # Unlinked libxml nodes, and all their descendant nodes, no longer appear in
+  # the document's tree and must be freed manually. Yet we can't merely free
+  # the libxml node in a finalizer, because that would free the whole subtree
+  # while we may still have live XML::Node instances pointing into it.
+  #
+  # We keep an explicit list of unlinked libxml nodes. We can't rely on the
+  # cache for this because it uses weak references: the XML::Node could be
+  # collected, leaking the libxml node and its subtree.
+  #
+  # NOTE: a libxml node, along with any descendants, shall be removed from this
+  # list when it is relinked into a tree, be it in the same document or another.
+  protected getter unlinked_nodes : Set(LibXML::Node*)
+
+  # :nodoc:
+  def self.new(doc : LibXML::Doc*, errors : Array(Error)? = nil) : Document
+    if ptr = doc.value._private # non-null once #initialize has run for this doc
+      ptr.as(Document) # reuse the existing instance; *errors* is ignored on this path
+    else
+      new(doc_: doc, errors_: errors)
+    end
+  end
+
+  # Must never be called directly; use `.new(doc, errors)` above, which reuses an existing instance.
+  private def initialize(*, doc_ : LibXML::Doc*, errors_ : Array(Error)?)
+    @node = doc_.as(LibXML::Node*) # the doc pointer is kept typed as a generic node pointer
+    @errors = errors_
+    @cache = Hash(LibXML::Node*, WeakRef(Node)).new
+    @unlinked_nodes = Set(LibXML::Node*).new
+    @document = self
+    doc_.value._private = self.as(Void*) # back-pointer that `.new` reads to find this instance again
+  end
+
+  # :nodoc:
+  def finalize
+    # free the unlinked nodes (and their subtrees) that still belong to this document
+    @unlinked_nodes.each do |node|
+      if node.value.doc == @node
+        LibXML.xmlFreeNode(node)
+      else
+        # the node has been adopted into another document, so it must not be freed here
+      end
+    end
+
+    # finally free the doc itself and the tree still linked into it
+    LibXML.xmlFreeDoc(@node.as(LibXML::Doc*))
+  end
+
+  # Returns the encoding recorded on this document, or `nil` if the libxml doc has none.
+  def encoding : String?
+    if encoding = @node.as(LibXML::Doc*).value.encoding
+      String.new(encoding)
+    end
+  end
+
+  # Returns the version recorded on this document, or `nil` if the libxml doc has none.
+  def version : String?
+    if version = @node.as(LibXML::Doc*).value.version
+      String.new(version)
+    end
+  end
+
+  # :nodoc:
+  def document : Document
+    self # a document is its own document
+  end
+
+  # Returns the list of `XML::Error` found when parsing this document.
+  # Returns `nil` when there are none (no array was given, or it is empty).
+  def errors : Array(XML::Error)?
+    @errors unless @errors.try &.empty?
+  end
+end
diff --git a/src/xml/namespace.cr b/src/xml/namespace.cr
@@ -1,8 +1,14 @@
 class XML::Namespace
-  getter document : Node
+  getter document : Document
 
   # :nodoc:
-  def initialize(@document : Node, @ns : LibXML::NS*)
+  @[Deprecated]
+  def self.new(document : Node, ns : LibXML::NS*) # accepts the previous `Node`-typed signature
+    new(document.as(Document), ns) # forwards to the `Document`-typed constructor; the cast fails at runtime unless *document* is a Document
+  end
+
+  # :nodoc:
+  def initialize(@document : Document, @ns : LibXML::NS*) # stores the owning document and the raw libxml namespace pointer
   end
 
   # See `Object#hash(hasher)`