Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 0 additions & 1 deletion spec/std/xml/html_spec.cr
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,6 @@ describe XML do
it "parses html5 (#1404)" do
html5 = "<html><body><nav>Test</nav></body></html>"
xml = XML.parse_html(html5)
xml.errors.should_not be_nil
xml.xpath_node("//html/body/nav").should_not be_nil
end

Expand Down
15 changes: 8 additions & 7 deletions spec/std/xml/reader_spec.cr
Original file line number Diff line number Diff line change
Expand Up @@ -567,15 +567,16 @@ module XML
reader.to_unsafe.should be_a(LibXML::XMLTextReader)
end
end
end

describe "#errors" do
it "makes errors accessible" do
reader = XML::Reader.new(%(<people></foo>))
reader.read
reader.expand?
describe "#errors" do
it "makes errors accessible" do
options = XML::ParserOptions::RECOVER | XML::ParserOptions::NONET
reader = XML::Reader.new(%(<people></foo>), options)
reader.read
reader.expand?

reader.errors.map(&.to_s).should eq ["Opening and ending tag mismatch: people line 1 and foo"]
reader.errors.map(&.to_s).should eq ["Opening and ending tag mismatch: people line 1 and foo"]
end
end
end
end
4 changes: 3 additions & 1 deletion spec/std/xml/xml_spec.cr
Original file line number Diff line number Diff line change
Expand Up @@ -159,7 +159,9 @@ describe XML do
end

it "#errors" do
xml = XML.parse(%(<people></foo>))
options = XML::ParserOptions::RECOVER | XML::ParserOptions::NONET

xml = XML.parse(%(<people></foo>), options)
xml.root.not_nil!.name.should eq("people")
xml.errors.try(&.map(&.to_s)).should eq ["Opening and ending tag mismatch: people line 1 and foo"]

Expand Down
108 changes: 67 additions & 41 deletions src/xml.cr
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
require "./xml/libxml2"

# The XML module allows parsing and generating [XML](https://www.w3.org/XML/) documents.
#
# NOTE: To use `XML`, you must explicitly import it with `require "xml"`
Expand Down Expand Up @@ -54,77 +56,101 @@ module XML
# See `ParserOptions.default` for default options.
def self.parse(string : String, options : ParserOptions = ParserOptions.default) : Node
raise XML::Error.new("Document is empty", 0) if string.empty?
from_ptr { LibXML.xmlReadMemory(string, string.bytesize, nil, nil, options) }
ctxt = LibXML.xmlNewParserCtxt
from_ptr(ctxt) do
LibXML.xmlCtxtReadMemory(ctxt, string, string.bytesize, nil, nil, options)
end
end

# Parses an XML document from *io* with *options* into an `XML::Node`.
#
# See `ParserOptions.default` for default options.
def self.parse(io : IO, options : ParserOptions = ParserOptions.default) : Node
from_ptr { LibXML.xmlReadIO(
->(ctx, buffer, len) {
LibC::Int.new(Box(IO).unbox(ctx).read Slice.new(buffer, len))
},
->(ctx) { 0 },
Box(IO).box(io),
nil,
nil,
options,
) }
ctxt = LibXML.xmlNewParserCtxt
from_ptr(ctxt) do
LibXML.xmlCtxtReadIO(ctxt, ->read_callback, ->close_callback, Box(IO).box(io), nil, nil, options)
end
end

# Parses an HTML document from *string* with *options* into an `XML::Node`.
#
# See `HTMLParserOptions.default` for default options.
def self.parse_html(string : String, options : HTMLParserOptions = HTMLParserOptions.default) : Node
raise XML::Error.new("Document is empty", 0) if string.empty?
from_ptr { LibXML.htmlReadMemory(string, string.bytesize, nil, "utf-8", options) }
ctxt = LibXML.htmlNewParserCtxt
from_ptr(ctxt) do
LibXML.htmlCtxtReadMemory(ctxt, string, string.bytesize, nil, "utf-8", options)
end
end

# Parses an HTML document from *io* with *options* into an `XML::Node`.
#
# See `HTMLParserOptions.default` for default options.
def self.parse_html(io : IO, options : HTMLParserOptions = HTMLParserOptions.default) : Node
from_ptr { LibXML.htmlReadIO(
->(ctx, buffer, len) {
LibC::Int.new(Box(IO).unbox(ctx).read Slice.new(buffer, len))
},
->(ctx) { 0 },
Box(IO).box(io),
nil,
"utf-8",
options,
) }
ctxt = LibXML.htmlNewParserCtxt
from_ptr(ctxt) do
LibXML.htmlCtxtReadIO(ctxt, ->read_callback, ->close_callback, Box(IO).box(io), nil, "utf-8", options)
end
end

protected def self.from_ptr(& : -> LibXML::Doc*)
errors = [] of XML::Error
doc = XML::Error.collect(errors) { yield }
# Input read callback handed to libxml2 by the IO-based `parse` and
# `parse_html` methods.
#
# *data* is the boxed `IO` to read from; *buffer* and *len* describe the
# destination memory. Returns the value of `IO#read`, converted to
# `LibC::Int`.
#
# When the bindings do not expose `xmlCtxtSetErrorHandler`, the read runs
# inside `XML::Error.default_handlers`, which is documented as the wrapper to
# use for callbacks that may switch fibers (such as IO operations).
protected def self.read_callback(data : Void*, buffer : UInt8*, len : LibC::Int) : LibC::Int
  source = Box(IO).unbox(data)
  target = Slice.new(buffer, len)

  {% if LibXML.has_method?(:xmlCtxtSetErrorHandler) %}
    bytes_read = source.read(target)
  {% else %}
    bytes_read = XML::Error.default_handlers { source.read(target) }
  {% end %}

  LibC::Int.new(bytes_read)
end

# Input close callback handed to libxml2 together with `read_callback`.
#
# It ignores *data* (the IO is not touched) and always returns 0.
protected def self.close_callback(data : Void*) : LibC::Int
  LibC::Int.zero
end

# Yields to the block that performs the actual libxml2 read for *ctxt* and
# wraps the document pointer it returns in a `Node`, together with the errors
# recorded during the read.
#
# When the bindings expose `xmlCtxtSetErrorHandler`, errors are recorded via a
# handler registered on *ctxt*; otherwise the block runs inside
# `XML::Error.unsafe_collect`.
#
# Raises `XML::Error` (built from `LibXML.xmlGetLastError`) when the block
# returns a null document pointer.
#
# NOTE(review): *ctxt* is not released in this method -- confirm where the
# parser context is freed.
protected def self.from_ptr(ctxt, & : -> LibXML::Doc*)
errors = [] of XML::Error
doc =
{% if LibXML.has_method?(:xmlCtxtSetErrorHandler) %}
LibXML.xmlCtxtSetErrorHandler(ctxt, ->Error.structured_callback, Box.box(errors))
yield
{% else %}
XML::Error.unsafe_collect(errors) { yield }
{% end %}
raise Error.new(LibXML.xmlGetLastError) unless doc

Node.new(doc, errors)
end

protected def self.with_indent_tree_output(indent : Bool, &)
ptr = LibXML.__xmlIndentTreeOutput
old, ptr.value = ptr.value, indent ? 1 : 0
begin
yield
ensure
ptr.value = old
{% unless LibXML.has_method?(:xmlSaveSetIndentString) %}
# NOTE: These helpers are for internal compatibility with libxml < 2.14.

protected def self.with_indent_tree_output(indent : Bool, &)
save_indent_tree_output do
LibXML.__xmlIndentTreeOutput.value = indent ? 1 : 0
yield
end
end
end

protected def self.with_tree_indent_string(string : String, &)
ptr = LibXML.__xmlTreeIndentString
old, ptr.value = ptr.value, string.to_unsafe
begin
yield
ensure
ptr.value = old
protected def self.save_indent_tree_output(&)
value = LibXML.__xmlIndentTreeOutput.value
begin
yield
ensure
LibXML.__xmlIndentTreeOutput.value = value
end
end
end

protected def self.with_tree_indent_string(string : String, &)
value = LibXML.__xmlTreeIndentString.value
LibXML.__xmlTreeIndentString.value = string.to_unsafe
begin
yield
ensure
LibXML.__xmlTreeIndentString.value = value
end
end
{% end %}

class_getter libxml2_version : String do
version_string = String.new(LibXML.xmlParserVersion)
Expand Down
70 changes: 63 additions & 7 deletions src/xml/error.cr
Original file line number Diff line number Diff line change
Expand Up @@ -16,27 +16,83 @@ class XML::Error < Exception
{% raise "`XML::Error.errors` was removed because it leaks memory when it's not used. XML errors are accessible directly in the respective context via `XML::Reader#errors` and `XML::Node#errors`.\nSee https://github.com/crystal-lang/crystal/issues/14934 for details. " %}
end

def self.collect(errors, &)
LibXML.xmlSetStructuredErrorFunc Box.box(errors), ->(ctx, error) {
Box(Array(XML::Error)).unbox(ctx) << XML::Error.new(error)
}
# Structured error handler: *data* is a boxed `Array(Error)`; the reported
# *error* is wrapped in an `Error` and appended to that array.
protected def self.structured_callback(data : Void*, error : LibXML::Error*) : Nil
  collected = Box(Array(Error)).unbox(data)
  collected.push(Error.new(error))
end

# Generic error handler: *data* is a boxed `Array(Error)`. The *fmt* C string
# is copied, stripped of its trailing newline and appended to the array as an
# `XML::Error` with line number 0.
#
# Only *fmt* itself is read here; no further arguments are interpolated.
protected def self.generic_callback(data : Void*, fmt : UInt8*) : Nil
  collected = Box(Array(Error)).unbox(data)
  text = String.new(fmt).chomp
  collected.push(XML::Error.new(text, 0))
end

# Records XML errors reported while the block runs into *errors*.
#
# The global error handlers (and their user data) of the current thread are
# saved, replaced with handlers that append to *errors*, and restored before
# this method returns.
#
# Both the structured and the generic handler are replaced because
# libxml < 2.13 uses *both* in practice.
#
# NOTE: This is for internal compatibility with libxml < 2.13. Do not use.
protected def self.unsafe_collect(errors : Array(Error), &)
  boxed_errors = Box.box(errors)
  with_handlers(
    boxed_errors, ->structured_callback(Void*, LibXML::Error*),
    boxed_errors, ->generic_callback(Void*, UInt8*)
  ) { yield }
end

# Runs the block with the default libxml2 error handlers installed.
#
# The current global error handlers (and their user data) are saved, the
# defaults are restored for the duration of the block, and the saved handlers
# (and user data) are put back before this method returns.
#
# Use this when a callback can potentially do a fiber context switch, for
# example IO operations.
#
# Both the structured and the generic handler are handled because
# libxml < 2.13 uses *both* in practice.
#
# NOTE: This is for internal compatibility with libxml < 2.13. Do not use.
protected def self.default_handlers(&)
  with_handlers(nil, nil, nil, nil) do
    yield
  end
end

# Installs the given structured (*scontext*, *shandler*) and generic
# (*context*, *handler*) error handlers for the duration of the block.
#
# The values currently stored in the libxml2 structured/generic handler and
# context slots are read first and written back in the `ensure` clause, so
# they are restored even if the block raises.
private def self.with_handlers(scontext, shandler, context, handler, &)
orig_scontext = LibXML.__xmlStructuredErrorContext.value
orig_shandler = LibXML.__xmlStructuredError.value

orig_context = LibXML.__xmlGenericErrorContext.value
orig_handler = LibXML.__xmlGenericError.value

LibXML.xmlSetStructuredErrorFunc(scontext, shandler)
LibXML.xmlSetGenericErrorFunc(context, handler)

begin
yield
ensure
LibXML.xmlSetStructuredErrorFunc nil, nil
# can't call xmlSetStructuredErrorFunc or xmlSetGenericErrorFunc: the
# compiler complains that it's passing a closure to C (it's not)
LibXML.__xmlStructuredErrorContext.value = orig_scontext
LibXML.__xmlStructuredError.value = orig_shandler

LibXML.__xmlGenericErrorContext.value = orig_context
LibXML.__xmlGenericError.value = orig_handler
end
end

# Deprecated public entry point kept for compatibility; it simply forwards
# to `unsafe_collect` with the same *errors* array and block.
@[Deprecated("Legacy libxml2 API that mutate global state. Do not use.")]
def self.collect(errors, &)
  unsafe_collect(errors) do
    yield
  end
end

@[Deprecated("Legacy libxml2 API that mutate global state. Do not use.")]
def self.collect_generic(errors, &)
LibXML.xmlSetGenericErrorFunc Box.box(errors), ->(ctx, fmt) {
LibXML.xmlSetGenericErrorFunc Box.box(errors), ->(data, fmt) {
# TODO: use va_start and va_end to expand the variadic arguments of *fmt*
message = String.new(fmt).chomp
error = XML::Error.new(message, 0)

{% if flag?(:arm) || flag?(:aarch64) %}
# libxml2 is likely missing ARM unwind tables (.ARM.extab and .ARM.exidx
# sections) which prevent raising from a libxml2 context.
Box(Array(XML::Error)).unbox(ctx) << error
Box(Array(XML::Error)).unbox(data) << error
{% else %}
raise error
{% end %}
Expand Down
45 changes: 43 additions & 2 deletions src/xml/libxml2.cr
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,18 @@ require "./save_options"
{% end %}
{% end %}
lib LibXML
# The bindings default to libxml 2.9 that was released in 2012. We can safely
# assume at least this version is available everywhere.

{% if (version = env("LIBXML_VERSION")) && (version.strip != "") %}
VERSION = {{env("LIBXML_VERSION")}}
{% elsif !flag?(:win32) || flag?(:gnu) %}
VERSION = {{`sh -c "pkg-config libxml-2.0 --silence-errors --modversion 2> /dev/null || echo 2.9.0"`.strip.stringify}}
{% else %}
# TODO: figure out the actual libxml version on *-windows-msvc target
VERSION = "2.9.0"
{% end %}

alias Int = LibC::Int

$xmlParserVersion : LibC::Char*
Expand Down Expand Up @@ -69,6 +81,8 @@ lib LibXML
properties : Int
end

alias HTMLDoc = Doc

struct Attr
include NodeCommon
ns : NS*
Expand Down Expand Up @@ -97,6 +111,9 @@ lib LibXML
alias XMLTextReader = Void*
alias XMLTextReaderLocator = Void*

alias ParserCtxt = Void*
alias HTMLParserCtxt = ParserCtxt

enum ParserSeverity
VALIDITY_WARNING = 1
VALIDITY_ERROR = 2
Expand Down Expand Up @@ -134,7 +151,7 @@ lib LibXML
fun xmlTextReaderCurrentNode(reader : XMLTextReader) : Node*

fun xmlTextReaderSetErrorHandler(reader : XMLTextReader, f : TextReaderErrorFunc) : Void

fun xmlTextReaderSetStructuredErrorHandler(reader : XMLTextReader, f : StructuredErrorFunc, arg : Void*) : Void
fun xmlTextReaderLocatorLineNumber(XMLTextReaderLocator) : Int

fun xmlReadMemory(buffer : UInt8*, size : Int, url : UInt8*, encoding : UInt8*, options : XML::ParserOptions) : Doc*
Expand All @@ -146,6 +163,14 @@ lib LibXML
fun xmlReadIO(ioread : InputReadCallback, ioclose : InputCloseCallback, ioctx : Void*, url : UInt8*, encoding : UInt8*, options : XML::ParserOptions) : Doc*
fun htmlReadIO(ioread : InputReadCallback, ioclose : InputCloseCallback, ioctx : Void*, url : UInt8*, encoding : UInt8*, options : XML::HTMLParserOptions) : Doc*

fun xmlNewParserCtxt : ParserCtxt
fun xmlCtxtReadIO(ParserCtxt, ioread : InputReadCallback, ioclose : InputCloseCallback, ioctx : Void*, url : UInt8*, encoding : UInt8*, options : XML::ParserOptions) : Doc*
fun xmlCtxtReadMemory(ParserCtxt, buffer : UInt8*, size : Int, url : UInt8*, encoding : UInt8*, options : XML::ParserOptions) : Doc*

fun htmlNewParserCtxt : HTMLParserCtxt
fun htmlCtxtReadMemory(HTMLParserCtxt, buffer : UInt8*, size : Int, url : UInt8*, encoding : UInt8*, options : XML::HTMLParserOptions) : Doc*
fun htmlCtxtReadIO(HTMLParserCtxt, ioread : InputReadCallback, ioclose : InputCloseCallback, ioctx : Void*, url : UInt8*, encoding : UInt8*, options : XML::HTMLParserOptions) : Doc*

fun xmlDocGetRootElement(doc : Doc*) : Node*
fun xmlXPathNodeSetCreate(node : Node*) : NodeSet*
fun xmlXPathNodeSetAddUnique(cur : NodeSet*, val : Node*) : Int
Expand Down Expand Up @@ -321,8 +346,15 @@ lib LibXML
alias StructuredErrorFunc = (Void*, Error*) ->
alias GenericErrorFunc = (Void*, UInt8*) ->

fun xmlSetStructuredErrorFunc(ctx : Void*, f : StructuredErrorFunc)
# deprecated
fun xmlSetGenericErrorFunc(ctx : Void*, f : GenericErrorFunc)
fun __xmlGenericError : GenericErrorFunc*
fun __xmlGenericErrorContext : Void**

# deprecated since 2.13
fun xmlSetStructuredErrorFunc(ctx : Void*, f : StructuredErrorFunc)
fun __xmlStructuredError : StructuredErrorFunc*
fun __xmlStructuredErrorContext : Void**

fun xmlGetNsList(doc : Doc*, node : Node*) : NS**

Expand All @@ -331,6 +363,15 @@ lib LibXML
fun xmlUnsetProp(node : Node*, name : UInt8*) : Int

fun xmlValidateNameValue(value : UInt8*) : Int

{% if compare_versions(LibXML::VERSION, "2.13.0") >= 0 %}
fun xmlCtxtSetErrorHandler(ctxt : ParserCtxt, handler : StructuredErrorFunc, data : Void*)
fun xmlXPathSetErrorHandler(ctxt : XPathContext*, handler : StructuredErrorFunc, data : Void*)
{% end %}

{% if compare_versions(LibXML::VERSION, "2.14.0") >= 0 %}
fun xmlSaveSetIndentString(SaveCtxPtr, UInt8*)
{% end %}
end

LibXML.xmlInitParser
Expand Down
Loading