Skip to content

Commit 9e137bb

Browse files
committed
test: expand UTF-16 testing to JRuby
and make sure we're using the BOM correctly
1 parent 97cde04 commit 9e137bb

File tree

1 file changed

+35
-12
lines changed

1 file changed

+35
-12
lines changed

Diff for: test/xml/test_document_encoding.rb

+35-12
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,17 @@ class TestDocumentEncoding < Nokogiri::TestCase
88
describe "Nokogiri::XML::Document encoding" do
99
let(:shift_jis_document) { Nokogiri::XML(File.read(SHIFT_JIS_XML), SHIFT_JIS_XML) }
1010
let(:ascii_document) { Nokogiri::XML.parse(File.read(XML_FILE), XML_FILE) }
11+
let(:utf16_document) do
12+
# the document needs to be large enough to trigger a libxml2 buffer flush. the buffer size
13+
# is determined by MINLEN in xmlIO.c, which is hardcoded to 4000 code points.
14+
size = 8000
15+
<<~XML.encode(Encoding::UTF_16)
16+
<?xml version="1.0" encoding="UTF-16"?>
17+
<root>
18+
<bar>#{"A" * size}</bar>
19+
</root>
20+
XML
21+
end
1122

1223
describe "#encoding" do
1324
it "describes the document's encoding correctly" do
@@ -44,25 +55,37 @@ class TestDocumentEncoding < Nokogiri::TestCase
4455
assert_equal(Encoding::UTF_8, Nokogiri::LIBXML_COMPILED_VERSION.encoding)
4556
assert_equal(Encoding::UTF_8, Nokogiri::LIBXSLT_COMPILED_VERSION.encoding)
4657
end
58+
59+
it "parses and serializes UTF-16 correctly" do
60+
xml = <<~XML.encode(Encoding::UTF_16)
61+
<?xml version="1.0" encoding="UTF-16"?>
62+
<root><bar>A</bar></root>
63+
XML
64+
output = Nokogiri::XML(xml).to_xml
65+
output_doc = Nokogiri::XML(output)
66+
67+
# these byte sizes are descriptive, not prescriptive. the difference is whitespace. this may change
68+
# as implementations change. the intention is to verify that they're _roughly_ the right
69+
# length, i.e. that they're not zero or half-width or double-width.
70+
expected_bytesize = Nokogiri.jruby? ? 132 : 142
71+
72+
assert_equal(Encoding::UTF_16, output.encoding)
73+
assert_equal("UTF-16", output_doc.encoding)
74+
assert_equal(expected_bytesize, output.bytesize)
75+
output_doc.at_xpath("/root/bar/text()").tap do |node|
76+
assert(node, "unexpected DOM structure in #{output.inspect}")
77+
assert_equal("A", node.content)
78+
end
4779
end
4880

4981
it "serializes UTF-16 correctly across libxml2 buffer flushes" do
5082
# https://github.com/sparklemotion/nokogiri/issues/752
5183
skip_unless_libxml2
5284

53-
# the document needs to be large enough to trigger a libxml2 buffer flush. the buffer size
54-
# is determined by MINLEN in xmlIO.c, which is hardcoded to 4000 code points.
55-
size = 8000
56-
input = String.new(<<~XML, encoding: "UTF-16")
57-
<?xml version="1.0" encoding="UTF-16"?>
58-
<root>
59-
<bar>#{"A" * size}</bar>
60-
</root>
61-
XML
62-
expected_length = (input.bytesize * 2) + 2 # double character width, add BOM bytes 0xFEFF
85+
output = Nokogiri::XML(utf16_document).to_xml
6386

64-
output = Nokogiri::XML(input).to_xml
65-
assert_equal(expected_length, output.bytesize)
87+
assert_equal(Encoding::UTF_16, output.encoding)
88+
assert_equal(utf16_document.bytesize, output.bytesize)
6689
end
6790
end
6891
end

0 commit comments

Comments
 (0)