@@ -8,6 +8,17 @@ class TestDocumentEncoding < Nokogiri::TestCase
8
8
describe "Nokogiri::XML::Document encoding" do
9
9
# Parses the contents of the SHIFT_JIS_XML fixture file; the same path is handed to
# Nokogiri::XML() as its second argument.
let(:shift_jis_document) do
  Nokogiri::XML(File.read(SHIFT_JIS_XML), SHIFT_JIS_XML)
end
10
10
# Parses the contents of the XML_FILE fixture through Nokogiri::XML.parse; the same path is
# passed along as the second argument.
let(:ascii_document) do
  Nokogiri::XML.parse(File.read(XML_FILE), XML_FILE)
end
11
# A UTF-16 encoded XML string whose <bar> element holds a long run of "A" characters.
let(:utf16_document) do
  # The payload must be big enough to make libxml2 flush its buffer. That buffer's size is
  # governed by MINLEN in xmlIO.c, which is hardcoded to 4000 code points.
  payload_length = 8000
  markup = <<~XML
    <?xml version="1.0" encoding="UTF-16"?>
    <root>
    <bar>#{"A" * payload_length}</bar>
    </root>
  XML
  # Transcode (not merely relabel) the text so the bytes really are UTF-16.
  markup.encode(Encoding::UTF_16)
end
11
22
12
23
describe "#encoding" do
13
24
it "describes the document's encoding correctly" do
@@ -44,25 +55,37 @@ class TestDocumentEncoding < Nokogiri::TestCase
44
55
assert_equal ( Encoding ::UTF_8 , Nokogiri ::LIBXML_COMPILED_VERSION . encoding )
45
56
assert_equal ( Encoding ::UTF_8 , Nokogiri ::LIBXSLT_COMPILED_VERSION . encoding )
46
57
end
58
+
59
# Round-trips a tiny UTF-16 document: parse it, serialize it, then parse the serialized form
# again and check encoding, size, and content.
it "parses and serializes UTF-16 correctly" do
  markup = <<~XML
    <?xml version="1.0" encoding="UTF-16"?>
    <root><bar>A</bar></root>
  XML
  serialized = Nokogiri::XML(markup.encode(Encoding::UTF_16)).to_xml
  reparsed = Nokogiri::XML(serialized)

  # These sizes are descriptive, not prescriptive: the two implementations differ only in
  # whitespace, and the numbers may shift as implementations change. The point is to confirm
  # the output is _roughly_ the right length -- not zero, half-width, or double-width.
  anticipated_bytesize =
    if Nokogiri.jruby?
      132
    else
      142
    end

  assert_equal(Encoding::UTF_16, serialized.encoding)
  assert_equal("UTF-16", reparsed.encoding)
  assert_equal(anticipated_bytesize, serialized.bytesize)

  # Check the node exists first so a structural mismatch reports the serialized output
  # instead of failing with a NoMethodError on nil.
  text_node = reparsed.at_xpath("/root/bar/text()")
  assert(text_node, "unexpected DOM structure in #{serialized.inspect}")
  assert_equal("A", text_node.content)
end
48
80
49
81
# Regression test: serializing a UTF-16 document that is large enough to span more than one
# libxml2 buffer flush must still yield UTF-16 output of the expected size.
it "serializes UTF-16 correctly across libxml2 buffer flushes" do
  # https://github.com/sparklemotion/nokogiri/issues/752
  skip_unless_libxml2

  # `utf16_document` is the shared fixture string, already transcoded to UTF-16 via
  # String#encode. Because the input bytes are genuinely UTF-16 (rather than UTF-8 bytes
  # merely labelled as UTF-16), no manual "double the width and add the BOM" arithmetic is
  # needed to predict the serialized size.
  output = Nokogiri::XML(utf16_document).to_xml

  assert_equal(Encoding::UTF_16, output.encoding)
  # The serialized document is expected to occupy exactly as many bytes as the input.
  assert_equal(utf16_document.bytesize, output.bytesize)
end
67
90
end
68
91
end
0 commit comments