Skip to content

Commit

Permalink
#428 Improve support for reading UTF-16 encoded source files
Browse files Browse the repository at this point in the history
  • Loading branch information
pepijnve committed Dec 26, 2023
1 parent a7f716f commit 733f41c
Show file tree
Hide file tree
Showing 3 changed files with 51 additions and 28 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.adoc
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@

Bugfixes::

* Issue #428: Improve support for reading UTF-16 encoded source files
* Issue #449: Fix resolving of relative `!include` paths in Structurizr DSL input.

== 2.2.14
Expand Down
43 changes: 18 additions & 25 deletions lib/asciidoctor-diagram/diagram_source.rb
Original file line number Diff line number Diff line change
Expand Up @@ -305,8 +305,7 @@ def should_process?(image_file, image_metadata)

def load_code
if @file_name
lines = File.readlines(@file_name)
lines = prepare_source_array(lines)
lines = prepare_source_array(File.read(@file_name, :mode => 'rb'))
@parent_block.apply_subs(lines, resolve_diagram_subs).join("\n")
else
''
Expand All @@ -315,39 +314,33 @@ def load_code

private

# Byte arrays for UTF-* Byte Order Marks
BOM_BYTES_UTF_8 = [0xef, 0xbb, 0xbf]
BOM_BYTES_UTF_16LE = [0xff, 0xfe]
BOM_BYTES_UTF_16BE = [0xfe, 0xff]
# Raw binary strings for UTF-* Byte Order Marks
BOM_BYTES_UTF_8 = String.new("\xef\xbb\xbf", :encoding => Encoding::ASCII_8BIT)
BOM_BYTES_UTF_16LE = String.new("\xff\xfe", :encoding => Encoding::ASCII_8BIT)
BOM_BYTES_UTF_16BE = String.new("\xfe\xff", :encoding => Encoding::ASCII_8BIT)

# Prepare the source data Array for parsing.
# Prepare the source data for parsing.
#
# Encodes the data to UTF-8, if necessary, and removes any trailing
# Encodes the data to UTF-8 and removes any trailing
# whitespace from every line.
#
# If a BOM is found at the beginning of the data, a best attempt is made to
# encode it to UTF-8 from the specified source encoding.
#
# data - the source data Array to prepare (no nil entries allowed)
# data - the source data to prepare
#
# returns a String Array of prepared lines
def prepare_source_array data
return [] if data.empty?
if (leading_2_bytes = (leading_bytes = (first = data[0]).unpack 'C3').slice 0, 2) == BOM_BYTES_UTF_16LE
data[0] = first.byteslice 2, first.bytesize
# NOTE you can't split a UTF-16LE string using .lines when encoding is UTF-8; doing so will cause this line to fail
return data.map {|line| (line.encode ::Encoding::UTF_8, ::Encoding::UTF_16LE).rstrip}
elsif leading_2_bytes == BOM_BYTES_UTF_16BE
data[0] = first.byteslice 2, first.bytesize
return data.map {|line| (line.encode ::Encoding::UTF_8, ::Encoding::UTF_16BE).rstrip}
elsif leading_bytes == BOM_BYTES_UTF_8
data[0] = first.byteslice 3, first.bytesize
end
if first.encoding == ::Encoding::UTF_8
data.map {|line| line.rstrip}

if data.start_with?(BOM_BYTES_UTF_16LE)
utf8_data = data.byteslice(2, data.bytesize).encode(::Encoding::UTF_8, ::Encoding::UTF_16LE)
elsif data.start_with?(BOM_BYTES_UTF_16BE)
utf8_data = data.byteslice(2, data.bytesize).encode(::Encoding::UTF_8, ::Encoding::UTF_16BE)
elsif data.start_with?(BOM_BYTES_UTF_8)
utf8_data = data.byteslice(3, data.bytesize).encode(::Encoding::UTF_8)
else
data.map {|line| (line.encode ::Encoding::UTF_8).rstrip}
utf8_data = data.encode(::Encoding::UTF_8)
end

utf8_data.lines.map {|line| line.rstrip}
end
end
end
Expand Down
35 changes: 32 additions & 3 deletions spec/shared_examples.rb
Original file line number Diff line number Diff line change
Expand Up @@ -69,7 +69,7 @@
== First Section
#{name}::#{name}.txt[#{format}]
eos
eos

d = load_asciidoc doc, :safe => 'server', :attributes => {'backend' => 'html5'}
expect(d).to_not be_nil
Expand Down Expand Up @@ -217,6 +217,35 @@
expect(File.exist?("test2/foobaz.#{formats[0]}")).to be true
expect(File.exist?("#{name}.#{formats[0]}")).to be false
end

{
::Encoding::UTF_16LE => String.new("\xff\xfe", :encoding => Encoding::ASCII_8BIT),
::Encoding::UTF_16BE => String.new("\xfe\xff", :encoding => Encoding::ASCII_8BIT),
::Encoding::UTF_8 => String.new("\xef\xbb\xbf", :encoding => Encoding::ASCII_8BIT)
}.each_pair do |encoding, bom|
it "should support #{encoding.name} encoded source files" do
File.open("#{name}.txt", 'wb') do |f|
f.write bom
f.write code.encode(encoding).b
end

doc = <<-eos
= Hello, #{name}!
Doc Writer <doc@example.com>
== First Section
.This is a diagram
#{name}::#{name}.txt[]
eos

d = load_asciidoc doc
expect(d).to_not be_nil

b = d.find { |bl| bl.context == :image }
expect(b).to_not be_nil
end
end
end

RSpec.shared_examples "inline_macro" do |name, code, formats|
Expand Down Expand Up @@ -269,7 +298,7 @@
== First Section
#{name}::#{name}.txt[#{format}]
eos
eos

d = load_asciidoc doc, :safe => 'server', :attributes => {'backend' => 'html5'}
expect(d).to_not be_nil
Expand Down Expand Up @@ -473,7 +502,7 @@
== First Section
#{name}::#{name}.txt[#{format}]
eos
eos

d = load_asciidoc doc, :safe => 'server', :attributes => {'backend' => 'html5'}
expect(d).to_not be_nil
Expand Down

0 comments on commit 733f41c

Please sign in to comment.