Skip to content

Commit

Permalink
Attempt to assume binary is UTF-8
Browse files Browse the repository at this point in the history
  • Loading branch information
kddnewton committed Oct 9, 2024
1 parent 6f77baa commit 343197e
Show file tree
Hide file tree
Showing 3 changed files with 61 additions and 12 deletions.
29 changes: 27 additions & 2 deletions lib/prism/parse_result.rb
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,21 @@ class Source
def self.for(source, start_line = 1, offsets = [])
if source.ascii_only?
ASCIISource.new(source, start_line, offsets)
elsif source.encoding == Encoding::BINARY
source.force_encoding(Encoding::UTF_8)

if source.valid_encoding?
new(source, start_line, offsets)
else
# This is an extremely niche use case where the file is marked as
# binary, contains multi-byte characters, and those characters are not
# valid UTF-8. In this case we'll mark it as binary and fall back to
# treating everything as a single-byte character. This _may_ cause
# problems when asking for code units, but it appears to be the
# cleanest solution at the moment.
source.force_encoding(Encoding::BINARY)
ASCIISource.new(source, start_line, offsets)
end
else
new(source, start_line, offsets)
end
Expand Down Expand Up @@ -89,6 +104,12 @@ def character_column(byte_offset)
# This method is tested with UTF-8, UTF-16, and UTF-32. If there is the
# concept of code units that differs from the number of characters in other
# encodings, it is not captured here.
#
# We purposefully replace invalid and undefined characters with replacement
# characters in this conversion. This happens for two reasons. First, it's
# possible that the given byte offset will not occur on a character
# boundary. Second, it's possible that the source code will contain a
# character that has no equivalent in the given encoding.
def code_units_offset(byte_offset, encoding)
byteslice = (source.byteslice(0, byte_offset) or raise).encode(encoding, invalid: :replace, undef: :replace)

Expand Down Expand Up @@ -130,8 +151,12 @@ def find_line(byte_offset)

# Specialized version of Prism::Source for source code that includes ASCII
# characters only. This class is used to apply performance optimizations that
# cannot be applied to sources that include multibyte characters. Sources that
# include multibyte characters are represented by the Prism::Source class.
# cannot be applied to sources that include multibyte characters.
#
# In the extremely rare case that a source includes multi-byte characters but
# is marked as binary because of a magic encoding comment and it cannot be
# eagerly converted to UTF-8, this class will be used as well. This is because
# at that point we will treat everything as single-byte characters.
class ASCIISource < Source
# Return the character offset for the given byte offset.
def character_offset(byte_offset)
Expand Down
11 changes: 11 additions & 0 deletions templates/lib/prism/serialize.rb.erb
Original file line number Diff line number Diff line change
Expand Up @@ -20,10 +20,21 @@ module Prism
def self.load(input, serialized)
input = input.dup
source = Source.for(input)

loader = Loader.new(source, serialized)
result = loader.load_result

input.force_encoding(loader.encoding)

# This is an extremely niche use-case where the file was marked as binary
# but it contained UTF-8-encoded characters. In that case we will actually
# put it back to UTF-8 to give the location APIs the best chance of being
# correct.
if !input.ascii_only? && input.encoding == Encoding::BINARY
input.force_encoding(Encoding::UTF_8)
input.force_encoding(Encoding::BINARY) unless input.valid_encoding?
end

result
end

Expand Down
33 changes: 23 additions & 10 deletions test/prism/ruby/location_test.rb
Original file line number Diff line number Diff line change
Expand Up @@ -140,23 +140,36 @@ def test_code_units
assert_equal 7, location.end_code_units_column(Encoding::UTF_32LE)
end

def test_code_units_handles_binary_encoding_with_multibyte_characters
# If the encoding is set to binary and the source contains multibyte
# characters, we avoid breaking the code unit offsets, but they will
# still be incorrect.

def test_code_units_binary_valid_utf8
program = Prism.parse(<<~RUBY).value
# -*- encoding: binary -*-
πŸ˜€ + πŸ˜€
RUBY

# first πŸ˜€
location = program.statements.body.first.receiver.location
receiver = program.statements.body.first.receiver
assert_equal "πŸ˜€".b.to_sym, receiver.name

location = receiver.location
assert_equal 1, location.end_code_units_column(Encoding::UTF_8)
assert_equal 2, location.end_code_units_column(Encoding::UTF_16LE)
assert_equal 1, location.end_code_units_column(Encoding::UTF_32LE)
end

assert_equal 4, location.end_code_units_column(Encoding::UTF_8)
assert_equal 4, location.end_code_units_column(Encoding::UTF_16LE)
assert_equal 4, location.end_code_units_column(Encoding::UTF_32LE)
def test_code_units_binary_invalid_utf8
program = Prism.parse(<<~RUBY).value
# -*- encoding: binary -*-
\x90 + \x90
RUBY

receiver = program.statements.body.first.receiver
assert_equal "\x90".b.to_sym, receiver.name

location = receiver.location
assert_equal 1, location.end_code_units_column(Encoding::UTF_8)
assert_equal 1, location.end_code_units_column(Encoding::UTF_16LE)
assert_equal 1, location.end_code_units_column(Encoding::UTF_32LE)
end

def test_chop
Expand Down

0 comments on commit 343197e

Please sign in to comment.