From 343197e4ff07e6d45ab960ed55f1b73154229c91 Mon Sep 17 00:00:00 2001 From: Kevin Newton Date: Wed, 9 Oct 2024 10:34:42 -0400 Subject: [PATCH] Attempt to assume binary is UTF-8 --- lib/prism/parse_result.rb | 29 ++++++++++++++++++++++-- templates/lib/prism/serialize.rb.erb | 11 ++++++++++ test/prism/ruby/location_test.rb | 33 +++++++++++++++++++--------- 3 files changed, 61 insertions(+), 12 deletions(-) diff --git a/lib/prism/parse_result.rb b/lib/prism/parse_result.rb index aea5dee9fa..e3ba7e7c8e 100644 --- a/lib/prism/parse_result.rb +++ b/lib/prism/parse_result.rb @@ -12,6 +12,21 @@ class Source def self.for(source, start_line = 1, offsets = []) if source.ascii_only? ASCIISource.new(source, start_line, offsets) + elsif source.encoding == Encoding::BINARY + source.force_encoding(Encoding::UTF_8) + + if source.valid_encoding? + new(source, start_line, offsets) + else + # This is an extremely niche use case where the file is marked as + # binary, contains multi-byte characters, and those characters are not + # valid UTF-8. In this case we'll mark it as binary and fall back to + # treating everything as a single-byte character. This _may_ cause + # problems when asking for code units, but it appears to be the + # cleanest solution at the moment. + source.force_encoding(Encoding::BINARY) + ASCIISource.new(source, start_line, offsets) + end else new(source, start_line, offsets) end @@ -89,6 +104,12 @@ def character_column(byte_offset) # This method is tested with UTF-8, UTF-16, and UTF-32. If there is the # concept of code units that differs from the number of characters in other # encodings, it is not captured here. + # + # We purposefully replace invalid and undefined characters with replacement + # characters in this conversion. This happens for two reasons. First, it's + # possible that the given byte offset will not occur on a character + # boundary. Second, it's possible that the source code will contain a + # character that has no equivalent in the given encoding. def code_units_offset(byte_offset, encoding) byteslice = (source.byteslice(0, byte_offset) or raise).encode(encoding, invalid: :replace, undef: :replace) @@ -130,8 +151,12 @@ def find_line(byte_offset) # Specialized version of Prism::Source for source code that includes ASCII # characters only. This class is used to apply performance optimizations that - # cannot be applied to sources that include multibyte characters. Sources that - # include multibyte characters are represented by the Prism::Source class. + # cannot be applied to sources that include multibyte characters. + # + # In the extremely rare case that a source includes multi-byte characters but + # is marked as binary because of a magic encoding comment and it cannot be + # eagerly converted to UTF-8, this class will be used as well. This is because + # at that point we will treat everything as single-byte characters. class ASCIISource < Source # Return the character offset for the given byte offset. def character_offset(byte_offset) diff --git a/templates/lib/prism/serialize.rb.erb b/templates/lib/prism/serialize.rb.erb index 9306ff7ee3..448135efec 100644 --- a/templates/lib/prism/serialize.rb.erb +++ b/templates/lib/prism/serialize.rb.erb @@ -20,10 +20,21 @@ module Prism def self.load(input, serialized) input = input.dup source = Source.for(input) + loader = Loader.new(source, serialized) result = loader.load_result input.force_encoding(loader.encoding) + + # This is an extremely niche use-case where the file was marked as binary + # but it contained UTF-8-encoded characters. In that case we will actually + # put it back to UTF-8 to give the location APIs the best chance of being + # correct. + if !input.ascii_only? && input.encoding == Encoding::BINARY + input.force_encoding(Encoding::UTF_8) + input.force_encoding(Encoding::BINARY) unless input.valid_encoding? + end + result end diff --git a/test/prism/ruby/location_test.rb b/test/prism/ruby/location_test.rb index e360a0db72..3d3e7dd562 100644 --- a/test/prism/ruby/location_test.rb +++ b/test/prism/ruby/location_test.rb @@ -140,23 +140,36 @@ def test_code_units assert_equal 7, location.end_code_units_column(Encoding::UTF_32LE) end - def test_code_units_handles_binary_encoding_with_multibyte_characters - # If the encoding is set to binary and the source contains multibyte - # characters, we avoid breaking the code unit offsets, but they will - # still be incorrect. - + def test_code_units_binary_valid_utf8 program = Prism.parse(<<~RUBY).value # -*- encoding: binary -*- 😀 + 😀 RUBY - # first 😀 - location = program.statements.body.first.receiver.location + receiver = program.statements.body.first.receiver + assert_equal "😀".b.to_sym, receiver.name + + location = receiver.location + assert_equal 1, location.end_code_units_column(Encoding::UTF_8) + assert_equal 2, location.end_code_units_column(Encoding::UTF_16LE) + assert_equal 1, location.end_code_units_column(Encoding::UTF_32LE) + end - assert_equal 4, location.end_code_units_column(Encoding::UTF_8) - assert_equal 4, location.end_code_units_column(Encoding::UTF_16LE) - assert_equal 4, location.end_code_units_column(Encoding::UTF_32LE) + def test_code_units_binary_invalid_utf8 + program = Prism.parse(<<~RUBY).value + # -*- encoding: binary -*- + + \x90 + \x90 + RUBY + + receiver = program.statements.body.first.receiver + assert_equal "\x90".b.to_sym, receiver.name + + location = receiver.location + assert_equal 1, location.end_code_units_column(Encoding::UTF_8) + assert_equal 1, location.end_code_units_column(Encoding::UTF_16LE) + assert_equal 1, location.end_code_units_column(Encoding::UTF_32LE) end def test_chop