Attempt to assume binary is UTF-8

ruby · Oct 9, 2024 · 343197e · 343197e
1 parent 6f77baa
commit 343197e
Show file tree

Hide file tree

Showing 3 changed files with 61 additions and 12 deletions.
diff --git a/lib/prism/parse_result.rb b/lib/prism/parse_result.rb
@@ -12,6 +12,21 @@ class Source
     def self.for(source, start_line = 1, offsets = [])
       if source.ascii_only?
         ASCIISource.new(source, start_line, offsets)
+      elsif source.encoding == Encoding::BINARY
+        source.force_encoding(Encoding::UTF_8)
+
+        if source.valid_encoding?
+          new(source, start_line, offsets)
+        else
+          # This is an extremely niche use case where the file is marked as
+          # binary, contains multi-byte characters, and those characters are not
+          # valid UTF-8. In this case we'll mark it as binary and fall back to
+          # treating everything as a single-byte character. This _may_ cause
+          # problems when asking for code units, but it appears to be the
+          # cleanest solution at the moment.
+          source.force_encoding(Encoding::BINARY)
+          ASCIISource.new(source, start_line, offsets)
+        end
       else
         new(source, start_line, offsets)
       end
@@ -89,6 +104,12 @@ def character_column(byte_offset)
     # This method is tested with UTF-8, UTF-16, and UTF-32. If there is the
     # concept of code units that differs from the number of characters in other
     # encodings, it is not captured here.
+    #
+    # We purposefully replace invalid and undefined characters with replacement
+    # characters in this conversion. This happens for two reasons. First, it's
+    # possible that the given byte offset will not occur on a character
+    # boundary. Second, it's possible that the source code will contain a
+    # character that has no equivalent in the given encoding.
     def code_units_offset(byte_offset, encoding)
       byteslice = (source.byteslice(0, byte_offset) or raise).encode(encoding, invalid: :replace, undef: :replace)
 
@@ -130,8 +151,12 @@ def find_line(byte_offset)
 
   # Specialized version of Prism::Source for source code that includes ASCII
   # characters only. This class is used to apply performance optimizations that
-  # cannot be applied to sources that include multibyte characters. Sources that
-  # include multibyte characters are represented by the Prism::Source class.
+  # cannot be applied to sources that include multibyte characters.
+  #
+  # In the extremely rare case that a source includes multi-byte characters but
+  # is marked as binary because of a magic encoding comment and it cannot be
+  # eagerly converted to UTF-8, this class will be used as well. This is because
+  # at that point we will treat everything as single-byte characters.
   class ASCIISource < Source
     # Return the character offset for the given byte offset.
     def character_offset(byte_offset)

diff --git a/templates/lib/prism/serialize.rb.erb b/templates/lib/prism/serialize.rb.erb
@@ -20,10 +20,21 @@ module Prism
     def self.load(input, serialized)
       input = input.dup
       source = Source.for(input)
+
       loader = Loader.new(source, serialized)
       result = loader.load_result
 
       input.force_encoding(loader.encoding)
+
+      # This is an extremely niche use-case where the file was marked as binary
+      # but it contained UTF-8-encoded characters. In that case we will actually
+      # put it back to UTF-8 to give the location APIs the best chance of being
+      # correct.
+      if !input.ascii_only? && input.encoding == Encoding::BINARY
+        input.force_encoding(Encoding::UTF_8)
+        input.force_encoding(Encoding::BINARY) unless input.valid_encoding?
+      end
+
       result
     end
 

diff --git a/test/prism/ruby/location_test.rb b/test/prism/ruby/location_test.rb
@@ -140,23 +140,36 @@ def test_code_units
       assert_equal 7, location.end_code_units_column(Encoding::UTF_32LE)
     end
 
-    def test_code_units_handles_binary_encoding_with_multibyte_characters
-      # If the encoding is set to binary and the source contains multibyte
-      # characters, we avoid breaking the code unit offsets, but they will
-      # still be incorrect.
-
+    def test_code_units_binary_valid_utf8
       program = Prism.parse(<<~RUBY).value
         # -*- encoding: binary -*-
 
         😀 + 😀
       RUBY
 
-      # first 😀
-      location = program.statements.body.first.receiver.location
+      receiver = program.statements.body.first.receiver
+      assert_equal "😀".b.to_sym, receiver.name
+
+      location = receiver.location
+      assert_equal 1, location.end_code_units_column(Encoding::UTF_8)
+      assert_equal 2, location.end_code_units_column(Encoding::UTF_16LE)
+      assert_equal 1, location.end_code_units_column(Encoding::UTF_32LE)
+    end
 
-      assert_equal 4, location.end_code_units_column(Encoding::UTF_8)
-      assert_equal 4, location.end_code_units_column(Encoding::UTF_16LE)
-      assert_equal 4, location.end_code_units_column(Encoding::UTF_32LE)
+    def test_code_units_binary_invalid_utf8
+      program = Prism.parse(<<~RUBY).value
+        # -*- encoding: binary -*-
+
+        \x90 + \x90
+      RUBY
+
+      receiver = program.statements.body.first.receiver
+      assert_equal "\x90".b.to_sym, receiver.name
+
+      location = receiver.location
+      assert_equal 1, location.end_code_units_column(Encoding::UTF_8)
+      assert_equal 1, location.end_code_units_column(Encoding::UTF_16LE)
+      assert_equal 1, location.end_code_units_column(Encoding::UTF_32LE)
     end
 
     def test_chop