From a0c7fd9c12154f7cbccf7f7dd81fd811c9be1c9d Mon Sep 17 00:00:00 2001
From: Steve Cohen <scohen@scohen.org>
Date: Wed, 30 Nov 2022 20:21:11 -0800
Subject: [PATCH] Fixed off-by-one error that was vexing code unit conversions.

The problem was that the character positions are _before_ the reported
unit, so the 0th code unit is before the start of the line and the 1st
code unit is the first character. The prior code added one to
character counts to smooth this out, but you can't do that, because
you could end up indexing into the middle of a multibyte character.
---
 .../language_server/experimental/code_unit.ex |  8 +--
 .../experimental/source_file/conversions.ex   | 54 +++++++------------
 .../test/experimental/code_unit_test.exs      | 51 ++++++++++--------
 3 files changed, 53 insertions(+), 60 deletions(-)

diff --git a/apps/language_server/lib/language_server/experimental/code_unit.ex b/apps/language_server/lib/language_server/experimental/code_unit.ex
index 14ad0b822..96ab4a658 100644
--- a/apps/language_server/lib/language_server/experimental/code_unit.ex
+++ b/apps/language_server/lib/language_server/experimental/code_unit.ex
@@ -44,12 +44,12 @@ defmodule ElixirLS.LanguageServer.Experimental.CodeUnit do
 
   @spec to_utf8(String.t(), utf16_code_unit()) :: {:ok, utf8_code_unit()} | error
   def to_utf8(binary, utf16_unit) do
-    do_to_utf8(binary, utf16_unit + 1, 0)
+    do_to_utf8(binary, utf16_unit, 0)
   end
 
   @spec to_utf16(String.t(), utf8_code_unit()) :: {:ok, utf16_code_unit()} | error
   def to_utf16(binary, utf16_unit) do
-    do_to_utf16(binary, utf16_unit + 1, 0)
+    do_to_utf16(binary, utf16_unit, 0)
   end
 
   def count(:utf16, binary) do
@@ -98,7 +98,7 @@ defmodule ElixirLS.LanguageServer.Experimental.CodeUnit do
   end
 
   defp do_to_utf16(_, 0, utf16_unit) do
-    {:ok, utf16_unit - 1}
+    {:ok, utf16_unit}
   end
 
   defp do_to_utf16(_, utf8_unit, _) when utf8_unit < 0 do
@@ -152,7 +152,7 @@ defmodule ElixirLS.LanguageServer.Experimental.CodeUnit do
   end
 
   defp do_to_utf8(_, 0, utf8_unit) do
-    {:ok, utf8_unit - 1}
+    {:ok, utf8_unit}
   end
 
   defp do_to_utf8(_, utf_16_units, _) when utf_16_units < 0 do
diff --git a/apps/language_server/lib/language_server/experimental/source_file/conversions.ex b/apps/language_server/lib/language_server/experimental/source_file/conversions.ex
index 8a9f424e8..c019b74fb 100644
--- a/apps/language_server/lib/language_server/experimental/source_file/conversions.ex
+++ b/apps/language_server/lib/language_server/experimental/source_file/conversions.ex
@@ -59,10 +59,6 @@ defmodule ElixirLS.LanguageServer.Experimental.SourceFile.Conversions do
     position
   end
 
-  def to_elixir(%LSPosition{} = position, %SourceFile{} = source_file) do
-    to_elixir(position, source_file.document)
-  end
-
   def to_elixir(%LSPosition{} = position, %Document{} = document) do
     document_size = Document.size(document)
     # we need to handle out of bounds line numbers, because it's possible to build a document
@@ -85,17 +81,8 @@ defmodule ElixirLS.LanguageServer.Experimental.SourceFile.Conversions do
         {:ok, ElixirPosition.new(elixir_line_number, 0)}
 
       true ->
-        with {:ok, line} <- Document.fetch_line(document, elixir_line_number) do
-          elixir_character =
-            case line do
-              line(ascii?: true, text: text) ->
-                min(ls_character, byte_size(text))
-
-              line(text: text) ->
-                {:ok, utf16_text} = to_utf16(text)
-                lsp_character_to_elixir(utf16_text, ls_character)
-            end
-
+        with {:ok, line} <- Document.fetch_line(document, elixir_line_number),
+             {:ok, elixir_character} <- extract_elixir_character(position, line) do
           {:ok, ElixirPosition.new(elixir_line_number, elixir_character)}
         end
     end
@@ -127,20 +114,11 @@ defmodule ElixirLS.LanguageServer.Experimental.SourceFile.Conversions do
   end
 
   def to_lsp(%ElixirPosition{} = position, %Document{} = document) do
-    %ElixirPosition{character: elixir_character, line: elixir_line} = position
+    with {:ok, line} <- Document.fetch_line(document, position.line),
+         {:ok, lsp_character} <- extract_lsp_character(position, line) do
+      ls_pos =
+        LSPosition.new(character: lsp_character, line: position.line - @elixir_ls_index_base)
 
-    with {:ok, line} <- Document.fetch_line(document, elixir_line) do
-      lsp_character =
-        case line do
-          line(ascii?: true, text: text) ->
-            min(position.character, byte_size(text))
-
-          line(text: utf8_text) ->
-            {:ok, character} = elixir_character_to_lsp(utf8_text, elixir_character)
-            character
-        end
-
-      ls_pos = LSPosition.new(character: lsp_character, line: elixir_line - @elixir_ls_index_base)
       {:ok, ls_pos}
     end
   end
@@ -151,19 +129,27 @@ defmodule ElixirLS.LanguageServer.Experimental.SourceFile.Conversions do
 
   # Private
 
-  defp extract_lsp_character(%ElixirPosition{} = position, line(ascii?: true)) do
-    {:ok, position.character}
+  defp extract_lsp_character(%ElixirPosition{} = position, line(ascii?: true, text: text)) do
+    character = min(position.character, byte_size(text))
+    {:ok, character}
   end
 
   defp extract_lsp_character(%ElixirPosition{} = position, line(text: utf8_text)) do
-    {:ok, CodeUnit.utf16_offset(utf8_text, position.character)}
+    with {:ok, code_unit} <- CodeUnit.to_utf16(utf8_text, position.character) do
+      character = min(code_unit, CodeUnit.count(:utf16, utf8_text))
+      {:ok, character}
+    end
   end
 
-  defp extract_elixir_character(%LSPosition{} = position, line(ascii?: true)) do
-    {:ok, position.character}
+  defp extract_elixir_character(%LSPosition{} = position, line(ascii?: true, text: text)) do
+    character = min(position.character, byte_size(text))
+    {:ok, character}
   end
 
   defp extract_elixir_character(%LSPosition{} = position, line(text: utf8_text)) do
-    {:ok, CodeUnit.utf8_offset(utf8_text, position.character)}
+    with {:ok, code_unit} <- CodeUnit.to_utf8(utf8_text, position.character) do
+      character = min(code_unit, byte_size(utf8_text))
+      {:ok, character}
+    end
   end
 end
diff --git a/apps/language_server/test/experimental/code_unit_test.exs b/apps/language_server/test/experimental/code_unit_test.exs
index 69b5ac985..06b602065 100644
--- a/apps/language_server/test/experimental/code_unit_test.exs
+++ b/apps/language_server/test/experimental/code_unit_test.exs
@@ -64,6 +64,7 @@ defmodule ElixirLS.LanguageServer.Experimental.CodeUnitTest do
     end
 
     test "handles multi-byte characters properly" do
+      # guitar is 2 code units in utf16 but 4 in utf8
       line = "b🎸abc"
       assert 0 == utf16_offset(line, 0)
       assert 1 == utf16_offset(line, 1)
@@ -77,19 +78,21 @@ defmodule ElixirLS.LanguageServer.Experimental.CodeUnitTest do
 
   describe "converting to utf8" do
     test "bounds are respected" do
-      assert {:error, :out_of_bounds} = to_utf16("h", 1)
+      assert {:error, :out_of_bounds} = to_utf16("h", 2)
     end
 
     test "with a multi-byte character" do
       line = "🏳️‍🌈"
+
       code_unit_count = count_utf8_code_units(line)
 
-      assert to_utf8(line, 0) == {:error, :misaligned}
-      assert to_utf8(line, 1) == {:ok, 3}
-      assert to_utf8(line, 2) == {:ok, 6}
-      assert to_utf8(line, 3) == {:ok, 9}
-      assert to_utf8(line, 4) == {:error, :misaligned}
-      assert to_utf8(line, 5) == {:ok, code_unit_count - 1}
+      assert to_utf8(line, 0) == {:ok, 0}
+      assert to_utf8(line, 1) == {:error, :misaligned}
+      assert to_utf8(line, 2) == {:ok, 4}
+      assert to_utf8(line, 3) == {:ok, 7}
+      assert to_utf8(line, 4) == {:ok, 10}
+      assert to_utf8(line, 5) == {:error, :misaligned}
+      assert to_utf8(line, 6) == {:ok, code_unit_count}
     end
 
     test "after a unicode character" do
@@ -99,8 +102,8 @@ defmodule ElixirLS.LanguageServer.Experimental.CodeUnitTest do
       assert to_utf8(line, 1) == {:ok, 1}
       assert to_utf8(line, 4) == {:ok, 4}
       assert to_utf8(line, 5) == {:ok, 5}
-      assert to_utf8(line, 6) == {:error, :misaligned}
-      assert to_utf8(line, 7) == {:ok, 9}
+      assert to_utf8(line, 6) == {:ok, 6}
+      assert to_utf8(line, 7) == {:error, :misaligned}
       # after the guitar character
       assert to_utf8(line, 8) == {:ok, 10}
       assert to_utf8(line, 9) == {:ok, 11}
@@ -114,24 +117,27 @@ defmodule ElixirLS.LanguageServer.Experimental.CodeUnitTest do
 
   describe "converting to utf16" do
     test "respects bounds" do
-      assert {:error, :out_of_bounds} = to_utf16("h", 1)
+      assert {:error, :out_of_bounds} = to_utf16("h", 2)
     end
 
     test "with a multi-byte character" do
       line = "🏳️‍🌈"
+
       code_unit_count = count_utf16_code_units(line)
       utf8_code_unit_count = count_utf8_code_units(line)
 
-      assert to_utf16(line, 0) == {:error, :misaligned}
+      assert to_utf16(line, 0) == {:ok, 0}
       assert to_utf16(line, 1) == {:error, :misaligned}
       assert to_utf16(line, 2) == {:error, :misaligned}
-      assert to_utf16(line, 3) == {:ok, 1}
-      assert to_utf16(line, 4) == {:error, :misaligned}
-      assert to_utf16(line, utf8_code_unit_count - 1) == {:ok, code_unit_count - 1}
+      assert to_utf16(line, 3) == {:error, :misaligned}
+      assert to_utf16(line, 4) == {:ok, 2}
+      assert to_utf16(line, utf8_code_unit_count - 1) == {:error, :misaligned}
+      assert to_utf16(line, utf8_code_unit_count) == {:ok, code_unit_count}
     end
 
     test "after a multi-byte character" do
       line = "    {\"🎸\",   \"ok\"}"
+
       utf16_code_unit_count = count_utf16_code_units(line)
       utf8_code_unit_count = count_utf8_code_units(line)
 
@@ -140,11 +146,12 @@ defmodule ElixirLS.LanguageServer.Experimental.CodeUnitTest do
         assert to_utf16(line, index) == {:ok, index}
       end
 
-      assert to_utf16(line, 6) == {:error, :misaligned}
+      assert to_utf16(line, 6) == {:ok, 6}
       assert to_utf16(line, 7) == {:error, :misaligned}
       assert to_utf16(line, 8) == {:error, :misaligned}
+      assert to_utf16(line, 9) == {:error, :misaligned}
 
-      for index <- 9..17 do
+      for index <- 10..19 do
         assert to_utf16(line, index) == {:ok, index - 2}
       end
 
@@ -157,11 +164,11 @@ defmodule ElixirLS.LanguageServer.Experimental.CodeUnitTest do
       utf8_code_unit_count = count_utf8_code_units(s)
       utf16_unit_count = count_utf16_code_units(s)
 
-      assert {:ok, utf16_unit} = to_utf16(s, utf8_code_unit_count - 1)
-      assert utf16_unit == utf16_unit_count - 1
+      assert {:ok, utf16_unit} = to_utf16(s, utf8_code_unit_count)
+      assert utf16_unit == utf16_unit_count
 
       assert {:ok, utf8_unit} = to_utf8(s, utf16_unit)
-      assert utf8_unit == utf8_code_unit_count - 1
+      assert utf8_unit == utf8_code_unit_count
     end
   end
 
@@ -170,11 +177,11 @@ defmodule ElixirLS.LanguageServer.Experimental.CodeUnitTest do
       utf16_code_unit_count = count_utf16_code_units(s)
       utf8_code_unit_count = count_utf8_code_units(s)
 
-      assert {:ok, utf8_code_unit} = to_utf8(s, utf16_code_unit_count - 1)
-      assert utf8_code_unit == utf8_code_unit_count - 1
+      assert {:ok, utf8_code_unit} = to_utf8(s, utf16_code_unit_count)
+      assert utf8_code_unit == utf8_code_unit_count
 
       assert {:ok, utf16_unit} = to_utf16(s, utf8_code_unit)
-      assert utf16_unit == utf16_code_unit_count - 1
+      assert utf16_unit == utf16_code_unit_count
     end
   end