From a0c7fd9c12154f7cbccf7f7dd81fd811c9be1c9d Mon Sep 17 00:00:00 2001 From: Steve Cohen Date: Wed, 30 Nov 2022 20:21:11 -0800 Subject: [PATCH] Fixed off-by-one error that was vexing code unit conversions. The problem was that the character positions are _before_ the reported unit, so the 0th code unit is before the start of the line and the 1st code unit is the first character. The prior code added one to character counts to smooth this out, but you can't do that, because you could end up indexing into the middle of a multibyte character. --- .../language_server/experimental/code_unit.ex | 8 +-- .../experimental/source_file/conversions.ex | 54 +++++++------------ .../test/experimental/code_unit_test.exs | 51 ++++++++++-------- 3 files changed, 53 insertions(+), 60 deletions(-) diff --git a/apps/language_server/lib/language_server/experimental/code_unit.ex b/apps/language_server/lib/language_server/experimental/code_unit.ex index 14ad0b822..96ab4a658 100644 --- a/apps/language_server/lib/language_server/experimental/code_unit.ex +++ b/apps/language_server/lib/language_server/experimental/code_unit.ex @@ -44,12 +44,12 @@ defmodule ElixirLS.LanguageServer.Experimental.CodeUnit do @spec to_utf8(String.t(), utf16_code_unit()) :: {:ok, utf8_code_unit()} | error def to_utf8(binary, utf16_unit) do - do_to_utf8(binary, utf16_unit + 1, 0) + do_to_utf8(binary, utf16_unit, 0) end @spec to_utf16(String.t(), utf8_code_unit()) :: {:ok, utf16_code_unit()} | error def to_utf16(binary, utf16_unit) do - do_to_utf16(binary, utf16_unit + 1, 0) + do_to_utf16(binary, utf16_unit, 0) end def count(:utf16, binary) do @@ -98,7 +98,7 @@ defmodule ElixirLS.LanguageServer.Experimental.CodeUnit do end defp do_to_utf16(_, 0, utf16_unit) do - {:ok, utf16_unit - 1} + {:ok, utf16_unit} end defp do_to_utf16(_, utf8_unit, _) when utf8_unit < 0 do @@ -152,7 +152,7 @@ defmodule ElixirLS.LanguageServer.Experimental.CodeUnit do end defp do_to_utf8(_, 0, utf8_unit) do - {:ok, utf8_unit - 1} + {:ok, utf8_unit} end defp do_to_utf8(_, utf_16_units, _) when utf_16_units < 0 do diff --git a/apps/language_server/lib/language_server/experimental/source_file/conversions.ex b/apps/language_server/lib/language_server/experimental/source_file/conversions.ex index 8a9f424e8..c019b74fb 100644 --- a/apps/language_server/lib/language_server/experimental/source_file/conversions.ex +++ b/apps/language_server/lib/language_server/experimental/source_file/conversions.ex @@ -59,10 +59,6 @@ defmodule ElixirLS.LanguageServer.Experimental.SourceFile.Conversions do position end - def to_elixir(%LSPosition{} = position, %SourceFile{} = source_file) do - to_elixir(position, source_file.document) - end - def to_elixir(%LSPosition{} = position, %Document{} = document) do document_size = Document.size(document) # we need to handle out of bounds line numbers, because it's possible to build a document @@ -85,17 +81,8 @@ defmodule ElixirLS.LanguageServer.Experimental.SourceFile.Conversions do {:ok, ElixirPosition.new(elixir_line_number, 0)} true -> - with {:ok, line} <- Document.fetch_line(document, elixir_line_number) do - elixir_character = - case line do - line(ascii?: true, text: text) -> - min(ls_character, byte_size(text)) - - line(text: text) -> - {:ok, utf16_text} = to_utf16(text) - lsp_character_to_elixir(utf16_text, ls_character) - end - + with {:ok, line} <- Document.fetch_line(document, elixir_line_number), + {:ok, elixir_character} <- extract_elixir_character(position, line) do {:ok, ElixirPosition.new(elixir_line_number, elixir_character)} end end @@ -127,20 +114,11 @@ defmodule ElixirLS.LanguageServer.Experimental.SourceFile.Conversions do end def to_lsp(%ElixirPosition{} = position, %Document{} = document) do - %ElixirPosition{character: elixir_character, line: elixir_line} = position + with {:ok, line} <- Document.fetch_line(document, position.line), + {:ok, lsp_character} <- extract_lsp_character(position, line) do + ls_pos = + LSPosition.new(character: lsp_character, line: position.line - @elixir_ls_index_base) - with {:ok, line} <- Document.fetch_line(document, elixir_line) do - lsp_character = - case line do - line(ascii?: true, text: text) -> - min(position.character, byte_size(text)) - - line(text: utf8_text) -> - {:ok, character} = elixir_character_to_lsp(utf8_text, elixir_character) - character - end - - ls_pos = LSPosition.new(character: lsp_character, line: elixir_line - @elixir_ls_index_base) {:ok, ls_pos} end end @@ -151,19 +129,27 @@ defmodule ElixirLS.LanguageServer.Experimental.SourceFile.Conversions do # Private - defp extract_lsp_character(%ElixirPosition{} = position, line(ascii?: true)) do - {:ok, position.character} + defp extract_lsp_character(%ElixirPosition{} = position, line(ascii?: true, text: text)) do + character = min(position.character, byte_size(text)) + {:ok, character} end defp extract_lsp_character(%ElixirPosition{} = position, line(text: utf8_text)) do - {:ok, CodeUnit.utf16_offset(utf8_text, position.character)} + with {:ok, code_unit} <- CodeUnit.to_utf16(utf8_text, position.character) do + character = min(code_unit, CodeUnit.count(:utf16, utf8_text)) + {:ok, character} + end end - defp extract_elixir_character(%LSPosition{} = position, line(ascii?: true)) do - {:ok, position.character} + defp extract_elixir_character(%LSPosition{} = position, line(ascii?: true, text: text)) do + character = min(position.character, byte_size(text)) + {:ok, character} end defp extract_elixir_character(%LSPosition{} = position, line(text: utf8_text)) do - {:ok, CodeUnit.utf8_offset(utf8_text, position.character)} + with {:ok, code_unit} <- CodeUnit.to_utf8(utf8_text, position.character) do + character = min(code_unit, byte_size(utf8_text)) + {:ok, character} + end end end diff --git a/apps/language_server/test/experimental/code_unit_test.exs b/apps/language_server/test/experimental/code_unit_test.exs index 69b5ac985..06b602065 100644 --- a/apps/language_server/test/experimental/code_unit_test.exs +++ b/apps/language_server/test/experimental/code_unit_test.exs @@ -64,6 +64,7 @@ defmodule ElixirLS.LanguageServer.Experimental.CodeUnitTest do end test "handles multi-byte characters properly" do + # guitar is 2 code units in utf16 but 4 in utf8 line = "b🎸abc" assert 0 == utf16_offset(line, 0) assert 1 == utf16_offset(line, 1) @@ -77,19 +78,21 @@ defmodule ElixirLS.LanguageServer.Experimental.CodeUnitTest do describe "converting to utf8" do test "bounds are respected" do - assert {:error, :out_of_bounds} = to_utf16("h", 1) + assert {:error, :out_of_bounds} = to_utf16("h", 2) end test "with a multi-byte character" do line = "🏳️‍🌈" + code_unit_count = count_utf8_code_units(line) - assert to_utf8(line, 0) == {:error, :misaligned} - assert to_utf8(line, 1) == {:ok, 3} - assert to_utf8(line, 2) == {:ok, 6} - assert to_utf8(line, 3) == {:ok, 9} - assert to_utf8(line, 4) == {:error, :misaligned} - assert to_utf8(line, 5) == {:ok, code_unit_count - 1} + assert to_utf8(line, 0) == {:ok, 0} + assert to_utf8(line, 1) == {:error, :misaligned} + assert to_utf8(line, 2) == {:ok, 4} + assert to_utf8(line, 3) == {:ok, 7} + assert to_utf8(line, 4) == {:ok, 10} + assert to_utf8(line, 5) == {:error, :misaligned} + assert to_utf8(line, 6) == {:ok, code_unit_count} end test "after a unicode character" do @@ -99,8 +102,8 @@ defmodule ElixirLS.LanguageServer.Experimental.CodeUnitTest do assert to_utf8(line, 1) == {:ok, 1} assert to_utf8(line, 4) == {:ok, 4} assert to_utf8(line, 5) == {:ok, 5} - assert to_utf8(line, 6) == {:error, :misaligned} - assert to_utf8(line, 7) == {:ok, 9} + assert to_utf8(line, 6) == {:ok, 6} + assert to_utf8(line, 7) == {:error, :misaligned} # after the guitar character assert to_utf8(line, 8) == {:ok, 10} assert to_utf8(line, 9) == {:ok, 11} @@ -114,24 +117,27 @@ defmodule ElixirLS.LanguageServer.Experimental.CodeUnitTest do describe "converting to utf16" do test "respects bounds" do - assert {:error, :out_of_bounds} = to_utf16("h", 1) + assert {:error, :out_of_bounds} = to_utf16("h", 2) end test "with a multi-byte character" do line = "🏳️‍🌈" + code_unit_count = count_utf16_code_units(line) utf8_code_unit_count = count_utf8_code_units(line) - assert to_utf16(line, 0) == {:error, :misaligned} + assert to_utf16(line, 0) == {:ok, 0} assert to_utf16(line, 1) == {:error, :misaligned} assert to_utf16(line, 2) == {:error, :misaligned} - assert to_utf16(line, 3) == {:ok, 1} - assert to_utf16(line, 4) == {:error, :misaligned} - assert to_utf16(line, utf8_code_unit_count - 1) == {:ok, code_unit_count - 1} + assert to_utf16(line, 3) == {:error, :misaligned} + assert to_utf16(line, 4) == {:ok, 2} + assert to_utf16(line, utf8_code_unit_count - 1) == {:error, :misaligned} + assert to_utf16(line, utf8_code_unit_count) == {:ok, code_unit_count} end test "after a multi-byte character" do line = " {\"🎸\", \"ok\"}" + utf16_code_unit_count = count_utf16_code_units(line) utf8_code_unit_count = count_utf8_code_units(line) @@ -140,11 +146,12 @@ defmodule ElixirLS.LanguageServer.Experimental.CodeUnitTest do assert to_utf16(line, index) == {:ok, index} end - assert to_utf16(line, 6) == {:error, :misaligned} + assert to_utf16(line, 6) == {:ok, 6} assert to_utf16(line, 7) == {:error, :misaligned} assert to_utf16(line, 8) == {:error, :misaligned} + assert to_utf16(line, 9) == {:error, :misaligned} - for index <- 9..17 do + for index <- 10..19 do assert to_utf16(line, index) == {:ok, index - 2} end @@ -157,11 +164,11 @@ defmodule ElixirLS.LanguageServer.Experimental.CodeUnitTest do utf8_code_unit_count = count_utf8_code_units(s) utf16_unit_count = count_utf16_code_units(s) - assert {:ok, utf16_unit} = to_utf16(s, utf8_code_unit_count - 1) - assert utf16_unit == utf16_unit_count - 1 + assert {:ok, utf16_unit} = to_utf16(s, utf8_code_unit_count) + assert utf16_unit == utf16_unit_count assert {:ok, utf8_unit} = to_utf8(s, utf16_unit) - assert utf8_unit == utf8_code_unit_count - 1 + assert utf8_unit == utf8_code_unit_count end end @@ -170,11 +177,11 @@ defmodule ElixirLS.LanguageServer.Experimental.CodeUnitTest do utf16_code_unit_count = count_utf16_code_units(s) utf8_code_unit_count = count_utf8_code_units(s) - assert {:ok, utf8_code_unit} = to_utf8(s, utf16_code_unit_count - 1) - assert utf8_code_unit == utf8_code_unit_count - 1 + assert {:ok, utf8_code_unit} = to_utf8(s, utf16_code_unit_count) + assert utf8_code_unit == utf8_code_unit_count assert {:ok, utf16_unit} = to_utf16(s, utf8_code_unit) - assert utf16_unit == utf16_code_unit_count - 1 + assert utf16_unit == utf16_code_unit_count end end