Skip to content

Commit

Permalink
Fixed off-by-one error that was vexing code unit conversions.
Browse files Browse the repository at this point in the history
The problem was that the character positions are _before_ the reported
unit, so the 0th code unit is before the start of the line and the 1st
code unit is the first character. The prior code added one to
character counts to smooth this out, but you can't do that, because
you could end up indexing into the middle of a multibyte character.
  • Loading branch information
scohen committed Dec 1, 2022
1 parent 36f8556 commit a0c7fd9
Show file tree
Hide file tree
Showing 3 changed files with 53 additions and 60 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -44,12 +44,12 @@ defmodule ElixirLS.LanguageServer.Experimental.CodeUnit do

@spec to_utf8(String.t(), utf16_code_unit()) :: {:ok, utf8_code_unit()} | error
def to_utf8(binary, utf16_unit) do
do_to_utf8(binary, utf16_unit + 1, 0)
do_to_utf8(binary, utf16_unit, 0)
end

@spec to_utf16(String.t(), utf8_code_unit()) :: {:ok, utf16_code_unit()} | error
def to_utf16(binary, utf16_unit) do
do_to_utf16(binary, utf16_unit + 1, 0)
do_to_utf16(binary, utf16_unit, 0)
end

def count(:utf16, binary) do
Expand Down Expand Up @@ -98,7 +98,7 @@ defmodule ElixirLS.LanguageServer.Experimental.CodeUnit do
end

defp do_to_utf16(_, 0, utf16_unit) do
{:ok, utf16_unit - 1}
{:ok, utf16_unit}
end

defp do_to_utf16(_, utf8_unit, _) when utf8_unit < 0 do
Expand Down Expand Up @@ -152,7 +152,7 @@ defmodule ElixirLS.LanguageServer.Experimental.CodeUnit do
end

defp do_to_utf8(_, 0, utf8_unit) do
{:ok, utf8_unit - 1}
{:ok, utf8_unit}
end

defp do_to_utf8(_, utf_16_units, _) when utf_16_units < 0 do
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -59,10 +59,6 @@ defmodule ElixirLS.LanguageServer.Experimental.SourceFile.Conversions do
position
end

def to_elixir(%LSPosition{} = position, %SourceFile{} = source_file) do
to_elixir(position, source_file.document)
end

def to_elixir(%LSPosition{} = position, %Document{} = document) do
document_size = Document.size(document)
# we need to handle out of bounds line numbers, because it's possible to build a document
Expand All @@ -85,17 +81,8 @@ defmodule ElixirLS.LanguageServer.Experimental.SourceFile.Conversions do
{:ok, ElixirPosition.new(elixir_line_number, 0)}

true ->
with {:ok, line} <- Document.fetch_line(document, elixir_line_number) do
elixir_character =
case line do
line(ascii?: true, text: text) ->
min(ls_character, byte_size(text))

line(text: text) ->
{:ok, utf16_text} = to_utf16(text)
lsp_character_to_elixir(utf16_text, ls_character)
end

with {:ok, line} <- Document.fetch_line(document, elixir_line_number),
{:ok, elixir_character} <- extract_elixir_character(position, line) do
{:ok, ElixirPosition.new(elixir_line_number, elixir_character)}
end
end
Expand Down Expand Up @@ -127,20 +114,11 @@ defmodule ElixirLS.LanguageServer.Experimental.SourceFile.Conversions do
end

def to_lsp(%ElixirPosition{} = position, %Document{} = document) do
%ElixirPosition{character: elixir_character, line: elixir_line} = position
with {:ok, line} <- Document.fetch_line(document, position.line),
{:ok, lsp_character} <- extract_lsp_character(position, line) do
ls_pos =
LSPosition.new(character: lsp_character, line: position.line - @elixir_ls_index_base)

with {:ok, line} <- Document.fetch_line(document, elixir_line) do
lsp_character =
case line do
line(ascii?: true, text: text) ->
min(position.character, byte_size(text))

line(text: utf8_text) ->
{:ok, character} = elixir_character_to_lsp(utf8_text, elixir_character)
character
end

ls_pos = LSPosition.new(character: lsp_character, line: elixir_line - @elixir_ls_index_base)
{:ok, ls_pos}
end
end
Expand All @@ -151,19 +129,27 @@ defmodule ElixirLS.LanguageServer.Experimental.SourceFile.Conversions do

# Private

defp extract_lsp_character(%ElixirPosition{} = position, line(ascii?: true)) do
{:ok, position.character}
defp extract_lsp_character(%ElixirPosition{} = position, line(ascii?: true, text: text)) do
character = min(position.character, byte_size(text))
{:ok, character}
end

defp extract_lsp_character(%ElixirPosition{} = position, line(text: utf8_text)) do
{:ok, CodeUnit.utf16_offset(utf8_text, position.character)}
with {:ok, code_unit} <- CodeUnit.to_utf16(utf8_text, position.character) do
character = min(code_unit, CodeUnit.count(:utf16, utf8_text))
{:ok, character}
end
end

defp extract_elixir_character(%LSPosition{} = position, line(ascii?: true)) do
{:ok, position.character}
defp extract_elixir_character(%LSPosition{} = position, line(ascii?: true, text: text)) do
character = min(position.character, byte_size(text))
{:ok, character}
end

defp extract_elixir_character(%LSPosition{} = position, line(text: utf8_text)) do
{:ok, CodeUnit.utf8_offset(utf8_text, position.character)}
with {:ok, code_unit} <- CodeUnit.to_utf8(utf8_text, position.character) do
character = min(code_unit, byte_size(utf8_text))
{:ok, character}
end
end
end
51 changes: 29 additions & 22 deletions apps/language_server/test/experimental/code_unit_test.exs
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,7 @@ defmodule ElixirLS.LanguageServer.Experimental.CodeUnitTest do
end

test "handles multi-byte characters properly" do
# guitar is 2 code units in utf16 but 4 in utf8
line = "b🎸abc"
assert 0 == utf16_offset(line, 0)
assert 1 == utf16_offset(line, 1)
Expand All @@ -77,19 +78,21 @@ defmodule ElixirLS.LanguageServer.Experimental.CodeUnitTest do

describe "converting to utf8" do
test "bounds are respected" do
assert {:error, :out_of_bounds} = to_utf16("h", 1)
assert {:error, :out_of_bounds} = to_utf16("h", 2)
end

test "with a multi-byte character" do
line = "🏳️‍🌈"

code_unit_count = count_utf8_code_units(line)

assert to_utf8(line, 0) == {:error, :misaligned}
assert to_utf8(line, 1) == {:ok, 3}
assert to_utf8(line, 2) == {:ok, 6}
assert to_utf8(line, 3) == {:ok, 9}
assert to_utf8(line, 4) == {:error, :misaligned}
assert to_utf8(line, 5) == {:ok, code_unit_count - 1}
assert to_utf8(line, 0) == {:ok, 0}
assert to_utf8(line, 1) == {:error, :misaligned}
assert to_utf8(line, 2) == {:ok, 4}
assert to_utf8(line, 3) == {:ok, 7}
assert to_utf8(line, 4) == {:ok, 10}
assert to_utf8(line, 5) == {:error, :misaligned}
assert to_utf8(line, 6) == {:ok, code_unit_count}
end

test "after a unicode character" do
Expand All @@ -99,8 +102,8 @@ defmodule ElixirLS.LanguageServer.Experimental.CodeUnitTest do
assert to_utf8(line, 1) == {:ok, 1}
assert to_utf8(line, 4) == {:ok, 4}
assert to_utf8(line, 5) == {:ok, 5}
assert to_utf8(line, 6) == {:error, :misaligned}
assert to_utf8(line, 7) == {:ok, 9}
assert to_utf8(line, 6) == {:ok, 6}
assert to_utf8(line, 7) == {:error, :misaligned}
# after the guitar character
assert to_utf8(line, 8) == {:ok, 10}
assert to_utf8(line, 9) == {:ok, 11}
Expand All @@ -114,24 +117,27 @@ defmodule ElixirLS.LanguageServer.Experimental.CodeUnitTest do

describe "converting to utf16" do
test "respects bounds" do
assert {:error, :out_of_bounds} = to_utf16("h", 1)
assert {:error, :out_of_bounds} = to_utf16("h", 2)
end

test "with a multi-byte character" do
line = "🏳️‍🌈"

code_unit_count = count_utf16_code_units(line)
utf8_code_unit_count = count_utf8_code_units(line)

assert to_utf16(line, 0) == {:error, :misaligned}
assert to_utf16(line, 0) == {:ok, 0}
assert to_utf16(line, 1) == {:error, :misaligned}
assert to_utf16(line, 2) == {:error, :misaligned}
assert to_utf16(line, 3) == {:ok, 1}
assert to_utf16(line, 4) == {:error, :misaligned}
assert to_utf16(line, utf8_code_unit_count - 1) == {:ok, code_unit_count - 1}
assert to_utf16(line, 3) == {:error, :misaligned}
assert to_utf16(line, 4) == {:ok, 2}
assert to_utf16(line, utf8_code_unit_count - 1) == {:error, :misaligned}
assert to_utf16(line, utf8_code_unit_count) == {:ok, code_unit_count}
end

test "after a multi-byte character" do
line = " {\"🎸\", \"ok\"}"

utf16_code_unit_count = count_utf16_code_units(line)
utf8_code_unit_count = count_utf8_code_units(line)

Expand All @@ -140,11 +146,12 @@ defmodule ElixirLS.LanguageServer.Experimental.CodeUnitTest do
assert to_utf16(line, index) == {:ok, index}
end

assert to_utf16(line, 6) == {:error, :misaligned}
assert to_utf16(line, 6) == {:ok, 6}
assert to_utf16(line, 7) == {:error, :misaligned}
assert to_utf16(line, 8) == {:error, :misaligned}
assert to_utf16(line, 9) == {:error, :misaligned}

for index <- 9..17 do
for index <- 10..19 do
assert to_utf16(line, index) == {:ok, index - 2}
end

Expand All @@ -157,11 +164,11 @@ defmodule ElixirLS.LanguageServer.Experimental.CodeUnitTest do
utf8_code_unit_count = count_utf8_code_units(s)
utf16_unit_count = count_utf16_code_units(s)

assert {:ok, utf16_unit} = to_utf16(s, utf8_code_unit_count - 1)
assert utf16_unit == utf16_unit_count - 1
assert {:ok, utf16_unit} = to_utf16(s, utf8_code_unit_count)
assert utf16_unit == utf16_unit_count

assert {:ok, utf8_unit} = to_utf8(s, utf16_unit)
assert utf8_unit == utf8_code_unit_count - 1
assert utf8_unit == utf8_code_unit_count
end
end

Expand All @@ -170,11 +177,11 @@ defmodule ElixirLS.LanguageServer.Experimental.CodeUnitTest do
utf16_code_unit_count = count_utf16_code_units(s)
utf8_code_unit_count = count_utf8_code_units(s)

assert {:ok, utf8_code_unit} = to_utf8(s, utf16_code_unit_count - 1)
assert utf8_code_unit == utf8_code_unit_count - 1
assert {:ok, utf8_code_unit} = to_utf8(s, utf16_code_unit_count)
assert utf8_code_unit == utf8_code_unit_count

assert {:ok, utf16_unit} = to_utf16(s, utf8_code_unit)
assert utf16_unit == utf16_code_unit_count - 1
assert utf16_unit == utf16_code_unit_count
end
end

Expand Down

0 comments on commit a0c7fd9

Please sign in to comment.