From 206ed215ebc5e897f6c91cf50b1823ce2728d06f Mon Sep 17 00:00:00 2001
From: Lukasz Samson <lukaszsamson@gmail.com>
Date: Fri, 13 Oct 2023 21:25:04 +0200
Subject: [PATCH] Make position translation to UTF16 more robust

When diagnostics were emitted for a previous version of the file the positions may no longer be valid
Fixes https://github.com/elixir-lsp/elixir-ls/issues/1002
---
 .../lib/language_server/diagnostics.ex        | 75 +++++++++++--------
 .../lib/language_server/source_file.ex        | 59 ++++++++++-----
 apps/language_server/test/dialyzer_test.exs   | 20 ++---
 apps/language_server/test/server_test.exs     |  2 +-
 .../language_server/test/source_file_test.exs | 40 ++++++++--
 5 files changed, 127 insertions(+), 69 deletions(-)

diff --git a/apps/language_server/lib/language_server/diagnostics.ex b/apps/language_server/lib/language_server/diagnostics.ex
index 5e94f61c8..abe141a8c 100644
--- a/apps/language_server/lib/language_server/diagnostics.ex
+++ b/apps/language_server/lib/language_server/diagnostics.ex
@@ -291,24 +291,37 @@ defmodule ElixirLS.LanguageServer.Diagnostics do
   # https://microsoft.github.io/language-server-protocol/specifications/specification-3-16/#diagnostic
 
   # position is a 1 based line number
-  # we return a range of trimmed text in that line
-  defp range(position, source_file)
-       when is_integer(position) and position >= 1 and not is_nil(source_file) do
+  # we return a 0 length range at first non whitespace character in line
+  defp range(line_start, source_file)
+       when is_integer(line_start) and not is_nil(source_file) do
     # line is 1 based
-    line = position - 1
-    text = Enum.at(SourceFile.lines(source_file), line) || ""
+    lines = SourceFile.lines(source_file)
 
-    start_idx = String.length(text) - String.length(String.trim_leading(text)) + 1
-    length = max(String.length(String.trim(text)), 1)
+    {line_start_lsp, char_start_lsp} =
+      if line_start > 0 do
+        case Enum.at(lines, line_start - 1) do
+          nil ->
+            # position is outside file range - this will return end of the file
+            SourceFile.elixir_position_to_lsp(lines, {line_start, 1})
+
+          line ->
+            # find first non whitespace character in line
+            start_idx = String.length(line) - String.length(String.trim_leading(line)) + 1
+            {line_start - 1, SourceFile.elixir_character_to_lsp(line, start_idx)}
+        end
+      else
+        # return begin of the file
+        {0, 0}
+      end
 
     %{
       "start" => %{
-        "line" => line,
-        "character" => SourceFile.elixir_character_to_lsp(text, start_idx)
+        "line" => line_start_lsp,
+        "character" => char_start_lsp
       },
       "end" => %{
-        "line" => line,
-        "character" => SourceFile.elixir_character_to_lsp(text, start_idx + length)
+        "line" => line_start_lsp,
+        "character" => char_start_lsp
       }
     }
   end
@@ -316,21 +329,20 @@ defmodule ElixirLS.LanguageServer.Diagnostics do
   # position is a 1 based line number and 0 based character cursor (UTF8)
   # we return a 0 length range exactly at that location
   defp range({line_start, char_start}, source_file)
-       when line_start >= 1 and not is_nil(source_file) do
+       when not is_nil(source_file) do
     lines = SourceFile.lines(source_file)
-    # line is 1 based
-    start_line = Enum.at(lines, line_start - 1)
-    # SourceFile.elixir_character_to_lsp assumes char to be 1 based but it's 0 based here
-    character = SourceFile.elixir_character_to_lsp(start_line, char_start + 1)
+    # elixir_position_to_lsp will handle positions outside file range
+    {line_start_lsp, char_start_lsp} =
+      SourceFile.elixir_position_to_lsp(lines, {line_start, char_start - 1})
 
     %{
       "start" => %{
-        "line" => line_start - 1,
-        "character" => character
+        "line" => line_start_lsp,
+        "character" => char_start_lsp
       },
       "end" => %{
-        "line" => line_start - 1,
-        "character" => character
+        "line" => line_start_lsp,
+        "character" => char_start_lsp
       }
     }
   end
@@ -338,29 +350,28 @@ defmodule ElixirLS.LanguageServer.Diagnostics do
   # position is a range defined by 1 based line numbers and 0 based character cursors (UTF8)
   # we return exactly that range
   defp range({line_start, char_start, line_end, char_end}, source_file)
-       when line_start >= 1 and line_end >= 1 and not is_nil(source_file) do
+       when not is_nil(source_file) do
     lines = SourceFile.lines(source_file)
-    # line is 1 based
-    start_line = Enum.at(lines, line_start - 1)
-    end_line = Enum.at(lines, line_end - 1)
+    # elixir_position_to_lsp will handle positions outside file range
+    {line_start_lsp, char_start_lsp} =
+      SourceFile.elixir_position_to_lsp(lines, {line_start, char_start - 1})
 
-    # SourceFile.elixir_character_to_lsp assumes char to be 1 based but it's 0 based here
-    start_char = SourceFile.elixir_character_to_lsp(start_line, char_start + 1)
-    end_char = SourceFile.elixir_character_to_lsp(end_line, char_end + 1)
+    {line_end_lsp, char_end_lsp} =
+      SourceFile.elixir_position_to_lsp(lines, {line_end, char_end - 1})
 
     %{
       "start" => %{
-        "line" => line_start - 1,
-        "character" => start_char
+        "line" => line_start_lsp,
+        "character" => char_start_lsp
       },
       "end" => %{
-        "line" => line_end - 1,
-        "character" => end_char
+        "line" => line_end_lsp,
+        "character" => char_end_lsp
       }
     }
   end
 
-  # source file is unknown, position is 0 or invalid
+  # source file is unknown
   # we discard any position information as it is meaningless
   # unfortunately LSP does not allow `null` range so we need to return something
   defp range(_, _) do
diff --git a/apps/language_server/lib/language_server/source_file.ex b/apps/language_server/lib/language_server/source_file.ex
index 04e7049d8..eab8cf708 100644
--- a/apps/language_server/lib/language_server/source_file.ex
+++ b/apps/language_server/lib/language_server/source_file.ex
@@ -286,24 +286,29 @@ defmodule ElixirLS.LanguageServer.SourceFile do
     utf8_character + 1
   end
 
-  def lsp_position_to_elixir(_urf8_text, {lsp_line, _lsp_character}) when lsp_line < 0,
+  def lsp_position_to_elixir(_urf8_text_or_lines, {lsp_line, _lsp_character}) when lsp_line < 0,
     do: {1, 1}
 
-  def lsp_position_to_elixir(_urf8_text, {lsp_line, lsp_character}) when lsp_character <= 0,
-    do: {max(lsp_line + 1, 1), 1}
+  def lsp_position_to_elixir(_urf8_text_or_lines, {lsp_line, lsp_character})
+      when lsp_character <= 0,
+      do: {max(lsp_line + 1, 1), 1}
 
-  def lsp_position_to_elixir(urf8_text, {lsp_line, lsp_character}) do
-    source_file_lines = lines(urf8_text)
-    total_lines = length(source_file_lines)
+  def lsp_position_to_elixir(urf8_text, {lsp_line, lsp_character}) when is_binary(urf8_text) do
+    lsp_position_to_elixir(lines(urf8_text), {lsp_line, lsp_character})
+  end
+
+  def lsp_position_to_elixir([_ | _] = urf8_lines, {lsp_line, lsp_character})
+      when lsp_line >= 0 do
+    total_lines = length(urf8_lines)
 
     if lsp_line > total_lines - 1 do
       # sanitize to position after last char in last line
-      {total_lines, String.length(source_file_lines |> Enum.at(total_lines - 1)) + 1}
+      last_line = Enum.at(urf8_lines, total_lines - 1)
+      elixir_last_character = String.length(last_line) + 1
+      {total_lines, elixir_last_character}
     else
-      utf8_character =
-        source_file_lines
-        |> Enum.at(lsp_line)
-        |> lsp_character_to_elixir(lsp_character)
+      line = Enum.at(urf8_lines, lsp_line)
+      utf8_character = lsp_character_to_elixir(line, lsp_character)
 
       {lsp_line + 1, utf8_character}
     end
@@ -319,17 +324,35 @@ defmodule ElixirLS.LanguageServer.SourceFile do
     |> div(2)
   end
 
-  def elixir_position_to_lsp(_urf8_text, {elixir_line, elixir_character})
+  def elixir_position_to_lsp(_urf8_text_or_lines, {elixir_line, _elixir_character})
+      when elixir_line < 1,
+      do: {0, 0}
+
+  def elixir_position_to_lsp(_urf8_text_or_lines, {elixir_line, elixir_character})
       when elixir_character <= 1,
       do: {max(elixir_line - 1, 0), 0}
 
-  def elixir_position_to_lsp(urf8_text, {elixir_line, elixir_character}) do
-    line =
-      lines(urf8_text)
-      |> Enum.at(max(elixir_line - 1, 0))
+  def elixir_position_to_lsp(urf8_text, {elixir_line, elixir_character})
+      when is_binary(urf8_text) do
+    elixir_position_to_lsp(lines(urf8_text), {elixir_line, elixir_character})
+  end
 
-    utf16_character = elixir_character_to_lsp(line || "", elixir_character)
+  def elixir_position_to_lsp([_ | _] = urf8_lines, {elixir_line, elixir_character})
+      when elixir_line >= 1 do
+    total_lines = length(urf8_lines)
 
-    {elixir_line - 1, utf16_character}
+    if elixir_line > total_lines do
+      # sanitize to position after last char in last line
+      last_line = Enum.at(urf8_lines, total_lines - 1)
+      elixir_last_character = String.length(last_line) + 1
+
+      utf16_character = elixir_character_to_lsp(last_line, elixir_last_character)
+      {total_lines - 1, utf16_character}
+    else
+      line = Enum.at(urf8_lines, max(elixir_line - 1, 0))
+      utf16_character = elixir_character_to_lsp(line, elixir_character)
+
+      {elixir_line - 1, utf16_character}
+    end
   end
 end
diff --git a/apps/language_server/test/dialyzer_test.exs b/apps/language_server/test/dialyzer_test.exs
index fe3052f92..8a8e8bf2c 100644
--- a/apps/language_server/test/dialyzer_test.exs
+++ b/apps/language_server/test/dialyzer_test.exs
@@ -59,7 +59,7 @@ defmodule ElixirLS.LanguageServer.DialyzerTest do
                  %{
                    "message" => error_message1,
                    "range" => %{
-                     "end" => %{"character" => 12, "line" => 1},
+                     "end" => %{"character" => 2, "line" => 1},
                      "start" => %{"character" => 2, "line" => 1}
                    },
                    "severity" => 2,
@@ -68,7 +68,7 @@ defmodule ElixirLS.LanguageServer.DialyzerTest do
                  %{
                    "message" => error_message2,
                    "range" => %{
-                     "end" => %{"character" => 17, "line" => 2},
+                     "end" => %{"character" => 4, "line" => 2},
                      "start" => %{"character" => 4, "line" => 2}
                    },
                    "severity" => 2,
@@ -171,7 +171,7 @@ defmodule ElixirLS.LanguageServer.DialyzerTest do
                  %{
                    "message" => error_message1,
                    "range" => %{
-                     "end" => %{"character" => 12, "line" => 1},
+                     "end" => %{"character" => 2, "line" => 1},
                      "start" => %{"character" => 2, "line" => 1}
                    },
                    "severity" => 2,
@@ -180,7 +180,7 @@ defmodule ElixirLS.LanguageServer.DialyzerTest do
                  %{
                    "message" => error_message2,
                    "range" => %{
-                     "end" => %{"character" => 17, "line" => 2},
+                     "end" => %{"character" => 4, "line" => 2},
                      "start" => %{"character" => 4, "line" => 2}
                    },
                    "severity" => 2,
@@ -219,7 +219,7 @@ defmodule ElixirLS.LanguageServer.DialyzerTest do
                  %{
                    "message" => error_message1,
                    "range" => %{
-                     "end" => %{"character" => 12, "line" => 1},
+                     "end" => %{"character" => 2, "line" => 1},
                      "start" => %{"character" => 2, "line" => 1}
                    },
                    "severity" => 2,
@@ -228,7 +228,7 @@ defmodule ElixirLS.LanguageServer.DialyzerTest do
                  %{
                    "message" => error_message2,
                    "range" => %{
-                     "end" => %{"character" => 17, "line" => 2},
+                     "end" => %{"character" => 4, "line" => 2},
                      "start" => %{"character" => 4, "line" => 2}
                    },
                    "severity" => 2,
@@ -257,7 +257,7 @@ defmodule ElixirLS.LanguageServer.DialyzerTest do
                  %{
                    "message" => error_message1,
                    "range" => %{
-                     "end" => %{"character" => 12, "line" => 1},
+                     "end" => %{"character" => 2, "line" => 1},
                      "start" => %{"character" => 2, "line" => 1}
                    },
                    "severity" => 2,
@@ -266,7 +266,7 @@ defmodule ElixirLS.LanguageServer.DialyzerTest do
                  %{
                    "message" => _error_message2,
                    "range" => %{
-                     "end" => %{"character" => 17, "line" => 2},
+                     "end" => %{"character" => 4, "line" => 2},
                      "start" => %{"character" => 4, "line" => 2}
                    },
                    "severity" => 2,
@@ -296,7 +296,7 @@ defmodule ElixirLS.LanguageServer.DialyzerTest do
                  %{
                    "message" => error_message1,
                    "range" => %{
-                     "end" => %{"character" => 22, "line" => 1},
+                     "end" => %{"character" => 2, "line" => 1},
                      "start" => %{"character" => 2, "line" => 1}
                    },
                    "severity" => 2,
@@ -305,7 +305,7 @@ defmodule ElixirLS.LanguageServer.DialyzerTest do
                  %{
                    "message" => error_message2,
                    "range" => %{
-                     "end" => %{"character" => 22, "line" => 2},
+                     "end" => %{"character" => 4, "line" => 2},
                      "start" => %{"character" => 4, "line" => 2}
                    },
                    "severity" => 2,
diff --git a/apps/language_server/test/server_test.exs b/apps/language_server/test/server_test.exs
index f622e4cbc..52c86c1bc 100644
--- a/apps/language_server/test/server_test.exs
+++ b/apps/language_server/test/server_test.exs
@@ -1663,7 +1663,7 @@ defmodule ElixirLS.LanguageServer.ServerTest do
                            %{
                              "message" => "undefined function does_not_exist/0" <> _,
                              "range" => %{
-                               "end" => %{"character" => 20, "line" => 3},
+                               "end" => %{"character" => 4, "line" => 3},
                                "start" => %{"character" => 4, "line" => 3}
                              },
                              "severity" => 1,
diff --git a/apps/language_server/test/source_file_test.exs b/apps/language_server/test/source_file_test.exs
index 9d58f4c17..a564b06ca 100644
--- a/apps/language_server/test/source_file_test.exs
+++ b/apps/language_server/test/source_file_test.exs
@@ -655,6 +655,14 @@ defmodule ElixirLS.LanguageServer.SourceFileTest do
       assert {1, 2} == SourceFile.lsp_position_to_elixir("abcde", {0, 1})
     end
 
+    test "lsp_position_to_elixir single line utf8" do
+      assert {1, 2} == SourceFile.lsp_position_to_elixir("🏳️‍🌈abcde", {0, 6})
+    end
+
+    test "lsp_position_to_elixir multi line" do
+      assert {2, 2} == SourceFile.lsp_position_to_elixir("abcde\n1234", {1, 1})
+    end
+
     # This is not specified in LSP but some clients fail to synchronize text properly
     test "lsp_position_to_elixir single line before line start" do
       assert {1, 1} == SourceFile.lsp_position_to_elixir("abcde", {0, -1})
@@ -667,14 +675,6 @@ defmodule ElixirLS.LanguageServer.SourceFileTest do
       assert {1, 1} == SourceFile.lsp_position_to_elixir("", {0, 15})
     end
 
-    test "lsp_position_to_elixir single line utf8" do
-      assert {1, 2} == SourceFile.lsp_position_to_elixir("🏳️‍🌈abcde", {0, 6})
-    end
-
-    test "lsp_position_to_elixir multi line" do
-      assert {2, 2} == SourceFile.lsp_position_to_elixir("abcde\n1234", {1, 1})
-    end
-
     # This is not specified in LSP but some clients fail to synchronize text properly
     test "lsp_position_to_elixir multi line before first line" do
       assert {1, 1} == SourceFile.lsp_position_to_elixir("abcde\n1234", {-1, 2})
@@ -705,6 +705,30 @@ defmodule ElixirLS.LanguageServer.SourceFileTest do
       assert {1, 1} == SourceFile.elixir_position_to_lsp("abcde\n1234", {2, 2})
     end
 
+    # This is not specified in LSP but some clients fail to synchronize text properly
+    test "elixir_position_to_lsp single line before line start" do
+      assert {0, 0} == SourceFile.elixir_position_to_lsp("abcde", {1, -1})
+      assert {0, 0} == SourceFile.elixir_position_to_lsp("abcde", {1, 0})
+    end
+
+    # LSP spec 3.17 https://microsoft.github.io/language-server-protocol/specifications/lsp/3.17/specification/#position
+    # position character If the character value is greater than the line length it defaults back to the line length
+    test "elixir_position_to_lsp single line after line end" do
+      assert {0, 5} == SourceFile.elixir_position_to_lsp("abcde", {1, 15})
+      assert {0, 0} == SourceFile.elixir_position_to_lsp("", {1, 15})
+    end
+
+    # This is not specified in LSP but some clients fail to synchronize text properly
+    test "elixir_position_to_lsp multi line before first line" do
+      assert {0, 0} == SourceFile.elixir_position_to_lsp("abcde\n1234", {-1, 2})
+      assert {0, 0} == SourceFile.elixir_position_to_lsp("abcde\n1234", {0, 2})
+    end
+
+    # This is not specified in LSP but some clients fail to synchronize text properly
+    test "elixir_position_to_lsp multi line after last line" do
+      assert {1, 4} == SourceFile.elixir_position_to_lsp("abcde\n1234", {8, 2})
+    end
+
     test "sanity check" do
       text = "aąłsd🏳️‍🌈abcde"