feat: make frontend normalize line endings to LF (#3903)
To eliminate parsing differences between Windows and other platforms,
the frontend now normalizes all CRLF line endings to LF, like [in
Rust](rust-lang/rust#62865).

Effects:
- This makes Lake hashes faithful to what Lean sees (Lake already
normalizes line endings before computing hashes).
- Docstrings now have normalized line endings. In particular, this fixes
`#guard_msgs` failing multiline tests for Windows users whose files use CRLF
(see the example after this list).
- String literals no longer have different lengths depending on the platform.
Before this PR, the following theorem was true for LF files and false for
CRLF files.
```lean
example : "
".length = 1 := rfl
```
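
To illustrate the `#guard_msgs` point above, here is a hypothetical multiline test
(not taken from this PR's test suite) that used to fail when the file was saved with
CRLF: the expected output embedded in the docstring contained `\r\n`, while the
produced message used `\n`.
```lean
-- Hypothetical example: a multiline expected message in a docstring.
-- With CRLF line endings in the file, the docstring used to contain `\r\n`,
-- so the comparison against the actual message (which uses `\n`) failed.
/--
info: first line
second line
-/
#guard_msgs in #eval IO.println "first line\nsecond line"
```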

Note: the normalization will take `\r\r\n` and turn it into `\r\n`. In
the elaborator, we reject loose `\r`'s that appear in whitespace. Rust
instead takes the approach of making the normalization routine fail.
They do this so that there's no downstream confusion about any `\r\n`
that appears.
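
As a quick illustration of the `\r\r\n` behavior (mirroring the tests added in this PR):
```lean
-- Only the final `\r\n` of `\r\r\n` is collapsed; the stray `\r` is left in place,
-- and the parser then rejects it if it occurs in whitespace.
/-- info: "hello\x0d\nworld" -/
#guard_msgs in #eval String.crlfToLf "hello\r\r\nworld"
```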

Implementation note: the language server maintains its own copy of a source
file that it updates as edit operations are applied. We are assuming that
edit operations never split or join CRLFs. If this assumption does not hold,
the server's copy of a source file can become slightly out of
sync. If this is an issue, there is some discussion
[here](#3903 (comment)).
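
Here is a rough sketch of that failure mode, using byte positions rather than the
LSP's line/column coordinates; the helper `insertAt` is purely illustrative and not
part of the server code.
```lean
-- Hypothetical helper: splice `t` into `s` at byte position `p`.
def insertAt (s : String) (p : String.Pos) (t : String) : String :=
  s.extract 0 p ++ t ++ s.extract p s.endPos

-- An edit that lands between the `\r` and the `\n` of a CRLF:
-- the client's document after the edit, normalized afterwards ...
#eval (insertAt "a\r\nb" ⟨2⟩ "x").crlfToLf  -- "a\rx\nb"
-- ... versus a copy that was normalized up front, with the same edit applied.
#eval insertAt "a\r\nb".crlfToLf ⟨2⟩ "x"    -- "a\nxb", so the two copies drift apart
```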
kmill authored May 20, 2024
1 parent b278f9d commit a7338c5
Showing 13 changed files with 137 additions and 88 deletions.
31 changes: 31 additions & 0 deletions src/Init/Data/String/Extra.lean
@@ -198,4 +198,35 @@ def removeLeadingSpaces (s : String) : String :=
let n := findLeadingSpacesSize s
if n == 0 then s else removeNumLeadingSpaces n s

/--
Replaces each `\r\n` with `\n` to normalize line endings,
but does not validate that there are no isolated `\r` characters.
It is an optimized version of `String.replace text "\r\n" "\n"`.
-/
def crlfToLf (text : String) : String :=
go "" 0 0
where
go (acc : String) (accStop pos : String.Pos) : String :=
if h : text.atEnd pos then
-- note: if accStop = 0 then acc is empty
if accStop = 0 then text else acc ++ text.extract accStop pos
else
let c := text.get' pos h
let pos' := text.next' pos h
if h' : ¬ text.atEnd pos' ∧ c == '\r' ∧ text.get pos' == '\n' then
let acc := acc ++ text.extract accStop pos
go acc pos' (text.next' pos' h'.1)
else
go acc accStop pos'
termination_by text.utf8ByteSize - pos.byteIdx
decreasing_by
decreasing_with
show text.utf8ByteSize - (text.next' (text.next' pos _) _).byteIdx < text.utf8ByteSize - pos.byteIdx
have k := Nat.gt_of_not_le <| mt decide_eq_true h
exact Nat.sub_lt_sub_left k (Nat.lt_trans (String.lt_next text pos) (String.lt_next _ _))
decreasing_with
show text.utf8ByteSize - (text.next' pos _).byteIdx < text.utf8ByteSize - pos.byteIdx
have k := Nat.gt_of_not_le <| mt decide_eq_true h
exact Nat.sub_lt_sub_left k (String.lt_next _ _)

end String
2 changes: 1 addition & 1 deletion src/Lean/Data/Position.lean
@@ -106,7 +106,7 @@ def ofPosition (text : FileMap) (pos : Position) : String.Pos :=

/--
Returns the position of the start of (1-based) line `line`.
This gives the stame result as `map.ofPosition ⟨line, 0⟩`, but is more efficient.
This gives the same result as `map.ofPosition ⟨line, 0⟩`, but is more efficient.
-/
def lineStart (map : FileMap) (line : Nat) : String.Pos :=
if h : line - 1 < map.positions.size then
17 changes: 7 additions & 10 deletions src/Lean/Parser/Basic.lean
@@ -558,6 +558,8 @@ partial def whitespace : ParserFn := fun c s =>
let curr := input.get' i h
if curr == '\t' then
s.mkUnexpectedError (pushMissing := false) "tabs are not allowed; please configure your editor to expand them"
else if curr == '\r' then
s.mkUnexpectedError (pushMissing := false) "isolated carriage returns are not allowed"
else if curr.isWhitespace then whitespace c (s.next' input i h)
else if curr == '-' then
let i := input.next' i h
@@ -621,9 +623,8 @@ def hexDigitFn : ParserFn := fun c s =>
/--
Parses the whitespace after the `\` when there is a string gap.
Raises an error if the whitespace does not contain exactly one newline character.
Processes `\r\n` as a newline.
-/
partial def stringGapFn (seenNewline afterCR : Bool) : ParserFn := fun c s =>
partial def stringGapFn (seenNewline : Bool) : ParserFn := fun c s =>
let i := s.pos
if h : c.input.atEnd i then s -- let strLitFnAux handle the EOI error if !seenNewline
else
@@ -633,13 +634,9 @@ partial def stringGapFn (seenNewline afterCR : Bool) : ParserFn := fun c s =>
-- Having more than one newline in a string gap is visually confusing
s.mkUnexpectedError "unexpected additional newline in string gap"
else
stringGapFn true false c (s.next' c.input i h)
else if curr == '\r' then
stringGapFn seenNewline true c (s.next' c.input i h)
else if afterCR then
s.mkUnexpectedError "expecting newline after carriage return"
stringGapFn true c (s.next' c.input i h)
else if curr.isWhitespace then
stringGapFn seenNewline false c (s.next' c.input i h)
stringGapFn seenNewline c (s.next' c.input i h)
else if seenNewline then
s
else
@@ -663,8 +660,8 @@ def quotedCharCoreFn (isQuotable : Char → Bool) (inString : Bool) : ParserFn :
andthenFn hexDigitFn hexDigitFn c (s.next' input i h)
else if curr == 'u' then
andthenFn hexDigitFn (andthenFn hexDigitFn (andthenFn hexDigitFn hexDigitFn)) c (s.next' input i h)
else if inString && (curr == '\n' || curr == '\r') then
stringGapFn false false c s
else if inString && curr == '\n' then
stringGapFn false c s
else
s.mkUnexpectedError "invalid escape sequence"

13 changes: 7 additions & 6 deletions src/Lean/Parser/Extension.lean
@@ -437,11 +437,12 @@ def getSyntaxNodeKinds (env : Environment) : List SyntaxNodeKind :=
def getTokenTable (env : Environment) : TokenTable :=
(parserExtension.getState env).tokens

def mkInputContext (input : String) (fileName : String) : InputContext := {
input := input,
fileName := fileName,
fileMap := input.toFileMap
}
-- Note: `crlfToLf` preserves logical line and column numbers for each character.
def mkInputContext (input : String) (fileName : String) (normalizeLineEndings := true) : InputContext :=
let input' := if normalizeLineEndings then input.crlfToLf else input
{ input := input',
fileName := fileName,
fileMap := input'.toFileMap }

def mkParserState (input : String) : ParserState :=
{ cache := initCacheForInput input }
@@ -453,7 +454,7 @@ def runParserCategory (env : Environment) (catName : Name) (input : String) (fil
let s := p.run ictx { env, options := {} } (getTokenTable env) (mkParserState input)
if !s.allErrors.isEmpty then
Except.error (s.toErrorMsg ictx)
else if input.atEnd s.pos then
else if ictx.input.atEnd s.pos then
Except.ok s.stxStack.back
else
Except.error ((s.mkError "end of input").toErrorMsg ictx)
9 changes: 3 additions & 6 deletions src/Lean/Server/FileWorker.lean
@@ -705,12 +705,9 @@ def initAndRunWorker (i o e : FS.Stream) (opts : Options) : IO UInt32 := do
let initParams ← i.readLspRequestAs "initialize" InitializeParams
let ⟨_, param⟩ ← i.readLspNotificationAs "textDocument/didOpen" LeanDidOpenTextDocumentParams
let doc := param.textDocument
/- NOTE(WN): `toFileMap` marks line beginnings as immediately following
"\n", which should be enough to handle both LF and CRLF correctly.
This is because LSP always refers to characters by (line, column),
so if we get the line number correct it shouldn't matter that there
is a CR there. -/
let meta : DocumentMeta := ⟨doc.uri, doc.version, doc.text.toFileMap, param.dependencyBuildMode?.getD .always⟩
/- Note (kmill): LSP always refers to characters by (line, column),
so converting CRLF to LF preserves line and column numbers. -/
let meta : DocumentMeta := ⟨doc.uri, doc.version, doc.text.crlfToLf.toFileMap, param.dependencyBuildMode?.getD .always⟩
let e := e.withPrefix s!"[{param.textDocument.uri}] "
let _ ← IO.setStderr e
let (ctx, st) ← try
13 changes: 10 additions & 3 deletions src/Lean/Server/Utils.lean
@@ -75,7 +75,10 @@ structure DocumentMeta where
uri : Lsp.DocumentUri
/-- Version number of the document. Incremented whenever the document is edited. -/
version : Nat
/-- Current text of the document. -/
/--
Current text of the document.
It is maintained such that it is normalized using `String.crlfToLf`, which preserves logical line/column numbers.
(Note: we assume that edit operations never split or merge `\r\n` line endings.) -/
text : FileMap
/--
Controls when dependencies of the document are built on `textDocument/didOpen` notifications.
@@ -98,7 +101,11 @@ def replaceLspRange (text : FileMap) (r : Lsp.Range) (newText : String) : FileMa
let «end» := text.lspPosToUtf8Pos r.«end»
let pre := text.source.extract 0 start
let post := text.source.extract «end» text.source.endPos
(pre ++ newText ++ post).toFileMap
-- `pre` and `post` already have normalized line endings, so only `newText` needs its endings normalized.
-- Note: this assumes that editing never separates a `\r\n`.
-- If `pre` ends with `\r` and `newText` begins with `\n`, the result is potentially inaccurate.
-- If this is ever a problem, we could store a second unnormalized FileMap, edit it, and normalize it here.
(pre ++ newText.crlfToLf ++ post).toFileMap

open IO

@@ -125,7 +132,7 @@ def applyDocumentChange (oldText : FileMap) : (change : Lsp.TextDocumentContentC
| TextDocumentContentChangeEvent.rangeChange (range : Range) (newText : String) =>
replaceLspRange oldText range newText
| TextDocumentContentChangeEvent.fullChange (newText : String) =>
newText.toFileMap
newText.crlfToLf.toFileMap

/-- Returns the document contents with all changes applied. -/
def foldDocumentChanges (changes : Array Lsp.TextDocumentContentChangeEvent) (oldText : FileMap) : FileMap :=
9 changes: 3 additions & 6 deletions src/Lean/Server/Watchdog.lean
@@ -728,12 +728,9 @@ end RequestHandling
section NotificationHandling
def handleDidOpen (p : LeanDidOpenTextDocumentParams) : ServerM Unit :=
let doc := p.textDocument
/- NOTE(WN): `toFileMap` marks line beginnings as immediately following
"\n", which should be enough to handle both LF and CRLF correctly.
This is because LSP always refers to characters by (line, column),
so if we get the line number correct it shouldn't matter that there
is a CR there. -/
startFileWorker ⟨doc.uri, doc.version, doc.text.toFileMap, p.dependencyBuildMode?.getD .always⟩
/- Note (kmill): LSP always refers to characters by (line, column),
so converting CRLF to LF preserves line and column numbers. -/
startFileWorker ⟨doc.uri, doc.version, doc.text.crlfToLf.toFileMap, p.dependencyBuildMode?.getD .always⟩

def handleDidChange (p : DidChangeTextDocumentParams) : ServerM Unit := do
let doc := p.textDocument
3 changes: 1 addition & 2 deletions src/lake/Lake/Build/Trace.lean
@@ -3,7 +3,6 @@ Copyright (c) 2021 Mac Malone. All rights reserved.
Released under Apache 2.0 license as described in the file LICENSE.
Authors: Mac Malone
-/
import Lake.Util.Newline

open System
namespace Lake
@@ -135,7 +134,7 @@ instance : ComputeHash FilePath IO := ⟨computeFileHash⟩

def computeTextFileHash (file : FilePath) : IO Hash := do
let text ← IO.FS.readFile file
let text := crlf2lf text
let text := text.crlfToLf
return Hash.ofString text

/--
24 changes: 0 additions & 24 deletions src/lake/Lake/Util/Newline.lean

This file was deleted.

3 changes: 1 addition & 2 deletions src/lake/tests/toml/Test.lean
@@ -5,7 +5,6 @@ Authors: Mac Malone
-/
import Lake.Toml
import Lake.Util.Message
import Lake.Util.Newline

/-!
## TOML Test Runner
@@ -30,7 +29,7 @@ nonrec def loadToml (tomlFile : FilePath) : BaseIO TomlOutcome := do
match (← IO.FS.readBinFile tomlFile |>.toBaseIO) with
| .ok bytes =>
if let some input := String.fromUTF8? bytes then
pure (crlf2lf input)
pure input.crlfToLf
else
return .fail <| MessageLog.empty.add
{fileName, pos := ⟨1,0⟩, data := m!"file contains invalid characters"}
39 changes: 39 additions & 0 deletions tests/lean/run/crlfToLf.lean
@@ -0,0 +1,39 @@
/-!
# Test `String.crlfToLf`
-/

/-!
Leaves single `\n`'s alone.
-/
/-- info: "hello\nworld" -/
#guard_msgs in #eval String.crlfToLf "hello\nworld"

/-!
Turns `\r\n` into `\n`.
-/
/-- info: "hello\nworld" -/
#guard_msgs in #eval String.crlfToLf "hello\r\nworld"

/-!
In a string of `\r...\r\n`, only normalizes the last `\r\n`.
-/
/-- info: "hello\x0d\nworld" -/
#guard_msgs in #eval String.crlfToLf "hello\r\r\nworld"

/-!
Two in a row.
-/
/-- info: "hello\n\nworld" -/
#guard_msgs in #eval String.crlfToLf "hello\r\n\r\nworld"

/-!
Normalizes at the end.
-/
/-- info: "hello\nworld\n" -/
#guard_msgs in #eval String.crlfToLf "hello\r\nworld\r\n"

/-!
Can handle a loose `\r` as the last character.
-/
/-- info: "hello\nworld\x0d" -/
#guard_msgs in #eval String.crlfToLf "hello\r\nworld\r"
