diff --git a/src/Init/Data/String/Extra.lean b/src/Init/Data/String/Extra.lean
index 2739c707d678..2cc70b3a3270 100644
--- a/src/Init/Data/String/Extra.lean
+++ b/src/Init/Data/String/Extra.lean
@@ -198,4 +198,35 @@ def removeLeadingSpaces (s : String) : String :=
let n := findLeadingSpacesSize s
if n == 0 then s else removeNumLeadingSpaces n s
+/--
+Replaces each `\r\n` with `\n` to normalize line endings,
+but does not validate that there are no isolated `\r` characters.
+It is an optimized version of `String.replace text "\r\n" "\n"`.
+-/
+def crlfToLf (text : String) : String :=
+ go "" 0 0
+where
+ go (acc : String) (accStop pos : String.Pos) : String :=
+ if h : text.atEnd pos then
+ -- note: if accStop = 0 then acc is empty
+ if accStop = 0 then text else acc ++ text.extract accStop pos
+ else
+ let c := text.get' pos h
+ let pos' := text.next' pos h
+ if h' : ¬ text.atEnd pos' ∧ c == '\r' ∧ text.get pos' == '\n' then
+ let acc := acc ++ text.extract accStop pos
+ go acc pos' (text.next' pos' h'.1)
+ else
+ go acc accStop pos'
+ termination_by text.utf8ByteSize - pos.byteIdx
+ decreasing_by
+ decreasing_with
+ show text.utf8ByteSize - (text.next' (text.next' pos _) _).byteIdx < text.utf8ByteSize - pos.byteIdx
+ have k := Nat.gt_of_not_le <| mt decide_eq_true h
+ exact Nat.sub_lt_sub_left k (Nat.lt_trans (String.lt_next text pos) (String.lt_next _ _))
+ decreasing_with
+ show text.utf8ByteSize - (text.next' pos _).byteIdx < text.utf8ByteSize - pos.byteIdx
+ have k := Nat.gt_of_not_le <| mt decide_eq_true h
+ exact Nat.sub_lt_sub_left k (String.lt_next _ _)
+
end String
diff --git a/src/Lean/Data/Position.lean b/src/Lean/Data/Position.lean
index 2af14fc623f4..7495b03f5d86 100644
--- a/src/Lean/Data/Position.lean
+++ b/src/Lean/Data/Position.lean
@@ -106,7 +106,7 @@ def ofPosition (text : FileMap) (pos : Position) : String.Pos :=
/--
Returns the position of the start of (1-based) line `line`.
-This gives the stame result as `map.ofPosition ⟨line, 0⟩`, but is more efficient.
+This gives the same result as `map.ofPosition ⟨line, 0⟩`, but is more efficient.
-/
def lineStart (map : FileMap) (line : Nat) : String.Pos :=
if h : line - 1 < map.positions.size then
diff --git a/src/Lean/Parser/Basic.lean b/src/Lean/Parser/Basic.lean
index 7ba45f62a652..253afbaefbbd 100644
--- a/src/Lean/Parser/Basic.lean
+++ b/src/Lean/Parser/Basic.lean
@@ -558,6 +558,8 @@ partial def whitespace : ParserFn := fun c s =>
let curr := input.get' i h
if curr == '\t' then
s.mkUnexpectedError (pushMissing := false) "tabs are not allowed; please configure your editor to expand them"
+ else if curr == '\r' then
+ s.mkUnexpectedError (pushMissing := false) "isolated carriage returns are not allowed"
else if curr.isWhitespace then whitespace c (s.next' input i h)
else if curr == '-' then
let i := input.next' i h
@@ -621,9 +623,8 @@ def hexDigitFn : ParserFn := fun c s =>
/--
Parses the whitespace after the `\` when there is a string gap.
Raises an error if the whitespace does not contain exactly one newline character.
-Processes `\r\n` as a newline.
-/
-partial def stringGapFn (seenNewline afterCR : Bool) : ParserFn := fun c s =>
+partial def stringGapFn (seenNewline : Bool) : ParserFn := fun c s =>
let i := s.pos
if h : c.input.atEnd i then s -- let strLitFnAux handle the EOI error if !seenNewline
else
@@ -633,13 +634,9 @@ partial def stringGapFn (seenNewline afterCR : Bool) : ParserFn := fun c s =>
-- Having more than one newline in a string gap is visually confusing
s.mkUnexpectedError "unexpected additional newline in string gap"
else
- stringGapFn true false c (s.next' c.input i h)
- else if curr == '\r' then
- stringGapFn seenNewline true c (s.next' c.input i h)
- else if afterCR then
- s.mkUnexpectedError "expecting newline after carriage return"
+ stringGapFn true c (s.next' c.input i h)
else if curr.isWhitespace then
- stringGapFn seenNewline false c (s.next' c.input i h)
+ stringGapFn seenNewline c (s.next' c.input i h)
else if seenNewline then
s
else
@@ -663,8 +660,8 @@ def quotedCharCoreFn (isQuotable : Char → Bool) (inString : Bool) : ParserFn :
andthenFn hexDigitFn hexDigitFn c (s.next' input i h)
else if curr == 'u' then
andthenFn hexDigitFn (andthenFn hexDigitFn (andthenFn hexDigitFn hexDigitFn)) c (s.next' input i h)
- else if inString && (curr == '\n' || curr == '\r') then
- stringGapFn false false c s
+ else if inString && curr == '\n' then
+ stringGapFn false c s
else
s.mkUnexpectedError "invalid escape sequence"
diff --git a/src/Lean/Parser/Extension.lean b/src/Lean/Parser/Extension.lean
index 562900d894fd..03ee8095161c 100644
--- a/src/Lean/Parser/Extension.lean
+++ b/src/Lean/Parser/Extension.lean
@@ -437,11 +437,12 @@ def getSyntaxNodeKinds (env : Environment) : List SyntaxNodeKind :=
def getTokenTable (env : Environment) : TokenTable :=
(parserExtension.getState env).tokens
-def mkInputContext (input : String) (fileName : String) : InputContext := {
- input := input,
- fileName := fileName,
- fileMap := input.toFileMap
-}
+-- Note: `crlfToLf` preserves logical line and column numbers for each character.
+def mkInputContext (input : String) (fileName : String) (normalizeLineEndings := true) : InputContext :=
+ let input' := if normalizeLineEndings then input.crlfToLf else input
+ { input := input',
+ fileName := fileName,
+ fileMap := input'.toFileMap }
def mkParserState (input : String) : ParserState :=
{ cache := initCacheForInput input }
@@ -453,7 +454,7 @@ def runParserCategory (env : Environment) (catName : Name) (input : String) (fil
let s := p.run ictx { env, options := {} } (getTokenTable env) (mkParserState input)
if !s.allErrors.isEmpty then
Except.error (s.toErrorMsg ictx)
- else if input.atEnd s.pos then
+ else if ictx.input.atEnd s.pos then
Except.ok s.stxStack.back
else
Except.error ((s.mkError "end of input").toErrorMsg ictx)
diff --git a/src/Lean/Server/FileWorker.lean b/src/Lean/Server/FileWorker.lean
index 22db77f42503..8fa0c365f2f6 100644
--- a/src/Lean/Server/FileWorker.lean
+++ b/src/Lean/Server/FileWorker.lean
@@ -705,12 +705,9 @@ def initAndRunWorker (i o e : FS.Stream) (opts : Options) : IO UInt32 := do
let initParams ← i.readLspRequestAs "initialize" InitializeParams
let ⟨_, param⟩ ← i.readLspNotificationAs "textDocument/didOpen" LeanDidOpenTextDocumentParams
let doc := param.textDocument
- /- NOTE(WN): `toFileMap` marks line beginnings as immediately following
- "\n", which should be enough to handle both LF and CRLF correctly.
- This is because LSP always refers to characters by (line, column),
- so if we get the line number correct it shouldn't matter that there
- is a CR there. -/
- let meta : DocumentMeta := ⟨doc.uri, doc.version, doc.text.toFileMap, param.dependencyBuildMode?.getD .always⟩
+ /- Note (kmill): LSP always refers to characters by (line, column),
+ so converting CRLF to LF preserves line and column numbers. -/
+ let meta : DocumentMeta := ⟨doc.uri, doc.version, doc.text.crlfToLf.toFileMap, param.dependencyBuildMode?.getD .always⟩
let e := e.withPrefix s!"[{param.textDocument.uri}] "
let _ ← IO.setStderr e
let (ctx, st) ← try
diff --git a/src/Lean/Server/Utils.lean b/src/Lean/Server/Utils.lean
index b07428744e9c..01104108baf3 100644
--- a/src/Lean/Server/Utils.lean
+++ b/src/Lean/Server/Utils.lean
@@ -75,7 +75,10 @@ structure DocumentMeta where
uri : Lsp.DocumentUri
/-- Version number of the document. Incremented whenever the document is edited. -/
version : Nat
- /-- Current text of the document. -/
+ /--
+ Current text of the document.
+ It is maintained such that it is normalized using `String.crlfToLf`, which preserves logical line/column numbers.
+ (Note: we assume that edit operations never split or merge `\r\n` line endings.) -/
text : FileMap
/--
Controls when dependencies of the document are built on `textDocument/didOpen` notifications.
@@ -98,7 +101,11 @@ def replaceLspRange (text : FileMap) (r : Lsp.Range) (newText : String) : FileMa
let «end» := text.lspPosToUtf8Pos r.«end»
let pre := text.source.extract 0 start
let post := text.source.extract «end» text.source.endPos
- (pre ++ newText ++ post).toFileMap
+ -- `pre` and `post` already have normalized line endings, so only `newText` needs its endings normalized.
+ -- Note: this assumes that editing never separates a `\r\n`.
+ -- If `pre` ends with `\r` and `newText` begins with `\n`, the result is potentially inaccurate.
+ -- If this is ever a problem, we could store a second unnormalized FileMap, edit it, and normalize it here.
+ (pre ++ newText.crlfToLf ++ post).toFileMap
open IO
@@ -125,7 +132,7 @@ def applyDocumentChange (oldText : FileMap) : (change : Lsp.TextDocumentContentC
| TextDocumentContentChangeEvent.rangeChange (range : Range) (newText : String) =>
replaceLspRange oldText range newText
| TextDocumentContentChangeEvent.fullChange (newText : String) =>
- newText.toFileMap
+ newText.crlfToLf.toFileMap
/-- Returns the document contents with all changes applied. -/
def foldDocumentChanges (changes : Array Lsp.TextDocumentContentChangeEvent) (oldText : FileMap) : FileMap :=
diff --git a/src/Lean/Server/Watchdog.lean b/src/Lean/Server/Watchdog.lean
index 045d797ede2f..4d7c040759c6 100644
--- a/src/Lean/Server/Watchdog.lean
+++ b/src/Lean/Server/Watchdog.lean
@@ -728,12 +728,9 @@ end RequestHandling
section NotificationHandling
def handleDidOpen (p : LeanDidOpenTextDocumentParams) : ServerM Unit :=
let doc := p.textDocument
- /- NOTE(WN): `toFileMap` marks line beginnings as immediately following
- "\n", which should be enough to handle both LF and CRLF correctly.
- This is because LSP always refers to characters by (line, column),
- so if we get the line number correct it shouldn't matter that there
- is a CR there. -/
- startFileWorker ⟨doc.uri, doc.version, doc.text.toFileMap, p.dependencyBuildMode?.getD .always⟩
+ /- Note (kmill): LSP always refers to characters by (line, column),
+ so converting CRLF to LF preserves line and column numbers. -/
+ startFileWorker ⟨doc.uri, doc.version, doc.text.crlfToLf.toFileMap, p.dependencyBuildMode?.getD .always⟩
def handleDidChange (p : DidChangeTextDocumentParams) : ServerM Unit := do
let doc := p.textDocument
diff --git a/src/lake/Lake/Build/Trace.lean b/src/lake/Lake/Build/Trace.lean
index bfe064fb8900..461b6b57c4b7 100644
--- a/src/lake/Lake/Build/Trace.lean
+++ b/src/lake/Lake/Build/Trace.lean
@@ -3,7 +3,6 @@ Copyright (c) 2021 Mac Malone. All rights reserved.
Released under Apache 2.0 license as described in the file LICENSE.
Authors: Mac Malone
-/
-import Lake.Util.Newline
open System
namespace Lake
@@ -135,7 +134,7 @@ instance : ComputeHash FilePath IO := ⟨computeFileHash⟩
def computeTextFileHash (file : FilePath) : IO Hash := do
let text ← IO.FS.readFile file
- let text := crlf2lf text
+ let text := text.crlfToLf
return Hash.ofString text
/--
diff --git a/src/lake/Lake/Util/Newline.lean b/src/lake/Lake/Util/Newline.lean
deleted file mode 100644
index fa0cb415f998..000000000000
--- a/src/lake/Lake/Util/Newline.lean
+++ /dev/null
@@ -1,24 +0,0 @@
-/-
-Copyright (c) 2023 Mario Carneiro. All rights reserved.
-Released under Apache 2.0 license as described in the file LICENSE.
-Authors: Mario Carneiro
--/
-
-namespace Lake
-
-/-- This is the same as `String.replace text "\r\n" "\n"`, but more efficient. -/
-@[inline] partial def crlf2lf (text : String) : String :=
- go "" 0 0
-where
- go (acc : String) (accStop pos : String.Pos) : String :=
- if h : text.atEnd pos then
- -- note: if accStop = 0 then acc is empty
- if accStop = 0 then text else acc ++ text.extract accStop pos
- else
- let c := text.get' pos h
- let pos' := text.next' pos h
- if c == '\r' && text.get pos' == '\n' then
- let acc := acc ++ text.extract accStop pos
- go acc pos' (text.next pos')
- else
- go acc accStop pos'
diff --git a/src/lake/tests/toml/Test.lean b/src/lake/tests/toml/Test.lean
index 5d025bfacab6..ca671fc64070 100644
--- a/src/lake/tests/toml/Test.lean
+++ b/src/lake/tests/toml/Test.lean
@@ -5,7 +5,6 @@ Authors: Mac Malone
-/
import Lake.Toml
import Lake.Util.Message
-import Lake.Util.Newline
/-!
## TOML Test Runner
@@ -30,7 +29,7 @@ nonrec def loadToml (tomlFile : FilePath) : BaseIO TomlOutcome := do
match (← IO.FS.readBinFile tomlFile |>.toBaseIO) with
| .ok bytes =>
if let some input := String.fromUTF8? bytes then
- pure (crlf2lf input)
+ pure input.crlfToLf
else
return .fail <| MessageLog.empty.add
{fileName, pos := ⟨1,0⟩, data := m!"file contains invalid characters"}
diff --git a/tests/lean/run/crlfToLf.lean b/tests/lean/run/crlfToLf.lean
new file mode 100644
index 000000000000..278c621dce3e
--- /dev/null
+++ b/tests/lean/run/crlfToLf.lean
@@ -0,0 +1,39 @@
+/-!
+# Test `String.crlfToLf`
+-/
+
+/-!
+Leaves single `\n`'s alone.
+-/
+/-- info: "hello\nworld" -/
+#guard_msgs in #eval String.crlfToLf "hello\nworld"
+
+/-!
+Turns `\r\n` into `\n`.
+-/
+/-- info: "hello\nworld" -/
+#guard_msgs in #eval String.crlfToLf "hello\r\nworld"
+
+/-!
+In a string of `\r...\r\n`, only normalizes the last `\r\n`.
+-/
+/-- info: "hello\x0d\nworld" -/
+#guard_msgs in #eval String.crlfToLf "hello\r\r\nworld"
+
+/-!
+Two in a row.
+-/
+/-- info: "hello\n\nworld" -/
+#guard_msgs in #eval String.crlfToLf "hello\r\n\r\nworld"
+
+/-!
+Normalizes at the end.
+-/
+/-- info: "hello\nworld\n" -/
+#guard_msgs in #eval String.crlfToLf "hello\r\nworld\r\n"
+
+/-!
+Can handle a loose `\r` as the last character.
+-/
+/-- info: "hello\nworld\x0d" -/
+#guard_msgs in #eval String.crlfToLf "hello\r\nworld\r"
diff --git a/tests/lean/string_gaps.lean b/tests/lean/run/string_gaps.lean
similarity index 75%
rename from tests/lean/string_gaps.lean
rename to tests/lean/run/string_gaps.lean
index b6aadbe09c69..c0af8cf1448d 100644
--- a/tests/lean/string_gaps.lean
+++ b/tests/lean/run/string_gaps.lean
@@ -10,24 +10,32 @@ String gaps are described in RFC #2838
/-!
A string gap with no trailing space.
-/
+/-- info: "ab" -/
+#guard_msgs in
#eval "a\
b"
/-!
A string gap with trailing space before the `b`, which is consumed.
-/
+/-- info: "ab" -/
+#guard_msgs in
#eval "a\
b"
/-!
A string gap with space before the gap, which is not consumed.
-/
+/-- info: "a b" -/
+#guard_msgs in
#eval "a \
b"
/-!
Multiple string gaps in a row.
-/
+/-- info: "a b" -/
+#guard_msgs in
#eval "a \
\
\
@@ -36,16 +44,24 @@ Multiple string gaps in a row.
/-!
Two tests from the RFC.
-/
+/-- info: "this is a string" -/
+#guard_msgs in
#eval "this is \
a string"
+/-- info: "this is a string" -/
+#guard_msgs in
#eval "this is \
a string"
/-!
Two examples of how spaces are accounted for in string gaps. `\x20` is a way to force a leading space.
-/
+/-- info: "there are three spaces between the brackets < >" -/
+#guard_msgs in
#eval "there are three spaces between the brackets < \
>"
+/-- info: "there are three spaces between the brackets < >" -/
+#guard_msgs in
#eval "there are three spaces between the brackets <\
\x20 >"
@@ -53,22 +69,32 @@ Two examples of how spaces are accounted for in string gaps. `\x20` is a way to
Using `\n` to terminate a string gap, which is a technique suggested by Mario for using string gaps to write
multiline literals in an indented context.
-/
+/-- info: "this is\n a string with two space indent" -/
+#guard_msgs in
#eval "this is\
\n a string with two space indent"
/-!
Similar tests but for interpolated strings.
-/
+/-- info: "ab" -/
+#guard_msgs in
#eval s!"a\
b"
+/-- info: "ab" -/
+#guard_msgs in
#eval s!"a\
b"
+/-- info: "ab" -/
+#guard_msgs in
#eval s!"a\
b"
/-!
The `{` terminates the string gap.
-/
+/-- info: "ab" -/
+#guard_msgs in
#eval s!"a\
{"b"}\
"
@@ -82,22 +108,18 @@ open Lean
/-!
Standard string gap, with LF
-/
+/-- info: "ab" -/
+#guard_msgs in
#eval show MetaM String from do
let stx ← ofExcept <| Parser.runParserCategory (← getEnv) `term "\"a\\\n b\""
let some s := stx.isStrLit? | failure
return s
-/-!
-Standard string gap, with CRLF
--/
-#eval show MetaM String from do
- let stx ← ofExcept <| Parser.runParserCategory (← getEnv) `term "\"a\\\r\n b\""
- let some s := stx.isStrLit? | failure
- return s
-
/-!
Isolated CR, which is an error
-/
+/-- error: :1:3: invalid escape sequence -/
+#guard_msgs (error, drop info) in
#eval show MetaM String from do
let stx ← ofExcept <| Parser.runParserCategory (← getEnv) `term "\"a\\\r b\""
let some s := stx.isStrLit? | failure
@@ -106,6 +128,8 @@ Isolated CR, which is an error
/-!
Not a string gap since there's no end-of-line.
-/
+/-- error: :1:3: invalid escape sequence -/
+#guard_msgs (error, drop info) in
#eval show MetaM String from do
let stx ← ofExcept <| Parser.runParserCategory (← getEnv) `term "\"a\\ b\""
let some s := stx.isStrLit? | failure
@@ -134,6 +158,8 @@ elab "d!" s:str : term => do
let some s := String.dedent s | Lean.Elab.throwIllFormedSyntax
pure $ Lean.mkStrLit s
+/-- info: "this is line 1\n line 2, indented\nline 3" -/
+#guard_msgs in
#eval d!"this is \
line 1
| line 2, indented
diff --git a/tests/lean/string_gaps.lean.expected.out b/tests/lean/string_gaps.lean.expected.out
deleted file mode 100644
index 386346a9ab02..000000000000
--- a/tests/lean/string_gaps.lean.expected.out
+++ /dev/null
@@ -1,20 +0,0 @@
-"ab"
-"ab"
-"a b"
-"a b"
-"this is a string"
-"this is a string"
-"there are three spaces between the brackets < >"
-"there are three spaces between the brackets < >"
-"this is\n a string with two space indent"
-"ab"
-"ab"
-"ab"
-"ab"
-"ab"
-"ab"
-
-string_gaps.lean:101:0-104:10: error: :1:4: expecting newline after carriage return
-
-string_gaps.lean:109:0-112:10: error: :1:3: invalid escape sequence
-"this is line 1\n line 2, indented\nline 3"