diff --git a/src/Init/Data/String/Extra.lean b/src/Init/Data/String/Extra.lean index 2739c707d678..2cc70b3a3270 100644 --- a/src/Init/Data/String/Extra.lean +++ b/src/Init/Data/String/Extra.lean @@ -198,4 +198,35 @@ def removeLeadingSpaces (s : String) : String := let n := findLeadingSpacesSize s if n == 0 then s else removeNumLeadingSpaces n s +/-- +Replaces each `\r\n` with `\n` to normalize line endings, +but does not validate that there are no isolated `\r` characters. +It is an optimized version of `String.replace text "\r\n" "\n"`. +-/ +def crlfToLf (text : String) : String := + go "" 0 0 +where + go (acc : String) (accStop pos : String.Pos) : String := + if h : text.atEnd pos then + -- note: if accStop = 0 then acc is empty + if accStop = 0 then text else acc ++ text.extract accStop pos + else + let c := text.get' pos h + let pos' := text.next' pos h + if h' : ¬ text.atEnd pos' ∧ c == '\r' ∧ text.get pos' == '\n' then + let acc := acc ++ text.extract accStop pos + go acc pos' (text.next' pos' h'.1) + else + go acc accStop pos' + termination_by text.utf8ByteSize - pos.byteIdx + decreasing_by + decreasing_with + show text.utf8ByteSize - (text.next' (text.next' pos _) _).byteIdx < text.utf8ByteSize - pos.byteIdx + have k := Nat.gt_of_not_le <| mt decide_eq_true h + exact Nat.sub_lt_sub_left k (Nat.lt_trans (String.lt_next text pos) (String.lt_next _ _)) + decreasing_with + show text.utf8ByteSize - (text.next' pos _).byteIdx < text.utf8ByteSize - pos.byteIdx + have k := Nat.gt_of_not_le <| mt decide_eq_true h + exact Nat.sub_lt_sub_left k (String.lt_next _ _) + end String diff --git a/src/Lean/Data/Position.lean b/src/Lean/Data/Position.lean index 2af14fc623f4..7495b03f5d86 100644 --- a/src/Lean/Data/Position.lean +++ b/src/Lean/Data/Position.lean @@ -106,7 +106,7 @@ def ofPosition (text : FileMap) (pos : Position) : String.Pos := /-- Returns the position of the start of (1-based) line `line`. -This gives the stame result as `map.ofPosition ⟨line, 0⟩`, but is more efficient. +This gives the same result as `map.ofPosition ⟨line, 0⟩`, but is more efficient. -/ def lineStart (map : FileMap) (line : Nat) : String.Pos := if h : line - 1 < map.positions.size then diff --git a/src/Lean/Parser/Basic.lean b/src/Lean/Parser/Basic.lean index 7ba45f62a652..253afbaefbbd 100644 --- a/src/Lean/Parser/Basic.lean +++ b/src/Lean/Parser/Basic.lean @@ -558,6 +558,8 @@ partial def whitespace : ParserFn := fun c s => let curr := input.get' i h if curr == '\t' then s.mkUnexpectedError (pushMissing := false) "tabs are not allowed; please configure your editor to expand them" + else if curr == '\r' then + s.mkUnexpectedError (pushMissing := false) "isolated carriage returns are not allowed" else if curr.isWhitespace then whitespace c (s.next' input i h) else if curr == '-' then let i := input.next' i h @@ -621,9 +623,8 @@ def hexDigitFn : ParserFn := fun c s => /-- Parses the whitespace after the `\` when there is a string gap. Raises an error if the whitespace does not contain exactly one newline character. -Processes `\r\n` as a newline. -/ -partial def stringGapFn (seenNewline afterCR : Bool) : ParserFn := fun c s => +partial def stringGapFn (seenNewline : Bool) : ParserFn := fun c s => let i := s.pos if h : c.input.atEnd i then s -- let strLitFnAux handle the EOI error if !seenNewline else @@ -633,13 +634,9 @@ partial def stringGapFn (seenNewline afterCR : Bool) : ParserFn := fun c s => -- Having more than one newline in a string gap is visually confusing s.mkUnexpectedError "unexpected additional newline in string gap" else - stringGapFn true false c (s.next' c.input i h) - else if curr == '\r' then - stringGapFn seenNewline true c (s.next' c.input i h) - else if afterCR then - s.mkUnexpectedError "expecting newline after carriage return" + stringGapFn true c (s.next' c.input i h) else if curr.isWhitespace then - stringGapFn seenNewline false c (s.next' c.input i h) + stringGapFn seenNewline c (s.next' c.input i h) else if seenNewline then s else @@ -663,8 +660,8 @@ def quotedCharCoreFn (isQuotable : Char → Bool) (inString : Bool) : ParserFn : andthenFn hexDigitFn hexDigitFn c (s.next' input i h) else if curr == 'u' then andthenFn hexDigitFn (andthenFn hexDigitFn (andthenFn hexDigitFn hexDigitFn)) c (s.next' input i h) - else if inString && (curr == '\n' || curr == '\r') then - stringGapFn false false c s + else if inString && curr == '\n' then + stringGapFn false c s else s.mkUnexpectedError "invalid escape sequence" diff --git a/src/Lean/Parser/Extension.lean b/src/Lean/Parser/Extension.lean index 562900d894fd..03ee8095161c 100644 --- a/src/Lean/Parser/Extension.lean +++ b/src/Lean/Parser/Extension.lean @@ -437,11 +437,12 @@ def getSyntaxNodeKinds (env : Environment) : List SyntaxNodeKind := def getTokenTable (env : Environment) : TokenTable := (parserExtension.getState env).tokens -def mkInputContext (input : String) (fileName : String) : InputContext := { - input := input, - fileName := fileName, - fileMap := input.toFileMap -} +-- Note: `crlfToLf` preserves logical line and column numbers for each character. +def mkInputContext (input : String) (fileName : String) (normalizeLineEndings := true) : InputContext := + let input' := if normalizeLineEndings then input.crlfToLf else input + { input := input', + fileName := fileName, + fileMap := input'.toFileMap } def mkParserState (input : String) : ParserState := { cache := initCacheForInput input } @@ -453,7 +454,7 @@ def runParserCategory (env : Environment) (catName : Name) (input : String) (fil let s := p.run ictx { env, options := {} } (getTokenTable env) (mkParserState input) if !s.allErrors.isEmpty then Except.error (s.toErrorMsg ictx) - else if input.atEnd s.pos then + else if ictx.input.atEnd s.pos then Except.ok s.stxStack.back else Except.error ((s.mkError "end of input").toErrorMsg ictx) diff --git a/src/Lean/Server/FileWorker.lean b/src/Lean/Server/FileWorker.lean index 22db77f42503..8fa0c365f2f6 100644 --- a/src/Lean/Server/FileWorker.lean +++ b/src/Lean/Server/FileWorker.lean @@ -705,12 +705,9 @@ def initAndRunWorker (i o e : FS.Stream) (opts : Options) : IO UInt32 := do let initParams ← i.readLspRequestAs "initialize" InitializeParams let ⟨_, param⟩ ← i.readLspNotificationAs "textDocument/didOpen" LeanDidOpenTextDocumentParams let doc := param.textDocument - /- NOTE(WN): `toFileMap` marks line beginnings as immediately following - "\n", which should be enough to handle both LF and CRLF correctly. - This is because LSP always refers to characters by (line, column), - so if we get the line number correct it shouldn't matter that there - is a CR there. -/ - let meta : DocumentMeta := ⟨doc.uri, doc.version, doc.text.toFileMap, param.dependencyBuildMode?.getD .always⟩ + /- Note (kmill): LSP always refers to characters by (line, column), + so converting CRLF to LF preserves line and column numbers. -/ + let meta : DocumentMeta := ⟨doc.uri, doc.version, doc.text.crlfToLf.toFileMap, param.dependencyBuildMode?.getD .always⟩ let e := e.withPrefix s!"[{param.textDocument.uri}] " let _ ← IO.setStderr e let (ctx, st) ← try diff --git a/src/Lean/Server/Utils.lean b/src/Lean/Server/Utils.lean index b07428744e9c..01104108baf3 100644 --- a/src/Lean/Server/Utils.lean +++ b/src/Lean/Server/Utils.lean @@ -75,7 +75,10 @@ structure DocumentMeta where uri : Lsp.DocumentUri /-- Version number of the document. Incremented whenever the document is edited. -/ version : Nat - /-- Current text of the document. -/ + /-- + Current text of the document. + It is maintained such that it is normalized using `String.crlfToLf`, which preserves logical line/column numbers. + (Note: we assume that edit operations never split or merge `\r\n` line endings.) -/ text : FileMap /-- Controls when dependencies of the document are built on `textDocument/didOpen` notifications. @@ -98,7 +101,11 @@ def replaceLspRange (text : FileMap) (r : Lsp.Range) (newText : String) : FileMa let «end» := text.lspPosToUtf8Pos r.«end» let pre := text.source.extract 0 start let post := text.source.extract «end» text.source.endPos - (pre ++ newText ++ post).toFileMap + -- `pre` and `post` already have normalized line endings, so only `newText` needs its endings normalized. + -- Note: this assumes that editing never separates a `\r\n`. + -- If `pre` ends with `\r` and `newText` begins with `\n`, the result is potentially inaccurate. + -- If this is ever a problem, we could store a second unnormalized FileMap, edit it, and normalize it here. + (pre ++ newText.crlfToLf ++ post).toFileMap open IO @@ -125,7 +132,7 @@ def applyDocumentChange (oldText : FileMap) : (change : Lsp.TextDocumentContentC | TextDocumentContentChangeEvent.rangeChange (range : Range) (newText : String) => replaceLspRange oldText range newText | TextDocumentContentChangeEvent.fullChange (newText : String) => - newText.toFileMap + newText.crlfToLf.toFileMap /-- Returns the document contents with all changes applied. -/ def foldDocumentChanges (changes : Array Lsp.TextDocumentContentChangeEvent) (oldText : FileMap) : FileMap := diff --git a/src/Lean/Server/Watchdog.lean b/src/Lean/Server/Watchdog.lean index 045d797ede2f..4d7c040759c6 100644 --- a/src/Lean/Server/Watchdog.lean +++ b/src/Lean/Server/Watchdog.lean @@ -728,12 +728,9 @@ end RequestHandling section NotificationHandling def handleDidOpen (p : LeanDidOpenTextDocumentParams) : ServerM Unit := let doc := p.textDocument - /- NOTE(WN): `toFileMap` marks line beginnings as immediately following - "\n", which should be enough to handle both LF and CRLF correctly. - This is because LSP always refers to characters by (line, column), - so if we get the line number correct it shouldn't matter that there - is a CR there. -/ - startFileWorker ⟨doc.uri, doc.version, doc.text.toFileMap, p.dependencyBuildMode?.getD .always⟩ + /- Note (kmill): LSP always refers to characters by (line, column), + so converting CRLF to LF preserves line and column numbers. -/ + startFileWorker ⟨doc.uri, doc.version, doc.text.crlfToLf.toFileMap, p.dependencyBuildMode?.getD .always⟩ def handleDidChange (p : DidChangeTextDocumentParams) : ServerM Unit := do let doc := p.textDocument diff --git a/src/lake/Lake/Build/Trace.lean b/src/lake/Lake/Build/Trace.lean index bfe064fb8900..461b6b57c4b7 100644 --- a/src/lake/Lake/Build/Trace.lean +++ b/src/lake/Lake/Build/Trace.lean @@ -3,7 +3,6 @@ Copyright (c) 2021 Mac Malone. All rights reserved. Released under Apache 2.0 license as described in the file LICENSE. Authors: Mac Malone -/ -import Lake.Util.Newline open System namespace Lake @@ -135,7 +134,7 @@ instance : ComputeHash FilePath IO := ⟨computeFileHash⟩ def computeTextFileHash (file : FilePath) : IO Hash := do let text ← IO.FS.readFile file - let text := crlf2lf text + let text := text.crlfToLf return Hash.ofString text /-- diff --git a/src/lake/Lake/Util/Newline.lean b/src/lake/Lake/Util/Newline.lean deleted file mode 100644 index fa0cb415f998..000000000000 --- a/src/lake/Lake/Util/Newline.lean +++ /dev/null @@ -1,24 +0,0 @@ -/- -Copyright (c) 2023 Mario Carneiro. All rights reserved. -Released under Apache 2.0 license as described in the file LICENSE. -Authors: Mario Carneiro --/ - -namespace Lake - -/-- This is the same as `String.replace text "\r\n" "\n"`, but more efficient. -/ -@[inline] partial def crlf2lf (text : String) : String := - go "" 0 0 -where - go (acc : String) (accStop pos : String.Pos) : String := - if h : text.atEnd pos then - -- note: if accStop = 0 then acc is empty - if accStop = 0 then text else acc ++ text.extract accStop pos - else - let c := text.get' pos h - let pos' := text.next' pos h - if c == '\r' && text.get pos' == '\n' then - let acc := acc ++ text.extract accStop pos - go acc pos' (text.next pos') - else - go acc accStop pos' diff --git a/src/lake/tests/toml/Test.lean b/src/lake/tests/toml/Test.lean index 5d025bfacab6..ca671fc64070 100644 --- a/src/lake/tests/toml/Test.lean +++ b/src/lake/tests/toml/Test.lean @@ -5,7 +5,6 @@ Authors: Mac Malone -/ import Lake.Toml import Lake.Util.Message -import Lake.Util.Newline /-! ## TOML Test Runner @@ -30,7 +29,7 @@ nonrec def loadToml (tomlFile : FilePath) : BaseIO TomlOutcome := do match (← IO.FS.readBinFile tomlFile |>.toBaseIO) with | .ok bytes => if let some input := String.fromUTF8? bytes then - pure (crlf2lf input) + pure input.crlfToLf else return .fail <| MessageLog.empty.add {fileName, pos := ⟨1,0⟩, data := m!"file contains invalid characters"} diff --git a/tests/lean/run/crlfToLf.lean b/tests/lean/run/crlfToLf.lean new file mode 100644 index 000000000000..278c621dce3e --- /dev/null +++ b/tests/lean/run/crlfToLf.lean @@ -0,0 +1,39 @@ +/-! +# Test `String.crlfToLf` +-/ + +/-! +Leaves single `\n`'s alone. +-/ +/-- info: "hello\nworld" -/ +#guard_msgs in #eval String.crlfToLf "hello\nworld" + +/-! +Turns `\r\n` into `\n`. +-/ +/-- info: "hello\nworld" -/ +#guard_msgs in #eval String.crlfToLf "hello\r\nworld" + +/-! +In a string of `\r...\r\n`, only normalizes the last `\r\n`. +-/ +/-- info: "hello\x0d\nworld" -/ +#guard_msgs in #eval String.crlfToLf "hello\r\r\nworld" + +/-! +Two in a row. +-/ +/-- info: "hello\n\nworld" -/ +#guard_msgs in #eval String.crlfToLf "hello\r\n\r\nworld" + +/-! +Normalizes at the end. +-/ +/-- info: "hello\nworld\n" -/ +#guard_msgs in #eval String.crlfToLf "hello\r\nworld\r\n" + +/-! +Can handle a loose `\r` as the last character. +-/ +/-- info: "hello\nworld\x0d" -/ +#guard_msgs in #eval String.crlfToLf "hello\r\nworld\r" diff --git a/tests/lean/string_gaps.lean b/tests/lean/run/string_gaps.lean similarity index 75% rename from tests/lean/string_gaps.lean rename to tests/lean/run/string_gaps.lean index b6aadbe09c69..c0af8cf1448d 100644 --- a/tests/lean/string_gaps.lean +++ b/tests/lean/run/string_gaps.lean @@ -10,24 +10,32 @@ String gaps are described in RFC #2838 /-! A string gap with no trailing space. -/ +/-- info: "ab" -/ +#guard_msgs in #eval "a\ b" /-! A string gap with trailing space before the `b`, which is consumed. -/ +/-- info: "ab" -/ +#guard_msgs in #eval "a\ b" /-! A string gap with space before the gap, which is not consumed. -/ +/-- info: "a b" -/ +#guard_msgs in #eval "a \ b" /-! Multiple string gaps in a row. -/ +/-- info: "a b" -/ +#guard_msgs in #eval "a \ \ \ @@ -36,16 +44,24 @@ Multiple string gaps in a row. /-! Two tests from the RFC. -/ +/-- info: "this is a string" -/ +#guard_msgs in #eval "this is \ a string" +/-- info: "this is a string" -/ +#guard_msgs in #eval "this is \ a string" /-! Two examples of how spaces are accounted for in string gaps. `\x20` is a way to force a leading space. -/ +/-- info: "there are three spaces between the brackets < >" -/ +#guard_msgs in #eval "there are three spaces between the brackets < \ >" +/-- info: "there are three spaces between the brackets < >" -/ +#guard_msgs in #eval "there are three spaces between the brackets <\ \x20 >" @@ -53,22 +69,32 @@ Two examples of how spaces are accounted for in string gaps. `\x20` is a way to Using `\n` to terminate a string gap, which is a technique suggested by Mario for using string gaps to write multiline literals in an indented context. -/ +/-- info: "this is\n a string with two space indent" -/ +#guard_msgs in #eval "this is\ \n a string with two space indent" /-! Similar tests but for interpolated strings. -/ +/-- info: "ab" -/ +#guard_msgs in #eval s!"a\ b" +/-- info: "ab" -/ +#guard_msgs in #eval s!"a\ b" +/-- info: "ab" -/ +#guard_msgs in #eval s!"a\ b" /-! The `{` terminates the string gap. -/ +/-- info: "ab" -/ +#guard_msgs in #eval s!"a\ {"b"}\ " @@ -82,22 +108,18 @@ open Lean /-! Standard string gap, with LF -/ +/-- info: "ab" -/ +#guard_msgs in #eval show MetaM String from do let stx ← ofExcept <| Parser.runParserCategory (← getEnv) `term "\"a\\\n b\"" let some s := stx.isStrLit? | failure return s -/-! -Standard string gap, with CRLF --/ -#eval show MetaM String from do - let stx ← ofExcept <| Parser.runParserCategory (← getEnv) `term "\"a\\\r\n b\"" - let some s := stx.isStrLit? | failure - return s - /-! Isolated CR, which is an error -/ +/-- error: :1:3: invalid escape sequence -/ +#guard_msgs (error, drop info) in #eval show MetaM String from do let stx ← ofExcept <| Parser.runParserCategory (← getEnv) `term "\"a\\\r b\"" let some s := stx.isStrLit? | failure @@ -106,6 +128,8 @@ Isolated CR, which is an error /-! Not a string gap since there's no end-of-line. -/ +/-- error: :1:3: invalid escape sequence -/ +#guard_msgs (error, drop info) in #eval show MetaM String from do let stx ← ofExcept <| Parser.runParserCategory (← getEnv) `term "\"a\\ b\"" let some s := stx.isStrLit? | failure @@ -134,6 +158,8 @@ elab "d!" s:str : term => do let some s := String.dedent s | Lean.Elab.throwIllFormedSyntax pure $ Lean.mkStrLit s +/-- info: "this is line 1\n line 2, indented\nline 3" -/ +#guard_msgs in #eval d!"this is \ line 1 | line 2, indented diff --git a/tests/lean/string_gaps.lean.expected.out b/tests/lean/string_gaps.lean.expected.out deleted file mode 100644 index 386346a9ab02..000000000000 --- a/tests/lean/string_gaps.lean.expected.out +++ /dev/null @@ -1,20 +0,0 @@ -"ab" -"ab" -"a b" -"a b" -"this is a string" -"this is a string" -"there are three spaces between the brackets < >" -"there are three spaces between the brackets < >" -"this is\n a string with two space indent" -"ab" -"ab" -"ab" -"ab" -"ab" -"ab" - -string_gaps.lean:101:0-104:10: error: :1:4: expecting newline after carriage return - -string_gaps.lean:109:0-112:10: error: :1:3: invalid escape sequence -"this is line 1\n line 2, indented\nline 3"