From 39edcfa84e7e715d6ac78f3dfdadfb1046d31b56 Mon Sep 17 00:00:00 2001
From: David Tolnay <dtolnay@gmail.com>
Date: Sat, 14 Jan 2023 11:02:14 -0800
Subject: [PATCH 1/2] Add more nbsp to unicode-chars test

---
 tests/ui/parser/unicode-chars.rs     |  8 +++++-
 tests/ui/parser/unicode-chars.stderr | 39 +++++++++++++++++++++++++---
 2 files changed, 43 insertions(+), 4 deletions(-)

diff --git a/tests/ui/parser/unicode-chars.rs b/tests/ui/parser/unicode-chars.rs
index ba35e95c82a49..b989205632155 100644
--- a/tests/ui/parser/unicode-chars.rs
+++ b/tests/ui/parser/unicode-chars.rs
@@ -2,8 +2,14 @@ fn main() {
     let y = 0;
     //~^ ERROR unknown start of token: \u{37e}
     //~^^ HELP Unicode character ';' (Greek Question Mark) looks like ';' (Semicolon), but it is not
-        let x = 0;
+        let x = 0;
     //~^ ERROR unknown start of token: \u{a0}
     //~^^ NOTE character appears 3 more times
     //~^^^ HELP Unicode character ' ' (No-Break Space) looks like ' ' (Space), but it is not
+    //~^^^^ ERROR unknown start of token: \u{a0}
+    //~^^^^^ HELP Unicode character ' ' (No-Break Space) looks like ' ' (Space), but it is not
+    //~^^^^^^ ERROR unknown start of token: \u{a0}
+    //~^^^^^^^ HELP Unicode character ' ' (No-Break Space) looks like ' ' (Space), but it is not
+    //~^^^^^^^^ ERROR unknown start of token: \u{a0}
+    //~^^^^^^^^^ HELP Unicode character ' ' (No-Break Space) looks like ' ' (Space), but it is not
 }
diff --git a/tests/ui/parser/unicode-chars.stderr b/tests/ui/parser/unicode-chars.stderr
index 6a5b27872e738..93ac5ec14be5b 100644
--- a/tests/ui/parser/unicode-chars.stderr
+++ b/tests/ui/parser/unicode-chars.stderr
@@ -12,14 +12,47 @@ LL |     let y = 0;
 error: unknown start of token: \u{a0}
   --> $DIR/unicode-chars.rs:5:5
    |
-LL |         let x = 0;
+LL |         let x = 0;
    |     ^^^^
    |
    = note: character appears 3 more times
 help: Unicode character ' ' (No-Break Space) looks like ' ' (Space), but it is not
    |
-LL |         let x = 0;
+LL |         let x = 0;
    |     ++++
 
-error: aborting due to 2 previous errors
+error: unknown start of token: \u{a0}
+  --> $DIR/unicode-chars.rs:5:12
+   |
+LL |         let x = 0;
+   |            ^
+   |
+help: Unicode character ' ' (No-Break Space) looks like ' ' (Space), but it is not
+   |
+LL |         let x = 0;
+   |            +
+
+error: unknown start of token: \u{a0}
+  --> $DIR/unicode-chars.rs:5:14
+   |
+LL |         let x = 0;
+   |              ^
+   |
+help: Unicode character ' ' (No-Break Space) looks like ' ' (Space), but it is not
+   |
+LL |         let x = 0;
+   |              +
+
+error: unknown start of token: \u{a0}
+  --> $DIR/unicode-chars.rs:5:16
+   |
+LL |         let x = 0;
+   |                ^
+   |
+help: Unicode character ' ' (No-Break Space) looks like ' ' (Space), but it is not
+   |
+LL |         let x = 0;
+   |                +
+
+error: aborting due to 5 previous errors
 

From dab06ccdab57d160de2e9afea752929ad39ee534 Mon Sep 17 00:00:00 2001
From: David Tolnay <dtolnay@gmail.com>
Date: Sat, 14 Jan 2023 10:34:06 -0800
Subject: [PATCH 2/2] Emit only one nbsp error per file

---
 compiler/rustc_parse/src/lexer/mod.rs | 31 ++++++++++++++++++++----
 tests/ui/parser/unicode-chars.rs      |  6 -----
 tests/ui/parser/unicode-chars.stderr  | 35 +--------------------------
 3 files changed, 27 insertions(+), 45 deletions(-)

diff --git a/compiler/rustc_parse/src/lexer/mod.rs b/compiler/rustc_parse/src/lexer/mod.rs
index 8761c23625b21..9fe8d9836ba60 100644
--- a/compiler/rustc_parse/src/lexer/mod.rs
+++ b/compiler/rustc_parse/src/lexer/mod.rs
@@ -52,8 +52,15 @@ pub(crate) fn parse_token_trees<'a>(
     }
 
     let cursor = Cursor::new(src);
-    let string_reader =
-        StringReader { sess, start_pos, pos: start_pos, src, cursor, override_span };
+    let string_reader = StringReader {
+        sess,
+        start_pos,
+        pos: start_pos,
+        src,
+        cursor,
+        override_span,
+        nbsp_is_whitespace: false,
+    };
     tokentrees::TokenTreesReader::parse_all_token_trees(string_reader)
 }
 
@@ -68,6 +75,10 @@ struct StringReader<'a> {
     /// Cursor for getting lexer tokens.
     cursor: Cursor<'a>,
     override_span: Option<Span>,
+    /// When an "unknown start of token: \u{a0}" error has already been emitted
+    /// earlier in this file, it's safe to treat further occurrences of the
+    /// non-breaking space character as whitespace.
+    nbsp_is_whitespace: bool,
 }
 
 impl<'a> StringReader<'a> {
@@ -239,6 +250,16 @@ impl<'a> StringReader<'a> {
                     }
                     let mut it = self.str_from_to_end(start).chars();
                     let c = it.next().unwrap();
+                    if c == '\u{00a0}' {
+                        // If an error has already been reported on non-breaking
+                        // space characters earlier in the file, treat all
+                        // subsequent occurrences as whitespace.
+                        if self.nbsp_is_whitespace {
+                            preceded_by_whitespace = true;
+                            continue;
+                        }
+                        self.nbsp_is_whitespace = true;
+                    }
                     let repeats = it.take_while(|c1| *c1 == c).count();
                     let mut err =
                         self.struct_err_span_char(start, self.pos + Pos::from_usize(repeats * c.len_utf8()), "unknown start of token", c);
@@ -486,7 +507,7 @@ impl<'a> StringReader<'a> {
 
     /// Slice of the source text from `start` up to but excluding `self.pos`,
     /// meaning the slice does not include the character `self.ch`.
-    fn str_from(&self, start: BytePos) -> &str {
+    fn str_from(&self, start: BytePos) -> &'a str {
         self.str_from_to(start, self.pos)
     }
 
@@ -497,12 +518,12 @@ impl<'a> StringReader<'a> {
     }
 
     /// Slice of the source text spanning from `start` up to but excluding `end`.
-    fn str_from_to(&self, start: BytePos, end: BytePos) -> &str {
+    fn str_from_to(&self, start: BytePos, end: BytePos) -> &'a str {
         &self.src[self.src_index(start)..self.src_index(end)]
     }
 
     /// Slice of the source text spanning from `start` until the end
-    fn str_from_to_end(&self, start: BytePos) -> &str {
+    fn str_from_to_end(&self, start: BytePos) -> &'a str {
         &self.src[self.src_index(start)..]
     }
 
diff --git a/tests/ui/parser/unicode-chars.rs b/tests/ui/parser/unicode-chars.rs
index b989205632155..f0122561f463d 100644
--- a/tests/ui/parser/unicode-chars.rs
+++ b/tests/ui/parser/unicode-chars.rs
@@ -6,10 +6,4 @@ fn main() {
     //~^ ERROR unknown start of token: \u{a0}
     //~^^ NOTE character appears 3 more times
     //~^^^ HELP Unicode character ' ' (No-Break Space) looks like ' ' (Space), but it is not
-    //~^^^^ ERROR unknown start of token: \u{a0}
-    //~^^^^^ HELP Unicode character ' ' (No-Break Space) looks like ' ' (Space), but it is not
-    //~^^^^^^ ERROR unknown start of token: \u{a0}
-    //~^^^^^^^ HELP Unicode character ' ' (No-Break Space) looks like ' ' (Space), but it is not
-    //~^^^^^^^^ ERROR unknown start of token: \u{a0}
-    //~^^^^^^^^^ HELP Unicode character ' ' (No-Break Space) looks like ' ' (Space), but it is not
 }
diff --git a/tests/ui/parser/unicode-chars.stderr b/tests/ui/parser/unicode-chars.stderr
index 93ac5ec14be5b..b1d4a0af71154 100644
--- a/tests/ui/parser/unicode-chars.stderr
+++ b/tests/ui/parser/unicode-chars.stderr
@@ -21,38 +21,5 @@ help: Unicode character ' ' (No-Break Space) looks like ' ' (Space), but it is
 LL |         let x = 0;
    |     ++++
 
-error: unknown start of token: \u{a0}
-  --> $DIR/unicode-chars.rs:5:12
-   |
-LL |         let x = 0;
-   |            ^
-   |
-help: Unicode character ' ' (No-Break Space) looks like ' ' (Space), but it is not
-   |
-LL |         let x = 0;
-   |            +
-
-error: unknown start of token: \u{a0}
-  --> $DIR/unicode-chars.rs:5:14
-   |
-LL |         let x = 0;
-   |              ^
-   |
-help: Unicode character ' ' (No-Break Space) looks like ' ' (Space), but it is not
-   |
-LL |         let x = 0;
-   |              +
-
-error: unknown start of token: \u{a0}
-  --> $DIR/unicode-chars.rs:5:16
-   |
-LL |         let x = 0;
-   |                ^
-   |
-help: Unicode character ' ' (No-Break Space) looks like ' ' (Space), but it is not
-   |
-LL |         let x = 0;
-   |                +
-
-error: aborting due to 5 previous errors
+error: aborting due to 2 previous errors