From 39edcfa84e7e715d6ac78f3dfdadfb1046d31b56 Mon Sep 17 00:00:00 2001 From: David Tolnay Date: Sat, 14 Jan 2023 11:02:14 -0800 Subject: [PATCH 1/2] Add more nbsp to unicode-chars test --- tests/ui/parser/unicode-chars.rs | 8 +++++- tests/ui/parser/unicode-chars.stderr | 39 +++++++++++++++++++++++++--- 2 files changed, 43 insertions(+), 4 deletions(-) diff --git a/tests/ui/parser/unicode-chars.rs b/tests/ui/parser/unicode-chars.rs index ba35e95c82a49..b989205632155 100644 --- a/tests/ui/parser/unicode-chars.rs +++ b/tests/ui/parser/unicode-chars.rs @@ -2,8 +2,14 @@ fn main() { let y = 0; //~^ ERROR unknown start of token: \u{37e} //~^^ HELP Unicode character ';' (Greek Question Mark) looks like ';' (Semicolon), but it is not -     let x = 0; +     let x = 0; //~^ ERROR unknown start of token: \u{a0} //~^^ NOTE character appears 3 more times //~^^^ HELP Unicode character ' ' (No-Break Space) looks like ' ' (Space), but it is not + //~^^^^ ERROR unknown start of token: \u{a0} + //~^^^^^ HELP Unicode character ' ' (No-Break Space) looks like ' ' (Space), but it is not + //~^^^^^^ ERROR unknown start of token: \u{a0} + //~^^^^^^^ HELP Unicode character ' ' (No-Break Space) looks like ' ' (Space), but it is not + //~^^^^^^^^ ERROR unknown start of token: \u{a0} + //~^^^^^^^^^ HELP Unicode character ' ' (No-Break Space) looks like ' ' (Space), but it is not } diff --git a/tests/ui/parser/unicode-chars.stderr b/tests/ui/parser/unicode-chars.stderr index 6a5b27872e738..93ac5ec14be5b 100644 --- a/tests/ui/parser/unicode-chars.stderr +++ b/tests/ui/parser/unicode-chars.stderr @@ -12,14 +12,47 @@ LL | let y = 0; error: unknown start of token: \u{a0} --> $DIR/unicode-chars.rs:5:5 | -LL |     let x = 0; +LL |     let x = 0; | ^^^^ | = note: character appears 3 more times help: Unicode character ' ' (No-Break Space) looks like ' ' (Space), but it is not | -LL | let x = 0; +LL | let x = 0; | ++++ -error: aborting due to 2 previous errors +error: unknown start of token: \u{a0} + --> $DIR/unicode-chars.rs:5:12 + | +LL |     let x = 0; + | ^ + | +help: Unicode character ' ' (No-Break Space) looks like ' ' (Space), but it is not + | +LL |     let x = 0; + | + + +error: unknown start of token: \u{a0} + --> $DIR/unicode-chars.rs:5:14 + | +LL |     let x = 0; + | ^ + | +help: Unicode character ' ' (No-Break Space) looks like ' ' (Space), but it is not + | +LL |     let x = 0; + | + + +error: unknown start of token: \u{a0} + --> $DIR/unicode-chars.rs:5:16 + | +LL |     let x = 0; + | ^ + | +help: Unicode character ' ' (No-Break Space) looks like ' ' (Space), but it is not + | +LL |     let x = 0; + | + + +error: aborting due to 5 previous errors From dab06ccdab57d160de2e9afea752929ad39ee534 Mon Sep 17 00:00:00 2001 From: David Tolnay Date: Sat, 14 Jan 2023 10:34:06 -0800 Subject: [PATCH 2/2] Emit only one nbsp error per file --- compiler/rustc_parse/src/lexer/mod.rs | 31 ++++++++++++++++++++---- tests/ui/parser/unicode-chars.rs | 6 ----- tests/ui/parser/unicode-chars.stderr | 35 +-------------------------- 3 files changed, 27 insertions(+), 45 deletions(-) diff --git a/compiler/rustc_parse/src/lexer/mod.rs b/compiler/rustc_parse/src/lexer/mod.rs index 8761c23625b21..9fe8d9836ba60 100644 --- a/compiler/rustc_parse/src/lexer/mod.rs +++ b/compiler/rustc_parse/src/lexer/mod.rs @@ -52,8 +52,15 @@ pub(crate) fn parse_token_trees<'a>( } let cursor = Cursor::new(src); - let string_reader = - StringReader { sess, start_pos, pos: start_pos, src, cursor, override_span }; + let string_reader = StringReader { + sess, + start_pos, + pos: start_pos, + src, + cursor, + override_span, + nbsp_is_whitespace: false, + }; tokentrees::TokenTreesReader::parse_all_token_trees(string_reader) } @@ -68,6 +75,10 @@ struct StringReader<'a> { /// Cursor for getting lexer tokens. cursor: Cursor<'a>, override_span: Option, + /// When a "unknown start of token: \u{a0}" has already been emitted earlier + /// in this file, it's safe to treat further occurrences of the non-breaking + /// space character as whitespace. + nbsp_is_whitespace: bool, } impl<'a> StringReader<'a> { @@ -239,6 +250,16 @@ impl<'a> StringReader<'a> { } let mut it = self.str_from_to_end(start).chars(); let c = it.next().unwrap(); + if c == '\u{00a0}' { + // If an error has already been reported on non-breaking + // space characters earlier in the file, treat all + // subsequent occurrences as whitespace. + if self.nbsp_is_whitespace { + preceded_by_whitespace = true; + continue; + } + self.nbsp_is_whitespace = true; + } let repeats = it.take_while(|c1| *c1 == c).count(); let mut err = self.struct_err_span_char(start, self.pos + Pos::from_usize(repeats * c.len_utf8()), "unknown start of token", c); @@ -486,7 +507,7 @@ impl<'a> StringReader<'a> { /// Slice of the source text from `start` up to but excluding `self.pos`, /// meaning the slice does not include the character `self.ch`. - fn str_from(&self, start: BytePos) -> &str { + fn str_from(&self, start: BytePos) -> &'a str { self.str_from_to(start, self.pos) } @@ -497,12 +518,12 @@ impl<'a> StringReader<'a> { } /// Slice of the source text spanning from `start` up to but excluding `end`. - fn str_from_to(&self, start: BytePos, end: BytePos) -> &str { + fn str_from_to(&self, start: BytePos, end: BytePos) -> &'a str { &self.src[self.src_index(start)..self.src_index(end)] } /// Slice of the source text spanning from `start` until the end - fn str_from_to_end(&self, start: BytePos) -> &str { + fn str_from_to_end(&self, start: BytePos) -> &'a str { &self.src[self.src_index(start)..] } diff --git a/tests/ui/parser/unicode-chars.rs b/tests/ui/parser/unicode-chars.rs index b989205632155..f0122561f463d 100644 --- a/tests/ui/parser/unicode-chars.rs +++ b/tests/ui/parser/unicode-chars.rs @@ -6,10 +6,4 @@ fn main() { //~^ ERROR unknown start of token: \u{a0} //~^^ NOTE character appears 3 more times //~^^^ HELP Unicode character ' ' (No-Break Space) looks like ' ' (Space), but it is not - //~^^^^ ERROR unknown start of token: \u{a0} - //~^^^^^ HELP Unicode character ' ' (No-Break Space) looks like ' ' (Space), but it is not - //~^^^^^^ ERROR unknown start of token: \u{a0} - //~^^^^^^^ HELP Unicode character ' ' (No-Break Space) looks like ' ' (Space), but it is not - //~^^^^^^^^ ERROR unknown start of token: \u{a0} - //~^^^^^^^^^ HELP Unicode character ' ' (No-Break Space) looks like ' ' (Space), but it is not } diff --git a/tests/ui/parser/unicode-chars.stderr b/tests/ui/parser/unicode-chars.stderr index 93ac5ec14be5b..b1d4a0af71154 100644 --- a/tests/ui/parser/unicode-chars.stderr +++ b/tests/ui/parser/unicode-chars.stderr @@ -21,38 +21,5 @@ help: Unicode character ' ' (No-Break Space) looks like ' ' (Space), but it is LL | let x = 0; | ++++ -error: unknown start of token: \u{a0} - --> $DIR/unicode-chars.rs:5:12 - | -LL |     let x = 0; - | ^ - | -help: Unicode character ' ' (No-Break Space) looks like ' ' (Space), but it is not - | -LL |     let x = 0; - | + - -error: unknown start of token: \u{a0} - --> $DIR/unicode-chars.rs:5:14 - | -LL |     let x = 0; - | ^ - | -help: Unicode character ' ' (No-Break Space) looks like ' ' (Space), but it is not - | -LL |     let x = 0; - | + - -error: unknown start of token: \u{a0} - --> $DIR/unicode-chars.rs:5:16 - | -LL |     let x = 0; - | ^ - | -help: Unicode character ' ' (No-Break Space) looks like ' ' (Space), but it is not - | -LL |     let x = 0; - | + - -error: aborting due to 5 previous errors +error: aborting due to 2 previous errors