Skip to content

Commit f708670

Browse files
committed
Auto merge of #123951 - pitaj:reserve-guarded-strings, r=traviscross
Reserve guarded string literals (RFC 3593) Implementation for RFC 3593, including: - lexer / parser changes - diagnostics - migration lint - tests We reserve `#"`, `##"`, `###"`, `####`, and any other string of four or more repeated `#`. This avoids infinite lookahead in the lexer, though we still use infinite lookahead in the parser to provide better forward compatibility diagnostics. This PR does not implement any special lexing of the string internals: - strings preceded by one or more `#` are denied - regardless of the number of trailing `#` - string contents are lexed as if it was just a bare `"string"` Tracking issue: #123735 RFC: rust-lang/rfcs#3593
2 parents a1eceec + 321a5db commit f708670

23 files changed

+1514
-9
lines changed

compiler/rustc_lexer/src/lib.rs

+85-8
Original file line numberDiff line numberDiff line change
@@ -104,6 +104,12 @@ pub enum TokenKind {
104104
/// for emoji identifier recovery, as those are not meant to be ever accepted.
105105
InvalidPrefix,
106106

107+
/// Guarded string literal prefix: `#"` or `##`.
108+
///
109+
/// Used for reserving "guarded strings" (RFC 3593) in edition 2024.
110+
/// Split into the component tokens on older editions.
111+
GuardedStrPrefix,
112+
107113
/// Examples: `12u8`, `1.0e-40`, `b"123"`. Note that `_` is an invalid
108114
/// suffix, but may be present here on string and float literals. Users of
109115
/// this type will need to check for and reject that case.
@@ -191,30 +197,41 @@ pub enum DocStyle {
191197
/// `rustc_ast::ast::LitKind`).
192198
#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord)]
193199
pub enum LiteralKind {
194-
/// "12_u8", "0o100", "0b120i99", "1f32".
200+
/// `12_u8`, `0o100`, `0b120i99`, `1f32`.
195201
Int { base: Base, empty_int: bool },
196-
/// "12.34f32", "1e3", but not "1f32".
202+
/// `12.34f32`, `1e3`, but not `1f32`.
197203
Float { base: Base, empty_exponent: bool },
198-
/// "'a'", "'\\'", "'''", "';"
204+
/// `'a'`, `'\\'`, `'''`, `';`
199205
Char { terminated: bool },
200-
/// "b'a'", "b'\\'", "b'''", "b';"
206+
/// `b'a'`, `b'\\'`, `b'''`, `b';`
201207
Byte { terminated: bool },
202-
/// ""abc"", ""abc"
208+
/// `"abc"`, `"abc`
203209
Str { terminated: bool },
204-
/// "b"abc"", "b"abc"
210+
/// `b"abc"`, `b"abc`
205211
ByteStr { terminated: bool },
206212
/// `c"abc"`, `c"abc`
207213
CStr { terminated: bool },
208-
/// "r"abc"", "r#"abc"#", "r####"ab"###"c"####", "r#"a". `None` indicates
214+
/// `r"abc"`, `r#"abc"#`, `r####"ab"###"c"####`, `r#"a`. `None` indicates
209215
/// an invalid literal.
210216
RawStr { n_hashes: Option<u8> },
211-
/// "br"abc"", "br#"abc"#", "br####"ab"###"c"####", "br#"a". `None`
217+
/// `br"abc"`, `br#"abc"#`, `br####"ab"###"c"####`, `br#"a`. `None`
212218
/// indicates an invalid literal.
213219
RawByteStr { n_hashes: Option<u8> },
214220
/// `cr"abc"`, `cr#"abc"#`, `cr#"a`. `None` indicates an invalid literal.
215221
RawCStr { n_hashes: Option<u8> },
216222
}
217223

224+
/// `#"abc"#`, `##"a"` (fewer closing), or even `#"a` (unterminated).
225+
///
226+
/// Can capture fewer closing hashes than starting hashes,
227+
/// for more efficient lexing and better backwards diagnostics.
228+
#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord)]
229+
pub struct GuardedStr {
230+
pub n_hashes: u32,
231+
pub terminated: bool,
232+
pub token_len: u32,
233+
}
234+
218235
#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord)]
219236
pub enum RawStrError {
220237
/// Non `#` characters exist between `r` and `"`, e.g. `r##~"abcde"##`
@@ -403,6 +420,12 @@ impl Cursor<'_> {
403420
TokenKind::Literal { kind: literal_kind, suffix_start }
404421
}
405422

423+
// Guarded string literal prefix: `#"` or `##`
424+
'#' if matches!(self.first(), '"' | '#') => {
425+
self.bump();
426+
TokenKind::GuardedStrPrefix
427+
}
428+
406429
// One-symbol tokens.
407430
';' => Semi,
408431
',' => Comma,
@@ -780,6 +803,60 @@ impl Cursor<'_> {
780803
false
781804
}
782805

806+
/// Attempt to lex for a guarded string literal.
807+
///
808+
/// Used by `rustc_parse::lexer` to lex for guarded strings
809+
/// conditionally based on edition.
810+
///
811+
/// Note: this will not reset the `Cursor` when a
812+
/// guarded string is not found. It is the caller's
813+
/// responsibility to do so.
814+
pub fn guarded_double_quoted_string(&mut self) -> Option<GuardedStr> {
815+
debug_assert!(self.prev() != '#');
816+
817+
let mut n_start_hashes: u32 = 0;
818+
while self.first() == '#' {
819+
n_start_hashes += 1;
820+
self.bump();
821+
}
822+
823+
if self.first() != '"' {
824+
return None;
825+
}
826+
self.bump();
827+
debug_assert!(self.prev() == '"');
828+
829+
// Lex the string itself as a normal string literal
830+
// so we can recover that for older editions later.
831+
let terminated = self.double_quoted_string();
832+
if !terminated {
833+
let token_len = self.pos_within_token();
834+
self.reset_pos_within_token();
835+
836+
return Some(GuardedStr { n_hashes: n_start_hashes, terminated: false, token_len });
837+
}
838+
839+
// Consume closing '#' symbols.
840+
// Note that this will not consume extra trailing `#` characters:
841+
// `###"abcde"####` is lexed as a `GuardedStr { n_hashes: 3, .. }`
842+
// followed by a `#` token.
843+
let mut n_end_hashes = 0;
844+
while self.first() == '#' && n_end_hashes < n_start_hashes {
845+
n_end_hashes += 1;
846+
self.bump();
847+
}
848+
849+
// Reserved syntax, always an error, so it doesn't matter if
850+
// `n_start_hashes != n_end_hashes`.
851+
852+
self.eat_literal_suffix();
853+
854+
let token_len = self.pos_within_token();
855+
self.reset_pos_within_token();
856+
857+
Some(GuardedStr { n_hashes: n_start_hashes, terminated: true, token_len })
858+
}
859+
783860
/// Eats the double-quoted string and returns `n_hashes` and an error if encountered.
784861
fn raw_double_quoted_string(&mut self, prefix_len: u32) -> Result<u8, RawStrError> {
785862
// Wrap the actual function to handle the error with too many hashes.

compiler/rustc_lint/messages.ftl

+3
Original file line numberDiff line numberDiff line change
@@ -740,6 +740,9 @@ lint_reserved_prefix = prefix `{$prefix}` is unknown
740740
.label = unknown prefix
741741
.suggestion = insert whitespace here to avoid this being parsed as a prefix in Rust 2021
742742
743+
lint_reserved_string = will be parsed as a guarded string in Rust 2024
744+
.suggestion = insert whitespace here to avoid this being parsed as a guarded string in Rust 2024
745+
743746
lint_shadowed_into_iter =
744747
this method call resolves to `<&{$target} as IntoIterator>::into_iter` (due to backwards compatibility), but will resolve to `<{$target} as IntoIterator>::into_iter` in Rust {$edition}
745748
.use_iter_suggestion = use `.iter()` instead of `.into_iter()` to avoid ambiguity

compiler/rustc_lint/src/context/diagnostics.rs

+3
Original file line numberDiff line numberDiff line change
@@ -176,6 +176,9 @@ pub(super) fn decorate_lint(sess: &Session, diagnostic: BuiltinLintDiag, diag: &
176176
lints::RawPrefix { label: label_span, suggestion: label_span.shrink_to_hi() }
177177
.decorate_lint(diag);
178178
}
179+
BuiltinLintDiag::ReservedString(suggestion) => {
180+
lints::ReservedString { suggestion }.decorate_lint(diag);
181+
}
179182
BuiltinLintDiag::UnusedBuiltinAttribute { attr_name, macro_name, invoc_span } => {
180183
lints::UnusedBuiltinAttribute { invoc_span, attr_name, macro_name }.decorate_lint(diag);
181184
}

compiler/rustc_lint/src/lints.rs

+7
Original file line numberDiff line numberDiff line change
@@ -3053,3 +3053,10 @@ pub(crate) enum MutRefSugg {
30533053
#[derive(LintDiagnostic)]
30543054
#[diag(lint_unqualified_local_imports)]
30553055
pub(crate) struct UnqualifiedLocalImportsDiag {}
3056+
3057+
#[derive(LintDiagnostic)]
3058+
#[diag(lint_reserved_string)]
3059+
pub(crate) struct ReservedString {
3060+
#[suggestion(code = " ", applicability = "machine-applicable")]
3061+
pub suggestion: Span,
3062+
}

compiler/rustc_lint_defs/src/builtin.rs

+41
Original file line numberDiff line numberDiff line change
@@ -92,6 +92,7 @@ declare_lint_pass! {
9292
RUST_2021_INCOMPATIBLE_OR_PATTERNS,
9393
RUST_2021_PREFIXES_INCOMPATIBLE_SYNTAX,
9494
RUST_2021_PRELUDE_COLLISIONS,
95+
RUST_2024_GUARDED_STRING_INCOMPATIBLE_SYNTAX,
9596
RUST_2024_INCOMPATIBLE_PAT,
9697
RUST_2024_PRELUDE_COLLISIONS,
9798
SELF_CONSTRUCTOR_FROM_OUTER_ITEM,
@@ -4996,3 +4997,43 @@ declare_lint! {
49964997
Warn,
49974998
"detects pointer to integer transmutes in const functions and associated constants",
49984999
}
5000+
5001+
declare_lint! {
5002+
/// The `rust_2024_guarded_string_incompatible_syntax` lint detects `#` tokens
5003+
/// that will be parsed as part of a guarded string literal in Rust 2024.
5004+
///
5005+
/// ### Example
5006+
///
5007+
/// ```rust,edition2021,compile_fail
5008+
/// #![deny(rust_2024_guarded_string_incompatible_syntax)]
5009+
///
5010+
/// macro_rules! m {
5011+
/// (# $x:expr #) => ();
5012+
/// (# $x:expr) => ();
5013+
/// }
5014+
///
5015+
/// m!(#"hey"#);
5016+
/// m!(#"hello");
5017+
/// ```
5018+
///
5019+
/// {{produces}}
5020+
///
5021+
/// ### Explanation
5022+
///
5023+
/// Prior to Rust 2024, `#"hey"#` is three tokens: the first `#`
5024+
/// followed by the string literal `"hey"` then the final `#`.
5025+
/// In Rust 2024, the whole sequence is considered a single token.
5026+
///
5027+
/// This lint suggests adding whitespace between the leading `#`
5028+
/// and the string to keep them separated in Rust 2024.
5029+
// Allow this lint -- rustdoc doesn't yet support threading edition into this lint's parser.
5030+
#[allow(rustdoc::invalid_rust_codeblocks)]
5031+
pub RUST_2024_GUARDED_STRING_INCOMPATIBLE_SYNTAX,
5032+
Allow,
5033+
"will be parsed as a guarded string in Rust 2024",
5034+
@future_incompatible = FutureIncompatibleInfo {
5035+
reason: FutureIncompatibilityReason::EditionError(Edition::Edition2024),
5036+
reference: "issue #123735 <https://github.com/rust-lang/rust/issues/123735>",
5037+
};
5038+
crate_level_only
5039+
}

compiler/rustc_lint_defs/src/lib.rs

+2
Original file line numberDiff line numberDiff line change
@@ -614,6 +614,8 @@ pub enum BuiltinLintDiag {
614614
ReservedPrefix(Span, String),
615615
/// `'r#` in edition < 2021.
616616
RawPrefix(Span),
617+
/// `##` or `#"` in edition < 2024.
618+
ReservedString(Span),
617619
TrailingMacro(bool, Ident),
618620
BreakWithLabelAndLoop(Span),
619621
UnicodeTextFlow(Span, String),

compiler/rustc_parse/messages.ftl

+4
Original file line numberDiff line numberDiff line change
@@ -706,6 +706,10 @@ parse_require_colon_after_labeled_expression = labeled expression must be follow
706706
.label = the label
707707
.suggestion = add `:` after the label
708708
709+
parse_reserved_string = invalid string literal
710+
.note = unprefixed guarded string literals are reserved for future use since Rust 2024
711+
.suggestion_whitespace = consider inserting whitespace here
712+
709713
parse_return_types_use_thin_arrow = return types are denoted using `->`
710714
.suggestion = use `->` instead
711715

compiler/rustc_parse/src/errors.rs

+18
Original file line numberDiff line numberDiff line change
@@ -2110,6 +2110,24 @@ pub(crate) enum UnknownPrefixSugg {
21102110
},
21112111
}
21122112

2113+
#[derive(Diagnostic)]
2114+
#[diag(parse_reserved_string)]
2115+
#[note]
2116+
pub(crate) struct ReservedString {
2117+
#[primary_span]
2118+
pub span: Span,
2119+
#[subdiagnostic]
2120+
pub sugg: Option<GuardedStringSugg>,
2121+
}
2122+
#[derive(Subdiagnostic)]
2123+
#[suggestion(
2124+
parse_suggestion_whitespace,
2125+
code = " ",
2126+
applicability = "maybe-incorrect",
2127+
style = "verbose"
2128+
)]
2129+
pub(crate) struct GuardedStringSugg(#[primary_span] pub Span);
2130+
21132131
#[derive(Diagnostic)]
21142132
#[diag(parse_too_many_hashes)]
21152133
pub(crate) struct TooManyHashes {

compiler/rustc_parse/src/lexer/mod.rs

+83-1
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,8 @@ use rustc_lexer::unescape::{self, EscapeError, Mode};
1010
use rustc_lexer::{Base, Cursor, DocStyle, LiteralKind, RawStrError};
1111
use rustc_session::lint::BuiltinLintDiag;
1212
use rustc_session::lint::builtin::{
13-
RUST_2021_PREFIXES_INCOMPATIBLE_SYNTAX, TEXT_DIRECTION_CODEPOINT_IN_COMMENT,
13+
RUST_2021_PREFIXES_INCOMPATIBLE_SYNTAX, RUST_2024_GUARDED_STRING_INCOMPATIBLE_SYNTAX,
14+
TEXT_DIRECTION_CODEPOINT_IN_COMMENT,
1415
};
1516
use rustc_session::parse::ParseSess;
1617
use rustc_span::symbol::Symbol;
@@ -251,6 +252,7 @@ impl<'psess, 'src> StringReader<'psess, 'src> {
251252
let prefix_span = self.mk_sp(start, lit_start);
252253
return (Token::new(self.ident(start), prefix_span), preceded_by_whitespace);
253254
}
255+
rustc_lexer::TokenKind::GuardedStrPrefix => self.maybe_report_guarded_str(start, str_before),
254256
rustc_lexer::TokenKind::Literal { kind, suffix_start } => {
255257
let suffix_start = start + BytePos(suffix_start);
256258
let (kind, symbol) = self.cook_lexer_literal(start, suffix_start, kind);
@@ -781,6 +783,86 @@ impl<'psess, 'src> StringReader<'psess, 'src> {
781783
}
782784
}
783785

786+
/// Detect guarded string literal syntax
787+
///
788+
/// RFC 3593 reserved this syntax for future use. As of Rust 2024,
789+
/// using this syntax produces an error. In earlier editions, however, it
790+
/// only results in an (allowed by default) lint, and is treated as
791+
/// separate tokens.
792+
fn maybe_report_guarded_str(&mut self, start: BytePos, str_before: &'src str) -> TokenKind {
793+
let span = self.mk_sp(start, self.pos);
794+
let edition2024 = span.edition().at_least_rust_2024();
795+
796+
let space_pos = start + BytePos(1);
797+
let space_span = self.mk_sp(space_pos, space_pos);
798+
799+
let mut cursor = Cursor::new(str_before);
800+
801+
let (span, unterminated) = match cursor.guarded_double_quoted_string() {
802+
Some(rustc_lexer::GuardedStr { n_hashes, terminated, token_len }) => {
803+
let end = start + BytePos(token_len);
804+
let span = self.mk_sp(start, end);
805+
let str_start = start + BytePos(n_hashes);
806+
807+
if edition2024 {
808+
self.cursor = cursor;
809+
self.pos = end;
810+
}
811+
812+
let unterminated = if terminated { None } else { Some(str_start) };
813+
814+
(span, unterminated)
815+
}
816+
_ => {
817+
// We should only get here in the `##+` case.
818+
debug_assert_eq!(self.str_from_to(start, start + BytePos(2)), "##");
819+
820+
(span, None)
821+
}
822+
};
823+
if edition2024 {
824+
if let Some(str_start) = unterminated {
825+
// Only a fatal error if string is unterminated.
826+
self.dcx()
827+
.struct_span_fatal(
828+
self.mk_sp(str_start, self.pos),
829+
"unterminated double quote string",
830+
)
831+
.with_code(E0765)
832+
.emit()
833+
}
834+
835+
let sugg = if span.from_expansion() {
836+
None
837+
} else {
838+
Some(errors::GuardedStringSugg(space_span))
839+
};
840+
841+
// In Edition 2024 and later, emit a hard error.
842+
let err = self.dcx().emit_err(errors::ReservedString { span, sugg });
843+
844+
token::Literal(token::Lit {
845+
kind: token::Err(err),
846+
symbol: self.symbol_from_to(start, self.pos),
847+
suffix: None,
848+
})
849+
} else {
850+
// Before Rust 2024, only emit a lint for migration.
851+
self.psess.buffer_lint(
852+
RUST_2024_GUARDED_STRING_INCOMPATIBLE_SYNTAX,
853+
span,
854+
ast::CRATE_NODE_ID,
855+
BuiltinLintDiag::ReservedString(space_span),
856+
);
857+
858+
// For backwards compatibility, roll back to after just the first `#`
859+
// and return the `Pound` token.
860+
self.pos = start + BytePos(1);
861+
self.cursor = Cursor::new(&str_before[1..]);
862+
token::Pound
863+
}
864+
}
865+
784866
fn report_too_many_hashes(&self, start: BytePos, num: u32) -> ! {
785867
self.dcx().emit_fatal(errors::TooManyHashes { span: self.mk_sp(start, self.pos), num });
786868
}

src/librustdoc/html/highlight.rs

+1
Original file line numberDiff line numberDiff line change
@@ -845,6 +845,7 @@ impl<'src> Classifier<'src> {
845845
// Number literals.
846846
LiteralKind::Float { .. } | LiteralKind::Int { .. } => Class::Number,
847847
},
848+
TokenKind::GuardedStrPrefix => return no_highlight(sink),
848849
TokenKind::Ident | TokenKind::RawIdent if lookahead == Some(TokenKind::Bang) => {
849850
self.in_macro = true;
850851
sink(Highlight::EnterSpan { class: Class::Macro(self.new_span(before, text)) });

src/tools/rust-analyzer/crates/parser/src/lexed_str.rs

+6
Original file line numberDiff line numberDiff line change
@@ -187,6 +187,12 @@ impl<'a> Converter<'a> {
187187
}
188188

189189
rustc_lexer::TokenKind::RawIdent => IDENT,
190+
191+
rustc_lexer::TokenKind::GuardedStrPrefix => {
192+
err = "Invalid string literal (reserved syntax)";
193+
ERROR
194+
},
195+
190196
rustc_lexer::TokenKind::Literal { kind, .. } => {
191197
self.extend_literal(token_text.len(), kind);
192198
return;

0 commit comments

Comments
 (0)