Rollup merge of #125316 - nnethercote:tweak-Spacing, r=petrochenkov

Tweak `Spacing` use Some clean-up precursors to #125174. r? ``@petrochenkov``
rust-lang · May 23, 2024 · 3c79f0c · 3c79f0c
2 parents f131ee6 + 4d513cb
commit 3c79f0c
Show file tree

Hide file tree

Showing 7 changed files with 75 additions and 31 deletions.
diff --git a/compiler/rustc_ast/src/tokenstream.rs b/compiler/rustc_ast/src/tokenstream.rs
@@ -661,11 +661,11 @@ impl TokenStream {
             if attr_style == AttrStyle::Inner {
                 vec![
                     TokenTree::token_joint(token::Pound, span),
-                    TokenTree::token_alone(token::Not, span),
+                    TokenTree::token_joint_hidden(token::Not, span),
                     body,
                 ]
             } else {
-                vec![TokenTree::token_alone(token::Pound, span), body]
+                vec![TokenTree::token_joint_hidden(token::Pound, span), body]
             }
         }
     }

diff --git a/compiler/rustc_ast_pretty/src/pprust/state.rs b/compiler/rustc_ast_pretty/src/pprust/state.rs
@@ -681,22 +681,40 @@ pub trait PrintState<'a>: std::ops::Deref<Target = pp::Printer> + std::ops::Dere
         }
     }
 
+    // The easiest way to implement token stream pretty printing would be to
+    // print each token followed by a single space. But that would produce ugly
+    // output, so we go to some effort to do better.
+    //
+    // First, we track whether each token that appears in source code is
+    // followed by a space, with `Spacing`, and reproduce that in the output.
+    // This works well in a lot of cases. E.g. `stringify!(x + y)` produces
+    // "x + y" and `stringify!(x+y)` produces "x+y".
+    //
+    // But this doesn't work for code produced by proc macros (which have no
+    // original source text representation) nor for code produced by decl
+    // macros (which are tricky because the whitespace after tokens appearing
+    // in macro rules isn't always what you want in the produced output). For
+    // these we mostly use `Spacing::Alone`, which is the conservative choice.
+    //
+    // So we have a backup mechanism for when `Spacing::Alone` occurs between a
+    // pair of tokens: we check if that pair of tokens can obviously go
+    // together without a space between them. E.g. token `x` followed by token
+    // `,` is better printed as `x,` than `x ,`. (Even if the original source
+    // code was `x ,`.)
+    //
+    // Finally, we must be careful about changing the output. Token pretty
+    // printing is used by `stringify!` and `impl Display for
+    // proc_macro::TokenStream`, and some programs rely on the output having a
+    // particular form, even though they shouldn't. In particular, some proc
+    // macros do `format!({stream})` on a token stream and then "parse" the
+    // output with simple string matching that can't handle whitespace changes.
+    // E.g. we have seen cases where a proc macro can handle `a :: b` but not
+    // `a::b`. See #117433 for some examples.
     fn print_tts(&mut self, tts: &TokenStream, convert_dollar_crate: bool) {
         let mut iter = tts.trees().peekable();
         while let Some(tt) = iter.next() {
             let spacing = self.print_tt(tt, convert_dollar_crate);
             if let Some(next) = iter.peek() {
-                // Should we print a space after `tt`? There are two guiding
-                // factors.
-                // - `spacing` is the more important and accurate one. Most
-                //   tokens have good spacing information, and
-                //   `Joint`/`JointHidden` get used a lot.
-                // - `space_between` is the backup. Code produced by proc
-                //   macros has worse spacing information, with no
-                //   `JointHidden` usage and too much `Alone` usage, which
-                //   would result in over-spaced output such as
-                //   `( x () , y . z )`. `space_between` avoids some of the
-                //   excess whitespace.
                 if spacing == Spacing::Alone && space_between(tt, next) {
                     self.space();
                 }

diff --git a/compiler/rustc_builtin_macros/src/assert/context.rs b/compiler/rustc_builtin_macros/src/assert/context.rs
@@ -153,7 +153,7 @@ impl<'cx, 'a> Context<'cx, 'a> {
     fn build_panic(&self, expr_str: &str, panic_path: Path) -> P<Expr> {
         let escaped_expr_str = escape_to_fmt(expr_str);
         let initial = [
-            TokenTree::token_joint_hidden(
+            TokenTree::token_joint(
                 token::Literal(token::Lit {
                     kind: token::LitKind::Str,
                     symbol: Symbol::intern(&if self.fmt_string.is_empty() {
@@ -172,7 +172,7 @@ impl<'cx, 'a> Context<'cx, 'a> {
         ];
         let captures = self.capture_decls.iter().flat_map(|cap| {
             [
-                TokenTree::token_joint_hidden(
+                TokenTree::token_joint(
                     token::Ident(cap.ident.name, IdentIsRaw::No),
                     cap.ident.span,
                 ),

diff --git a/compiler/rustc_expand/src/mbe.rs b/compiler/rustc_expand/src/mbe.rs
@@ -68,12 +68,15 @@ pub(crate) enum KleeneOp {
 /// `MetaVarExpr` are "first-class" token trees. Useful for parsing macros.
 #[derive(Debug, PartialEq, Encodable, Decodable)]
 enum TokenTree {
+    /// A token. Unlike `tokenstream::TokenTree::Token` this lacks a `Spacing`.
+    /// See the comments about `Spacing` in the `transcribe` function.
     Token(Token),
     /// A delimited sequence, e.g. `($e:expr)` (RHS) or `{ $e }` (LHS).
     Delimited(DelimSpan, DelimSpacing, Delimited),
     /// A kleene-style repetition sequence, e.g. `$($e:expr)*` (RHS) or `$($e),*` (LHS).
     Sequence(DelimSpan, SequenceRepetition),
-    /// e.g., `$var`.
+    /// e.g., `$var`. The span covers the leading dollar and the ident. (The span within the ident
+    /// only covers the ident, e.g. `var`.)
     MetaVar(Span, Ident),
     /// e.g., `$var:expr`. Only appears on the LHS.
     MetaVarDecl(Span, Ident /* name to bind */, Option<NonterminalKind>),

diff --git a/compiler/rustc_expand/src/mbe/quoted.rs b/compiler/rustc_expand/src/mbe/quoted.rs
@@ -62,7 +62,10 @@ pub(super) fn parse(
         match tree {
             TokenTree::MetaVar(start_sp, ident) if parsing_patterns => {
                 let span = match trees.next() {
-                    Some(&tokenstream::TokenTree::Token(Token { kind: token::Colon, span }, _)) => {
+                    Some(&tokenstream::TokenTree::Token(
+                        Token { kind: token::Colon, span: colon_span },
+                        _,
+                    )) => {
                         match trees.next() {
                             Some(tokenstream::TokenTree::Token(token, _)) => match token.ident() {
                                 Some((fragment, _)) => {
@@ -126,10 +129,12 @@ pub(super) fn parse(
                                 }
                                 _ => token.span,
                             },
-                            tree => tree.map_or(span, tokenstream::TokenTree::span),
+                            Some(tree) => tree.span(),
+                            None => colon_span,
                         }
                     }
-                    tree => tree.map_or(start_sp, tokenstream::TokenTree::span),
+                    Some(tree) => tree.span(),
+                    None => start_sp,
                 };
 
                 result.push(TokenTree::MetaVarDecl(span, ident, None));
@@ -176,7 +181,7 @@ fn parse_tree<'a>(
     // Depending on what `tree` is, we could be parsing different parts of a macro
     match tree {
         // `tree` is a `$` token. Look at the next token in `trees`
-        &tokenstream::TokenTree::Token(Token { kind: token::Dollar, span }, _) => {
+        &tokenstream::TokenTree::Token(Token { kind: token::Dollar, span: dollar_span }, _) => {
             // FIXME: Handle `Invisible`-delimited groups in a more systematic way
             // during parsing.
             let mut next = outer_trees.next();
@@ -209,7 +214,7 @@ fn parse_tree<'a>(
                                         err.emit();
                                         // Returns early the same read `$` to avoid spanning
                                         // unrelated diagnostics that could be performed afterwards
-                                        return TokenTree::token(token::Dollar, span);
+                                        return TokenTree::token(token::Dollar, dollar_span);
                                     }
                                     Ok(elem) => {
                                         maybe_emit_macro_metavar_expr_feature(
@@ -251,7 +256,7 @@ fn parse_tree<'a>(
                 // special metavariable that names the crate of the invocation.
                 Some(tokenstream::TokenTree::Token(token, _)) if token.is_ident() => {
                     let (ident, is_raw) = token.ident().unwrap();
-                    let span = ident.span.with_lo(span.lo());
+                    let span = ident.span.with_lo(dollar_span.lo());
                     if ident.name == kw::Crate && matches!(is_raw, IdentIsRaw::No) {
                         TokenTree::token(token::Ident(kw::DollarCrate, is_raw), span)
                     } else {
@@ -260,16 +265,19 @@ fn parse_tree<'a>(
                 }
 
                 // `tree` is followed by another `$`. This is an escaped `$`.
-                Some(&tokenstream::TokenTree::Token(Token { kind: token::Dollar, span }, _)) => {
+                Some(&tokenstream::TokenTree::Token(
+                    Token { kind: token::Dollar, span: dollar_span2 },
+                    _,
+                )) => {
                     if parsing_patterns {
                         span_dollar_dollar_or_metavar_in_the_lhs_err(
                             sess,
-                            &Token { kind: token::Dollar, span },
+                            &Token { kind: token::Dollar, span: dollar_span2 },
                         );
                     } else {
-                        maybe_emit_macro_metavar_expr_feature(features, sess, span);
+                        maybe_emit_macro_metavar_expr_feature(features, sess, dollar_span2);
                     }
-                    TokenTree::token(token::Dollar, span)
+                    TokenTree::token(token::Dollar, dollar_span2)
                 }
 
                 // `tree` is followed by some other token. This is an error.
@@ -281,7 +289,7 @@ fn parse_tree<'a>(
                 }
 
                 // There are no more tokens. Just return the `$` we already have.
-                None => TokenTree::token(token::Dollar, span),
+                None => TokenTree::token(token::Dollar, dollar_span),
             }
         }
 

diff --git a/compiler/rustc_expand/src/mbe/transcribe.rs b/compiler/rustc_expand/src/mbe/transcribe.rs
@@ -253,8 +253,23 @@ pub(super) fn transcribe<'a>(
             mbe::TokenTree::MetaVar(mut sp, mut original_ident) => {
                 // Find the matched nonterminal from the macro invocation, and use it to replace
                 // the meta-var.
+                //
+                // We use `Spacing::Alone` everywhere here, because that's the conservative choice
+                // and spacing of declarative macros is tricky. E.g. in this macro:
+                // ```
+                // macro_rules! idents {
+                //     ($($a:ident,)*) => { stringify!($($a)*) }
+                // }
+                // ```
+                // `$a` has no whitespace after it and will be marked `JointHidden`. If you then
+                // call `idents!(x,y,z,)`, each of `x`, `y`, and `z` will be marked as `Joint`. So
+                // if you choose to use `$x`'s spacing or the identifier's spacing, you'll end up
+                // producing "xyz", which is bad because it effectively merges tokens.
+                // `Spacing::Alone` is the safer option. Fortunately, `space_between` will avoid
+                // some of the unnecessary whitespace.
                 let ident = MacroRulesNormalizedIdent::new(original_ident);
                 if let Some(cur_matched) = lookup_cur_matched(ident, interp, &repeats) {
+                    // njn: explain the use of alone here
                     let tt = match cur_matched {
                         MatchedSingle(ParseNtResult::Tt(tt)) => {
                             // `tt`s are emitted into the output stream directly as "raw tokens",

diff --git a/compiler/rustc_expand/src/proc_macro_server.rs b/compiler/rustc_expand/src/proc_macro_server.rs
@@ -309,10 +309,10 @@ impl ToInternal<SmallVec<[tokenstream::TokenTree; 2]>>
         use rustc_ast::token::*;
 
         // The code below is conservative, using `token_alone`/`Spacing::Alone`
-        // in most places. When the resulting code is pretty-printed by
-        // `print_tts` it ends up with spaces between most tokens, which is
-        // safe but ugly. It's hard in general to do better when working at the
-        // token level.
+        // in most places. It's hard in general to do better when working at
+        // the token level. When the resulting code is pretty-printed by
+        // `print_tts` the `space_between` function helps avoid a lot of
+        // unnecessary whitespace, so the results aren't too bad.
         let (tree, rustc) = self;
         match tree {
             TokenTree::Punct(Punct { ch, joint, span }) => {