diff --git a/crates/oxc_estree_tokens/src/lib.rs b/crates/oxc_estree_tokens/src/lib.rs
index 70c3bf3bc5e4d..dce186d29e0fe 100644
--- a/crates/oxc_estree_tokens/src/lib.rs
+++ b/crates/oxc_estree_tokens/src/lib.rs
@@ -43,13 +43,65 @@ impl Config for TokenConfig {
 type CompactTokenSerializer = ESTreeSerializer;
 type PrettyTokenSerializer = ESTreeSerializer;

-pub struct EstreeToken<'a> {
+/// Token whose value is guaranteed JSON-safe.
+///
+/// Used for identifiers, keywords, punctuators, numbers, booleans, `null` — any token whose
+/// raw source text cannot contain quotes, backslashes, or control characters.
+///
+/// Both `token_type` and `value` are written to the output verbatim during serialization,
+/// skipping the byte-by-byte escape check that `EstreeUnsafeToken` performs.
+struct EstreeSafeToken<'a> {
+    token_type: TokenType,
+    value: &'a str,
+    span: Span,
+}
+
+impl ESTree for EstreeSafeToken<'_> {
+    fn serialize<S: Serializer>(&self, mut serializer: S) {
+        // Assemble the JSON string manually using `print_strs_array`.
+        // This reduces bounds checks to only 1 per token.
+        static TYPE_PREFIX: &str = "{\"type\":\"";
+        static VALUE_PREFIX: &str = "\",\"value\":\"";
+        static START_PREFIX: &str = "\",\"start\":";
+        static END_PREFIX: &str = ",\"end\":";
+        static POSTFIX: &str = "}";
+
+        static TYPE_PREFIX_PRETTY: &str = "{\n \"type\": \"";
+        static VALUE_PREFIX_PRETTY: &str = "\",\n \"value\": \"";
+        static START_PREFIX_PRETTY: &str = "\",\n \"start\": ";
+        static END_PREFIX_PRETTY: &str = ",\n \"end\": ";
+        static POSTFIX_PRETTY: &str = "\n }";
+
+        let mut start_buffer = U32String::new();
+        let start = start_buffer.format(self.span.start);
+
+        let mut end_buffer = U32String::new();
+        let end = end_buffer.format(self.span.end);
+
+        serializer.buffer_mut().print_strs_array([
+            if S::IS_COMPACT { TYPE_PREFIX } else { TYPE_PREFIX_PRETTY },
+            self.token_type.as_str(),
+            if S::IS_COMPACT { VALUE_PREFIX } else { VALUE_PREFIX_PRETTY },
+            self.value,
+            if S::IS_COMPACT { START_PREFIX } else { START_PREFIX_PRETTY },
+            start,
+            if S::IS_COMPACT { END_PREFIX } else { END_PREFIX_PRETTY },
+            end,
+            if S::IS_COMPACT { POSTFIX } else { POSTFIX_PRETTY },
+        ]);
+    }
+}
+
+/// Token whose value may not be JSON-safe.
+///
+/// Used for strings, templates, regexes, JSXText.
+pub struct EstreeUnsafeToken<'a> {
     pub token_type: TokenType,
     pub value: &'a str,
     pub span: Span,
 }

-impl ESTree for EstreeToken<'_> {
+impl ESTree for EstreeUnsafeToken<'_> {
     fn serialize<S: Serializer>(&self, serializer: S) {
         let mut state = serializer.serialize_struct();
         state.serialize_field("type", &JsonSafeString(self.token_type.as_str()));
@@ -59,7 +111,7 @@ impl ESTree for EstreeToken<'_> {
     }
 }

-/// Token type for RegExps.
+/// `RegularExpression` token.
 ///
 /// This is a separate type from `EstreeToken` because RegExp tokens have a nested `regex` object
 /// containing `flags` and `pattern`, and the token type is always `"RegularExpression"`.
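The manual assembly in `EstreeSafeToken::serialize` is the core of this change: every fragment of a safe token's JSON is either a static string or an already-validated slice, so the whole token can be written as one fragment array. A minimal sketch of the approach, assuming a plain `String` in place of the real serializer buffer (the actual `print_strs_array` lives in `oxc_estree` and differs in detail):

```rust
// Sketch only: `String` stands in for the serializer buffer, and this
// `print_strs_array` is a simplified stand-in for the real method.
fn print_strs_array<const N: usize>(buffer: &mut String, strs: [&str; N]) {
    // Reserve once up front instead of growing fragment-by-fragment;
    // the real implementation also elides per-fragment bounds checks.
    let total_len: usize = strs.iter().map(|s| s.len()).sum();
    buffer.reserve(total_len);
    for s in strs {
        buffer.push_str(s);
    }
}

fn main() {
    // An `Identifier` token `foo` spanning offsets 0..3, compact mode:
    let mut buffer = String::new();
    print_strs_array(
        &mut buffer,
        [
            "{\"type\":\"", "Identifier", "\",\"value\":\"", "foo",
            "\",\"start\":", "0", ",\"end\":", "3", "}",
        ],
    );
    assert_eq!(buffer, r#"{"type":"Identifier","value":"foo","start":0,"end":3}"#);
}
```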
@@ -99,57 +151,6 @@ impl ESTree for RegExpData<'_> {
     }
 }

-/// Token type for identifiers (and keywords which are used as identifiers).
-///
-/// Despite the name, this type is also used for tokens with `"Keyword"` or `"PrivateIdentifier"`
-/// type — it covers any token whose value is guaranteed JSON-safe.
-///
-/// This is a separate type from `EstreeToken` for two reasons:
-/// 1. It has no `regex` field (identifiers never have regex data).
-/// 2. Value is wrapped in `JsonSafeString` during serialization, skipping escape-checking.
-/// Identifier names are guaranteed JSON-safe (no quotes, backslashes, or control characters).
-struct EstreeIdentToken<'a> {
-    token_type: TokenType,
-    value: &'a str,
-    span: Span,
-}
-
-impl ESTree for EstreeIdentToken<'_> {
-    fn serialize<S: Serializer>(&self, mut serializer: S) {
-        // Assemble the JSON string manually using `print_strs_array`.
-        // This reduces bounds checks to only 1 per token.
-        static TYPE_PREFIX: &str = "{\"type\":\"";
-        static VALUE_PREFIX: &str = "\",\"value\":\"";
-        static START_PREFIX: &str = "\",\"start\":";
-        static END_PREFIX: &str = ",\"end\":";
-        static POSTFIX: &str = "}";
-
-        static TYPE_PREFIX_PRETTY: &str = "{\n \"type\": \"";
-        static VALUE_PREFIX_PRETTY: &str = "\",\n \"value\": \"";
-        static START_PREFIX_PRETTY: &str = "\",\n \"start\": ";
-        static END_PREFIX_PRETTY: &str = ",\n \"end\": ";
-        static POSTFIX_PRETTY: &str = "\n }";
-
-        let mut start_buffer = U32String::new();
-        let start = start_buffer.format(self.span.start);
-
-        let mut end_buffer = U32String::new();
-        let end = end_buffer.format(self.span.end);
-
-        serializer.buffer_mut().print_strs_array([
-            if S::IS_COMPACT { TYPE_PREFIX } else { TYPE_PREFIX_PRETTY },
-            self.token_type.as_str(),
-            if S::IS_COMPACT { VALUE_PREFIX } else { VALUE_PREFIX_PRETTY },
-            self.value,
-            if S::IS_COMPACT { START_PREFIX } else { START_PREFIX_PRETTY },
-            start,
-            if S::IS_COMPACT { END_PREFIX } else { END_PREFIX_PRETTY },
-            end,
-            if S::IS_COMPACT { POSTFIX } else { POSTFIX_PRETTY },
-        ]);
-    }
-}
-
 /// Token type.
 /// Wrapped in a module to ensure `TokenType` can only be constructed via `TokenType::new`.
 mod token_type {
@@ -356,22 +357,15 @@ struct EstreeTokenContext<'b, S: SequenceSerializer> {
 }

 impl<'b, S: SequenceSerializer> EstreeTokenContext<'b, S> {
-    /// Serialize all tokens before `start` with default types,
-    /// then serialize the token at `start` with the given `token_type`.
-    fn emit_token_at(&mut self, start: u32, token_type: TokenType) {
-        let token = self.advance_to(start);
-        self.emit_token(token, token_type);
-    }
-
-    /// Emit the token at `start` as `"Identifier"`, unless it's a legacy keyword
-    /// and `exclude_legacy_keyword_identifiers` is set (in which case it gets its default type).
+    /// Emit the token at `start` as `"Identifier"`, unless it's a legacy keyword and
+    /// `exclude_legacy_keyword_identifiers` is set (in which case it gets `"Keyword"` type).
     ///
     /// `name` is the decoded identifier name from the AST node.
     /// When the token has no escapes, `name` points into the source text, same as slicing it.
     /// When the token has escapes and `decode_identifier_escapes` is enabled, `name` provides
     /// the decoded value. Only when escapes are present but decoding is disabled do we need to
     /// fall back to slicing the raw source text (preserving the escape sequences in the output).
-    fn emit_identifier(&mut self, start: u32, name: &str) {
+    fn emit_identifier_at(&mut self, start: u32, name: &str) {
         let token = self.advance_to(start);
         let token_type = if self.options.exclude_legacy_keyword_identifiers
             && matches!(token.kind(), Kind::Yield | Kind::Let | Kind::Static)
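The doc comment above distinguishes three cases for `name`. A hypothetical helper (not part of this diff) that spells them out for an identifier written `f\u006Fo` in the source, which decodes to `foo`:

```rust
/// Hypothetical helper (not in the diff), returning `(value, is_json_safe)`.
fn identifier_value<'a>(
    raw: &'a str,
    decoded: &'a str,
    has_escape: bool,
    decode_escapes: bool,
) -> (&'a str, bool) {
    if !has_escape {
        // No escapes: `decoded` points straight into the source text.
        (decoded, true)
    } else if decode_escapes {
        // Decoding enabled: the decoded value contains no `\`, still JSON-safe.
        (decoded, true)
    } else {
        // Escapes present, decoding disabled: the raw text keeps `\u006F`,
        // and that `\` forces the cold `EstreeUnsafeToken` path.
        (raw, false)
    }
}

fn main() {
    assert_eq!(identifier_value("foo", "foo", false, true), ("foo", true));
    assert_eq!(identifier_value(r"f\u006Fo", "foo", true, true), ("foo", true));
    assert_eq!(identifier_value(r"f\u006Fo", "foo", true, false), (r"f\u006Fo", false));
}
```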
@@ -385,7 +379,7 @@ impl<'b, S: SequenceSerializer> EstreeTokenContext<'b, S> {
         // Only fall back to raw source text when the token contains escapes and decoding is disabled,
         // since raw escape sequences contain `\` which needs JSON escaping.
         if self.options.decode_identifier_escapes || !token.escaped() {
-            self.serialize_ident_token(token, token_type, name);
+            self.serialize_safe_token(token, token_type, name);
         } else {
             #[cold]
             #[inline(never)]
@@ -394,7 +388,7 @@ impl<'b, S: SequenceSerializer> EstreeTokenContext<'b, S> {
                 token: &Token,
                 token_type: TokenType,
             ) {
-                ctx.emit_token(token, token_type);
+                ctx.emit_unsafe_token(token, token_type);
             }
             emit(self, token, token_type);
         }
@@ -402,24 +396,39 @@ impl<'b, S: SequenceSerializer> EstreeTokenContext<'b, S> {

     /// Emit the `this` keyword at `start` as `"Identifier"`.
     /// Used for `this` in TS type queries and TS `this` parameters.
-    fn emit_this_identifier(&mut self, start: u32) {
-        let token = self.advance_to(start);
-        self.serialize_ident_token(token, TokenType::new("Identifier"), "this");
+    fn emit_this_identifier_at(&mut self, start: u32) {
+        self.emit_safe_token_at(start, TokenType::new("Identifier"), "this");
     }

     /// Emit the token at `start` as `"JSXIdentifier"`.
     /// JSX identifier names are guaranteed JSON-safe (no unicode escapes, no special characters).
-    fn emit_jsx_identifier(&mut self, start: u32, name: &str) {
+    fn emit_jsx_identifier_at(&mut self, start: u32, name: &str) {
+        self.emit_safe_token_at(start, TokenType::new("JSXIdentifier"), name);
+    }
+
+    /// Emit the token at `start` as the specified token type,
+    /// where the token's `value` is guaranteed JSON-safe.
+    fn emit_safe_token_at(&mut self, start: u32, token_type: TokenType, value: &str) {
+        let token = self.advance_to(start);
+        self.serialize_safe_token(token, token_type, value);
+    }
+
+    /// Emit the token at `start` as the specified token type,
+    /// where the token's `value` may not be JSON-safe.
+    fn emit_unsafe_token_at(&mut self, start: u32, token_type: TokenType) {
         let token = self.advance_to(start);
-        self.serialize_ident_token(token, TokenType::new("JSXIdentifier"), name);
+        self.emit_unsafe_token(token, token_type);
     }

     /// Consume all tokens before `start` (emitting them with default types),
     /// and return the token at `start`.
+    ///
+    /// Tokens emitted here are guaranteed JSON-safe because all non-JSON-safe token types
+    /// (strings, templates, regexes, JSXText) are dealt with by their own visitors.
     fn advance_to(&mut self, start: u32) -> &'b Token {
         while let Some(token) = self.tokens.next() {
             if token.start() < start {
-                self.emit_token(token, get_token_type(token.kind()));
+                self.emit_safe_token(token);
             } else {
                 debug_assert_eq!(
                     token.start(),
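`advance_to` is what lets AST visitors re-type tokens in a single pass: visitors fire in source order, and each override drains the token stream up to its own position, so every token is emitted exactly once. A stripped-down sketch of that merge, with `(start, default_type)` pairs standing in for real tokens (all names here are hypothetical):

```rust
/// Sketch of the visitor/token-stream merge; not the real types.
struct Sketch<'a> {
    tokens: std::slice::Iter<'a, (u32, &'a str)>,
    out: Vec<(u32, String)>,
}

impl<'a> Sketch<'a> {
    /// Emit defaults for every token before `start`; return the token at `start`.
    fn advance_to(&mut self, start: u32) -> (u32, &'a str) {
        while let Some(&(s, ty)) = self.tokens.next() {
            if s < start {
                self.out.push((s, ty.to_string())); // default type
            } else {
                assert_eq!(s, start, "visitor positions must land on token starts");
                return (s, ty);
            }
        }
        unreachable!("expected token at {start}");
    }

    /// Visitor override: re-type the token at `start`.
    fn emit_as(&mut self, start: u32, ty: &str) {
        let (s, _default) = self.advance_to(start);
        self.out.push((s, ty.to_string()));
    }
}

fn main() {
    // e.g. `let yield = 1;` in sloppy mode: `yield` at offset 4 is an identifier.
    let tokens = [(0, "Keyword"), (4, "Keyword"), (10, "Punctuator")];
    let mut ctx = Sketch { tokens: tokens.iter(), out: Vec::new() };
    ctx.emit_as(4, "Identifier"); // visitor fires for the binding identifier
    while let Some(&(s, ty)) = ctx.tokens.next() {
        ctx.out.push((s, ty.to_string())); // like `finish()`: drain the rest
    }
    assert_eq!(
        ctx.out,
        [(0, "Keyword".into()), (4, "Identifier".into()), (10, "Punctuator".into())]
    );
}
```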
@@ -433,38 +442,55 @@ impl<'b, S: SequenceSerializer> EstreeTokenContext<'b, S> {
         unreachable!("Expected token at position {start}");
     }

-    /// Serialize a single token using its raw source text as the value.
-    fn emit_token(&mut self, token: &Token, token_type: TokenType) {
+    /// Serialize a single token using its raw source text, skipping JSON escape checking.
+    ///
+    /// Used for tokens whose values are guaranteed JSON-safe
+    /// (punctuators, keywords, numbers, booleans, `null`).
+    ///
+    /// The token's type is determined by the token's `Kind`.
+    fn emit_safe_token(&mut self, token: &Token) {
+        let token_type = get_token_type(token.kind());
         let value = &self.source_text[token.start() as usize..token.end() as usize];
-        self.serialize_token(token, token_type, value);
+        self.serialize_safe_token(token, token_type, value);
     }

-    /// Convert span to UTF-16 and serialize token.
-    fn serialize_token(&mut self, token: &Token, token_type: TokenType, value: &str) {
+    /// Serialize a single token using its raw source text, with JSON escape checking.
+    /// Used for tokens whose values may contain backslashes, quotes, or control characters
+    /// (string literals, template literals, JSXText).
+    fn emit_unsafe_token(&mut self, token: &Token, token_type: TokenType) {
+        let value = &self.source_text[token.start() as usize..token.end() as usize];
+        self.serialize_unsafe_token(token, token_type, value);
+    }
+
+    /// Serialize a token whose value is guaranteed JSON-safe, skipping escape-checking.
+    fn serialize_safe_token(&mut self, token: &Token, token_type: TokenType, value: &str) {
         // Convert offsets to UTF-16
         let mut span = Span::new(token.start(), token.end());
         if let Some(converter) = self.span_converter.as_mut() {
             converter.convert_span(&mut span);
         }
-        self.seq.serialize_element(&EstreeToken { token_type, value, span });
+        self.seq.serialize_element(&EstreeSafeToken { token_type, value, span });
     }

-    /// Serialize a token whose value is guaranteed JSON-safe, skipping escape-checking.
-    fn serialize_ident_token(&mut self, token: &Token, token_type: TokenType, value: &str) {
+    /// Serialize a token whose value may not be JSON-safe.
+    fn serialize_unsafe_token(&mut self, token: &Token, token_type: TokenType, value: &str) {
         // Convert offsets to UTF-16
         let mut span = Span::new(token.start(), token.end());
         if let Some(converter) = self.span_converter.as_mut() {
             converter.convert_span(&mut span);
         }
-        self.seq.serialize_element(&EstreeIdentToken { token_type, value, span });
+        self.seq.serialize_element(&EstreeUnsafeToken { token_type, value, span });
     }

     /// Serialize all remaining tokens and close the sequence.
+    ///
+    /// Tokens emitted here are guaranteed JSON-safe because all non-JSON-safe token types
+    /// (strings, templates, regexes, JSXText) are consumed by their own visitors.
     fn finish(mut self) {
         while let Some(token) = self.tokens.next() {
-            self.emit_token(token, get_token_type(token.kind()));
+            self.emit_safe_token(token);
         }
         self.seq.end();
     }
@@ -478,7 +504,7 @@ impl<'a, S: SequenceSerializer> Visit<'a> for EstreeTokenContext<'_, S> {
     ) {
         match type_name {
             TSTypeName::ThisExpression(this_expression) => {
-                ctx.emit_this_identifier(this_expression.span.start);
+                ctx.emit_this_identifier_at(this_expression.span.start);
             }
             TSTypeName::QualifiedName(qualified_name) => {
                 collect_type_query_this(ctx, &qualified_name.left);
@@ -489,7 +515,7 @@ impl<'a, S: SequenceSerializer> Visit<'a> for EstreeTokenContext<'_, S> {
         match &type_query.expr_name {
             TSTypeQueryExprName::ThisExpression(this_expression) => {
-                self.emit_this_identifier(this_expression.span.start);
+                self.emit_this_identifier_at(this_expression.span.start);
             }
             TSTypeQueryExprName::QualifiedName(qualified_name) => {
                 collect_type_query_this(self, &qualified_name.left);
             }
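Both `serialize_safe_token` and `serialize_unsafe_token` convert the span before emitting because oxc spans are UTF-8 byte offsets, while the serializer can be asked for UTF-16 offsets. The real `span_converter` works incrementally; a naive recompute-from-zero version shows the conversion it performs:

```rust
/// Naive sketch of UTF-8 byte offset -> UTF-16 code unit offset conversion.
/// The real converter answers queries incrementally in one pass over the
/// source; this version is O(n) per call, for illustration only.
fn utf8_to_utf16_offset(source: &str, utf8_offset: u32) -> u32 {
    source[..utf8_offset as usize]
        .chars()
        .map(|c| c.len_utf16() as u32)
        .sum()
}

fn main() {
    let source = "let π = 1;";
    // `π` occupies 2 UTF-8 bytes but 1 UTF-16 code unit,
    // so every offset after it shrinks by 1.
    let eq_utf8 = source.find('=').unwrap() as u32; // 7 (bytes)
    assert_eq!(utf8_to_utf16_offset(source, eq_utf8), 6);
}
```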
@@ -502,10 +528,12 @@ impl<'a, S: SequenceSerializer> Visit<'a> for EstreeTokenContext<'_, S> {

     fn visit_ts_import_type(&mut self, import_type: &TSImportType<'a>) {
         // Manual walk.
-        // * `source` is a `StringLiteral` — can't contain identifiers, so we skip visiting it.
+        // * `source` is a `StringLiteral` — visit to ensure it's emitted with escape checking
+        //   (string values are not JSON-safe).
         // * `options` is an `ObjectExpression`. Manually walk each property, but don't visit the key if it's `with`,
         //   as it needs to remain "Keyword" token, not get converted to "Identifier".
         // * `qualifier` and `type_arguments` are visited as usual.
+        self.visit_string_literal(&import_type.source);
         if let Some(options) = &import_type.options {
             for property in &options.properties {
                 match property {
@@ -539,9 +567,9 @@ impl<'a, S: SequenceSerializer> Visit<'a> for EstreeTokenContext<'_, S> {
             && self.jsx_member_expression_depth > 0
             && self.jsx_computed_member_depth == 0
         {
-            self.emit_jsx_identifier(identifier.span.start, &identifier.name);
+            self.emit_jsx_identifier_at(identifier.span.start, &identifier.name);
         } else {
-            self.emit_identifier(identifier.span.start, &identifier.name);
+            self.emit_identifier_at(identifier.span.start, &identifier.name);
         }
     }

@@ -551,18 +579,18 @@ impl<'a, S: SequenceSerializer> Visit<'a> for EstreeTokenContext<'_, S> {
             && self.jsx_member_expression_depth > 0
             && self.jsx_computed_member_depth == 0
         {
-            self.emit_jsx_identifier(identifier.span.start, &identifier.name);
+            self.emit_jsx_identifier_at(identifier.span.start, &identifier.name);
         } else {
-            self.emit_identifier(identifier.span.start, &identifier.name);
+            self.emit_identifier_at(identifier.span.start, &identifier.name);
         }
     }

     fn visit_binding_identifier(&mut self, identifier: &BindingIdentifier<'a>) {
-        self.emit_identifier(identifier.span.start, &identifier.name);
+        self.emit_identifier_at(identifier.span.start, &identifier.name);
     }

     fn visit_label_identifier(&mut self, identifier: &LabelIdentifier<'a>) {
-        self.emit_identifier(identifier.span.start, &identifier.name);
+        self.emit_identifier_at(identifier.span.start, &identifier.name);
     }

     fn visit_private_identifier(&mut self, identifier: &PrivateIdentifier<'a>) {
@@ -572,18 +600,14 @@ impl<'a, S: SequenceSerializer> Visit<'a> for EstreeTokenContext<'_, S> {
         // Only fall back to slicing raw source text when the token contains escapes and decoding
         // is disabled, since raw escape sequences contain `\` which needs JSON escaping.
         if self.options.decode_identifier_escapes || !token.escaped() {
-            self.serialize_ident_token(
-                token,
-                TokenType::new("PrivateIdentifier"),
-                &identifier.name,
-            );
+            self.serialize_safe_token(token, TokenType::new("PrivateIdentifier"), &identifier.name);
         } else {
             #[cold]
             #[inline(never)]
             fn emit<S: SequenceSerializer>(ctx: &mut EstreeTokenContext<'_, S>, token: &Token) {
                 // Strip leading `#`
                 let value = &ctx.source_text[token.start() as usize + 1..token.end() as usize];
-                ctx.serialize_token(token, TokenType::new("PrivateIdentifier"), value);
+                ctx.serialize_unsafe_token(token, TokenType::new("PrivateIdentifier"), value);
             }
             emit(self, token);
         }
@@ -609,7 +633,7 @@ impl<'a, S: SequenceSerializer> Visit<'a> for EstreeTokenContext<'_, S> {
     }

     fn visit_ts_this_parameter(&mut self, parameter: &TSThisParameter<'a>) {
-        self.emit_this_identifier(parameter.this_span.start);
+        self.emit_this_identifier_at(parameter.this_span.start);
         walk::walk_ts_this_parameter(self, parameter);
     }

@@ -654,12 +678,12 @@ impl<'a, S: SequenceSerializer> Visit<'a> for EstreeTokenContext<'_, S> {
     }

     fn visit_jsx_identifier(&mut self, identifier: &JSXIdentifier<'a>) {
-        self.emit_jsx_identifier(identifier.span.start, &identifier.name);
+        self.emit_jsx_identifier_at(identifier.span.start, &identifier.name);
     }

     fn visit_jsx_element_name(&mut self, name: &JSXElementName<'a>) {
         if let JSXElementName::IdentifierReference(identifier) = name {
-            self.emit_jsx_identifier(identifier.span.start, &identifier.name);
+            self.emit_jsx_identifier_at(identifier.span.start, &identifier.name);
         } else {
             walk::walk_jsx_element_name(self, name);
         }
@@ -667,7 +691,7 @@ impl<'a, S: SequenceSerializer> Visit<'a> for EstreeTokenContext<'_, S> {

     fn visit_jsx_member_expression_object(&mut self, object: &JSXMemberExpressionObject<'a>) {
         if let JSXMemberExpressionObject::IdentifierReference(identifier) = object {
-            self.emit_jsx_identifier(identifier.span.start, &identifier.name);
+            self.emit_jsx_identifier_at(identifier.span.start, &identifier.name);
         } else {
             walk::walk_jsx_member_expression_object(self, object);
         }
@@ -675,8 +699,8 @@ impl<'a, S: SequenceSerializer> Visit<'a> for EstreeTokenContext<'_, S> {

     fn visit_jsx_namespaced_name(&mut self, name: &JSXNamespacedName<'a>) {
         if self.options.jsx_namespace_jsx_identifiers {
-            self.emit_jsx_identifier(name.namespace.span.start, &name.namespace.name);
-            self.emit_jsx_identifier(name.name.span.start, &name.name.name);
+            self.emit_jsx_identifier_at(name.namespace.span.start, &name.namespace.name);
+            self.emit_jsx_identifier_at(name.name.span.start, &name.name.name);
         }
         // When `!jsx_namespace_jsx_identifiers`, these tokens retain their default type
     }
@@ -715,6 +739,14 @@ impl<'a, S: SequenceSerializer> Visit<'a> for EstreeTokenContext<'_, S> {
         self.jsx_expression_depth -= 1;
     }

+    fn visit_string_literal(&mut self, literal: &StringLiteral<'a>) {
+        self.emit_unsafe_token_at(literal.span.start, TokenType::new("String"));
+    }
+
+    fn visit_jsx_text(&mut self, text: &JSXText<'a>) {
+        self.emit_unsafe_token_at(text.span.start, TokenType::new("JSXText"));
+    }
+
     fn visit_jsx_attribute(&mut self, attribute: &JSXAttribute<'a>) {
         // Manual walk.
         // * `name`: Visit normally.
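The `visit_string_literal` and `visit_jsx_text` overrides exist because those tokens' values are raw source text, quotes, backslashes, and raw newlines included. An illustrative predicate for the property the safe/unsafe split hinges on (the real byte-by-byte check lives in `oxc_estree`'s serializer):

```rust
/// Illustrative only: the condition that makes a token value "unsafe",
/// i.e. in need of JSON escaping. Safe tokens never satisfy it.
fn needs_json_escaping(value: &str) -> bool {
    value.bytes().any(|b| b == b'"' || b == b'\\' || b < 0x20)
}

fn main() {
    // Identifier/number raw text: never needs escaping.
    assert!(!needs_json_escaping("foo"));
    assert!(!needs_json_escaping("123"));
    // String literal raw text includes its quotes and escape sequences:
    assert!(needs_json_escaping(r#""a\n b""#));
    // JSXText can contain raw newlines (control characters):
    assert!(needs_json_escaping("line1\nline2"));
}
```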
@@ -722,25 +754,80 @@ impl<'a, S: SequenceSerializer> Visit<'a> for EstreeTokenContext<'_, S> {
         self.visit_jsx_attribute_name(&attribute.name);
         match &attribute.value {
             Some(JSXAttributeValue::StringLiteral(string_literal)) => {
-                self.emit_token_at(string_literal.span.start, TokenType::new("JSXText"));
+                self.emit_unsafe_token_at(string_literal.span.start, TokenType::new("JSXText"));
             }
             Some(value) => self.visit_jsx_attribute_value(value),
             None => {}
         }
     }
+
+    fn visit_template_literal(&mut self, literal: &TemplateLiteral<'a>) {
+        // Manual walk.
+        // Interleave quasis and expressions in source order.
+        // The default walk visits all quasis first, then all expressions,
+        // which would break `advance_to`'s source-order token consumption.
+        self.emit_template_quasis_interleaved(
+            &literal.quasis,
+            Visit::visit_expression,
+            &literal.expressions,
+        );
+    }
+
+    fn visit_ts_template_literal_type(&mut self, literal: &TSTemplateLiteralType<'a>) {
+        // Same as `visit_template_literal` but with TS types instead of expressions
+        self.emit_template_quasis_interleaved(
+            &literal.quasis,
+            Visit::visit_ts_type,
+            &literal.types,
+        );
+    }
+}
+
+impl<S: SequenceSerializer> EstreeTokenContext<'_, S> {
+    /// Emit template quasis interleaved with their interpolated parts (expressions or TS types).
+    ///
+    /// `TemplateElement.span` excludes the delimiters (the parser stores `start + 1`),
+    /// so subtract 1 to get the token's start position.
+    fn emit_template_quasis_interleaved<I>(
+        &mut self,
+        quasis: &[TemplateElement<'_>],
+        mut visit_interpolation: impl FnMut(&mut Self, &I),
+        interpolations: &[I],
+    ) {
+        let mut quasis = quasis.iter();
+
+        // First quasi (TemplateHead or NoSubstitutionTemplate)
+        if let Some(quasi) = quasis.next() {
+            self.emit_unsafe_token_at(quasi.span.start - 1, TokenType::new("Template"));
+        }
+
+        // Remaining quasis interleaved with interpolations
+        for (interpolation, quasi) in interpolations.iter().zip(quasis) {
+            visit_interpolation(self, interpolation);
+            self.emit_unsafe_token_at(quasi.span.start - 1, TokenType::new("Template"));
+        }
+    }
 }

 fn get_token_type(kind: Kind) -> TokenType {
+    // Tokens with these `Kind`s are always consumed by specific visitors and should never reach here
+    debug_assert!(
+        !matches!(
+            kind,
+            Kind::Str
+                | Kind::RegExp
+                | Kind::JSXText
+                | Kind::PrivateIdentifier
+                | Kind::NoSubstitutionTemplate
+                | Kind::TemplateHead
+                | Kind::TemplateMiddle
+                | Kind::TemplateTail
+        ),
+        "Token kind {kind:?} should be consumed by its visitor and not reach `get_token_type`",
+    );
+
     match kind {
         Kind::Ident | Kind::Await => TokenType::new("Identifier"),
-        Kind::PrivateIdentifier => TokenType::new("PrivateIdentifier"),
-        Kind::JSXText => TokenType::new("JSXText"),
-        Kind::Str => TokenType::new("String"),
-        Kind::RegExp => TokenType::new("RegularExpression"),
-        Kind::NoSubstitutionTemplate
-        | Kind::TemplateHead
-        | Kind::TemplateMiddle
-        | Kind::TemplateTail => TokenType::new("Template"),
         Kind::True | Kind::False => TokenType::new("Boolean"),
         Kind::Null => TokenType::new("Null"),
         _ if kind.is_number() => TokenType::new("Numeric"),
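To make the interleaving and the `span.start - 1` adjustment concrete, consider the template literal `` `a${x}b` `` with its opening backtick at offset 0. A worked example of the emission order (labels are illustrative; offsets follow the doc comment's convention):

```rust
fn main() {
    // Source: `a${x}b`  (backticks at offsets 0 and 7, `x` at offset 4).
    // Quasi spans exclude delimiters: "a" starts at 1 (past the opening
    // backtick), "b" starts at 6 (past the `}` at offset 5).
    let quasis = [(1u32, "a"), (6u32, "b")];
    let interpolations = ["x"];
    let mut emitted = Vec::new();

    let mut quasis = quasis.iter();
    // First quasi: the TemplateHead token `a${ starts at 1 - 1 = 0.
    if let Some(&(start, _)) = quasis.next() {
        emitted.push(format!("Template@{}", start - 1));
    }
    // Remaining quasis interleave with interpolations, in source order.
    for (interpolation, &(start, _)) in interpolations.iter().zip(quasis) {
        emitted.push(format!("Identifier({interpolation})"));
        // TemplateTail token }b` starts at 6 - 1 = 5 (the `}`).
        emitted.push(format!("Template@{}", start - 1));
    }
    assert_eq!(emitted, ["Template@0", "Identifier(x)", "Template@5"]);
}
```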