From 25c2e25a9bfaeb24d374780c8ab009168b33a15d Mon Sep 17 00:00:00 2001 From: overlookmotel <557937+overlookmotel@users.noreply.github.com> Date: Sun, 1 Mar 2026 16:48:29 +0000 Subject: [PATCH] feat(estree/tokens): add function to update tokens in place (#19856) Add a function `update_tokens` which converts tokens to ESTree format, mutating the `Token`s in the `Vec` in place. This is what will be used for sending tokens to JS side via raw transfer. Support for serializing tokens to JSON is also retained. The 2 implementations share the same AST visitor, and hook into it via separate `Context` implementations. --- crates/oxc_estree_tokens/Cargo.toml | 2 +- crates/oxc_estree_tokens/src/lib.rs | 1 + crates/oxc_estree_tokens/src/serialize.rs | 869 +++++++++++++++------- crates/oxc_parser/src/lexer/kind.rs | 4 + tasks/benchmark/benches/parser.rs | 17 +- 5 files changed, 611 insertions(+), 282 deletions(-) diff --git a/crates/oxc_estree_tokens/Cargo.toml b/crates/oxc_estree_tokens/Cargo.toml index fa20c58b7d7d5..765cd3a0776af 100644 --- a/crates/oxc_estree_tokens/Cargo.toml +++ b/crates/oxc_estree_tokens/Cargo.toml @@ -24,6 +24,6 @@ oxc_ast = { workspace = true } oxc_ast_visit = { workspace = true, features = ["serialize"] } oxc_data_structures = { workspace = true, features = ["assert_unchecked"] } oxc_estree = { workspace = true, features = ["serialize"] } -oxc_parser = { workspace = true } +oxc_parser = { workspace = true, features = ["mutate_tokens"] } oxc_span = { workspace = true, features = ["serialize"] } itoa = { workspace = true } diff --git a/crates/oxc_estree_tokens/src/lib.rs b/crates/oxc_estree_tokens/src/lib.rs index a40f5d2d5e45f..fab531a6d8c62 100644 --- a/crates/oxc_estree_tokens/src/lib.rs +++ b/crates/oxc_estree_tokens/src/lib.rs @@ -13,6 +13,7 @@ pub use jsx_state::{JSXState, JSXStateJS, JSXStateTS}; pub use options::{ ESTreeTokenConfig, ESTreeTokenOptions, ESTreeTokenOptionsJS, ESTreeTokenOptionsTS, }; +pub use serialize::update_tokens; use 
serialize::{estimate_json_len, serialize_tokens}; /// Serializer config for tokens. diff --git a/crates/oxc_estree_tokens/src/serialize.rs b/crates/oxc_estree_tokens/src/serialize.rs index e18c8e8dfdd6d..1e0bd422df31e 100644 --- a/crates/oxc_estree_tokens/src/serialize.rs +++ b/crates/oxc_estree_tokens/src/serialize.rs @@ -1,4 +1,4 @@ -use std::slice::Iter; +use std::slice::{Iter, IterMut}; use oxc_ast::ast::*; use oxc_ast_visit::{ @@ -12,6 +12,10 @@ use oxc_span::{GetSpan, Span}; use crate::{ESTreeTokenConfig, JSXState, token_type::TokenType, u32_string::U32String}; +// ============================================================================================== +// Entry points +// ============================================================================================== + /// Estimate size of tokens serialized to JSON, in bytes. /// Aim is to allocate capacity which is a reasonable over-estimate for the size of all tokens serialized to JSON, /// in order to ensure the serializer's buffer never has to grow during serialization. @@ -56,7 +60,10 @@ pub fn estimate_json_len(tokens_len: usize, source_text_len: usize, is_compact: token_len * tokens_len + source_text_len + header_footer_len } +/// Serialize tokens to JSON using provided serializer. +/// /// Walk AST and serialize each token into the serializer as it's encountered. +/// Also convert token spans from UTF-8 byte offsets to UTF-16 offsets. /// /// Tokens are consumed from the `tokens` slice in source order. 
/// When a visitor method encounters an AST node that requires a token type override @@ -70,18 +77,429 @@ pub fn serialize_tokens( span_converter: &Utf8ToUtf16, options: O, ) { - let mut context = ESTreeTokenContext { - seq: serializer.serialize_sequence(), - tokens: tokens.iter(), - source_text, - span_converter: span_converter.converter(), - options, - jsx_state: O::JSXState::default(), + let mut visitor = Visitor { + ctx: JsonContext { + seq: serializer.serialize_sequence(), + tokens: tokens.iter(), + source_text, + span_converter: span_converter.converter(), + options, + jsx_state: O::JSXState::default(), + }, + }; + visitor.visit_program(program); + visitor.ctx.finish(); +} + +/// Walk AST and update token kinds to match ESTree token types. +/// Convert token spans from UTF-8 byte offsets to UTF-16 offsets. +/// +/// After this pass, `get_token_type(token.kind())` returns the correct ESTree token type +/// for every token, without needing AST context. +pub fn update_tokens( + tokens: &mut [Token], + program: &Program<'_>, + span_converter: &Utf8ToUtf16, + options: O, +) { + let mut visitor = Visitor { + ctx: UpdateContext { + tokens: tokens.iter_mut(), + span_converter: span_converter.converter(), + options, + jsx_state: O::JSXState::default(), + }, }; - context.visit_program(program); - context.finish(); + visitor.visit_program(program); + visitor.ctx.finish(); +} + +// ============================================================================================== +// `Context` trait +// ============================================================================================== + +/// Trait abstracting over the two token processing modes: +/// JSON serialization ([`JsonContext`]) and in-place kind update ([`UpdateContext`]). +/// +/// Each implementation holds its own `options` and `jsx_state`, so `is_ts` / `is_js` +/// resolve statically when the generic `O: ESTreeTokenConfig` is monomorphized. 
+trait Context: Sized { + /// JSX state type for tracking when to emit JSX identifiers. + type JSXState: JSXState; + + /// Returns `true` if serializing in TS style. + fn is_ts(&self) -> bool; + + /// Returns `true` if serializing in JS style. + #[expect(clippy::inline_always)] + #[inline(always)] + fn is_js(&self) -> bool { + !self.is_ts() + } + + /// Get reference to [`JSXState`] for the serializer/updater. + fn jsx_state(&self) -> &Self::JSXState; + + /// Get mutable reference to [`JSXState`] for the serializer/updater. + fn jsx_state_mut(&mut self) -> &mut Self::JSXState; + + /// Emit the token at `start` as an identifier. + /// + /// * JSON mode: Serialize with type `Identifier` or `Keyword`. + /// * Update mode: Set kind to `Kind::Ident`, unless in JS style and the token is `yield` / `let` / `static` + /// (which should remain as `Keyword`). + fn emit_identifier_at(&mut self, start: u32, name: &str); + + /// Emit the `this` keyword at `start` as `Identifier`. + /// + /// * JSON mode: Serialize as `Identifier` / `"this"`. + /// * Update mode: Set kind to `Kind::Ident`. + fn emit_this_identifier_at(&mut self, start: u32); + + /// Emit the token at `start` as `JSXIdentifier`. + /// + /// * JSON mode: Serialize as `JSXIdentifier`. + /// * Update mode: Set kind to `Kind::JSXIdentifier`. + fn emit_jsx_identifier_at(&mut self, start: u32, name: &str); + + /// Emit the token at `start` as `PrivateIdentifier`. + /// + /// * JSON mode: Serialize with appropriate encoding. + /// * Update mode: No-op (token already has `Kind::PrivateIdentifier`). + fn emit_private_identifier_at(&mut self, start: u32, name: &str); + + /// Emit a `StringLiteral` in a JSX attribute as `JSXText`. + /// + /// Unlike [`emit_unsafe_token_at`], this changes the token's kind in update mode, + /// because the token has `Kind::Str` but needs to become `Kind::JSXText`. + /// Use [`emit_unsafe_token_at`] for actual `JSXText` tokens which already have the correct kind. 
+ /// + /// * JSON mode: Serialize as `JSXText` with JSON encoding. + /// * Update mode: Set kind to `Kind::JSXText`. + /// + /// [`emit_unsafe_token_at`]: Context::emit_unsafe_token_at + fn emit_jsx_text_at(&mut self, start: u32); + + /// Emit a token whose value may not be JSON-safe (strings, templates, JSXText). + /// + /// * JSON mode: Serialize with JSON encoding. + /// * Update mode: No-op (token already has the correct kind). + fn emit_unsafe_token_at(&mut self, start: u32, token_type: TokenType); + + /// Emit a `RegularExpression` token. + /// + /// * JSON mode: Serialize using `ESTreeRegExpToken`. + /// * Update mode: No-op (token already has the correct kind). + fn emit_regexp(&mut self, regexp: &RegExpLiteral<'_>); + + /// Walk template quasis interleaved with their interpolated parts (expressions or TS types). + /// + /// * JSON mode: Emit quasi tokens interleaved with interpolation visits. + /// * Update mode: Only visit interpolations (quasis don't need `Kind` changes). + fn walk_template_quasis_interleaved( + visitor: &mut Visitor, + quasis: &[TemplateElement<'_>], + visit_interpolation: impl FnMut(&mut Visitor, &I), + interpolations: &[I], + ); } +// ============================================================================================== +// `JsonContext` — JSON serialization +// ============================================================================================== + +/// JSON serialization context. +/// +/// Serializes each token to JSON with its correct ESTree token type. +struct JsonContext<'b, O: ESTreeTokenConfig, S: SequenceSerializer> { + /// JSON sequence serializer. + /// Tokens are serialized into this serializer. + seq: S, + /// Tokens iterator (immutable - tokens are read, not modified) + tokens: Iter<'b, Token>, + /// Source text (for extracting token values) + source_text: &'b str, + /// Span converter for UTF-8 to UTF-16 conversion. + /// `None` if source is ASCII-only. 
+ span_converter: Option>, + /// Options controlling JS/TS style differences + options: O, + // JSX state. Used when outputting tokens in TS style. + jsx_state: O::JSXState, +} + +impl<'b, O: ESTreeTokenConfig, S: SequenceSerializer> JsonContext<'b, O, S> { + /// Consume all tokens before `start` (emitting them with default types), + /// and return the token at `start`. + /// + /// Tokens emitted here are guaranteed JSON-safe because all non-JSON-safe token types + /// (strings, templates, regexes, JSXText) are dealt with by their own visitors. + fn advance_to(&mut self, start: u32) -> &'b Token { + while let Some(token) = self.tokens.next() { + if token.start() < start { + self.emit_default_token(token); + } else { + debug_assert_eq!( + token.start(), + start, + "Expected token at position {start}, found token at position {}", + token.start(), + ); + return token; + } + } + unreachable!("Expected token at position {start}"); + } + + /// Serialize a token with its default type (determined by its `Kind`). + /// + /// Token values serialized here are guaranteed JSON-safe + /// (punctuators, keywords, numbers, booleans, `null`). 
+ fn emit_default_token(&mut self, token: &Token) { + let kind = token.kind(); + + // Tokens with these `Kind`s are always consumed by specific visitors and should never reach here + debug_assert!( + !matches!( + kind, + Kind::Str + | Kind::RegExp + | Kind::JSXText + | Kind::PrivateIdentifier + | Kind::NoSubstitutionTemplate + | Kind::TemplateHead + | Kind::TemplateMiddle + | Kind::TemplateTail + ), + "Token kind {kind:?} should be consumed by its visitor, and not reach `get_token_type`", + ); + + let token_type = match kind { + Kind::Ident | Kind::Await => TokenType::new("Identifier"), + Kind::True | Kind::False => TokenType::new("Boolean"), + Kind::Null => TokenType::new("Null"), + _ if kind.is_number() => TokenType::new("Numeric"), + _ if kind.is_contextual_keyword() => TokenType::new("Identifier"), + _ if kind.is_any_keyword() => TokenType::new("Keyword"), + _ => TokenType::new("Punctuator"), + }; + + let value = &self.source_text[token.start() as usize..token.end() as usize]; + + self.serialize_safe_token(token, token_type, value); + } + + fn emit_safe_token_at(&mut self, start: u32, token_type: TokenType, value: &str) { + let token = self.advance_to(start); + self.serialize_safe_token(token, token_type, value); + } + + /// Serialize a token using its raw source text, with JSON encoding. + /// + /// Used for tokens whose values may contain backslashes, quotes, or control characters + /// (escaped identifiers, string literals, template literals, JSXText). + fn emit_unsafe_token(&mut self, token: &Token, token_type: TokenType) { + let value = &self.source_text[token.start() as usize..token.end() as usize]; + self.serialize_unsafe_token(token, token_type, value); + } + + /// Serialize a token whose value is guaranteed JSON-safe, skipping JSON-encoding. 
+ fn serialize_safe_token(&mut self, token: &Token, token_type: TokenType, value: &str) { + let span = self.get_utf16_span(token); + self.seq.serialize_element(&ESTreeSafeToken { token_type, value, span }); + } + + /// Serialize a token whose value may not be JSON-safe. + fn serialize_unsafe_token(&mut self, token: &Token, token_type: TokenType, value: &str) { + let span = self.get_utf16_span(token); + self.seq.serialize_element(&ESTreeUnsafeToken { token_type, value, span }); + } + + /// Get UTF-16 span for a token. + fn get_utf16_span(&mut self, token: &Token) -> Span { + let mut span = Span::new(token.start(), token.end()); + if let Some(converter) = self.span_converter.as_mut() { + converter.convert_span(&mut span); + } + span + } + + /// Serialize all remaining tokens and close the sequence. + /// + /// Tokens emitted here are guaranteed JSON-safe because all non-JSON-safe token types + /// (escaped identifiers, strings, templates, regexes, JSXText) are consumed by their own visitors. + fn finish(mut self) { + while let Some(token) = self.tokens.next() { + self.emit_default_token(token); + } + self.seq.end(); + } +} + +impl Context for JsonContext<'_, O, S> { + /// JSX state type for tracking when to emit JSX identifiers. + /// Inherited from config. + type JSXState = O::JSXState; + + /// Returns `true` if serializing in TS style. + #[expect(clippy::inline_always)] + #[inline(always)] + fn is_ts(&self) -> bool { + self.options.is_ts() + } + + /// Get reference to [`JSXState`] for the serializer/updater. + #[expect(clippy::inline_always)] + #[inline(always)] + fn jsx_state(&self) -> &Self::JSXState { + &self.jsx_state + } + + /// Get mutable reference to [`JSXState`] for the serializer/updater. 
+ #[expect(clippy::inline_always)] + #[inline(always)] + fn jsx_state_mut(&mut self) -> &mut Self::JSXState { + &mut self.jsx_state + } + + /// Emit the token at `start` as `Identifier`, unless it's a legacy keyword and serializing in JS style + /// (in which case it gets `Keyword` type). + fn emit_identifier_at(&mut self, start: u32, name: &str) { + let token = self.advance_to(start); + + let token_type = + if self.is_js() && matches!(token.kind(), Kind::Yield | Kind::Let | Kind::Static) { + TokenType::new("Keyword") + } else { + TokenType::new("Identifier") + }; + + // `name` is from AST, has escapes decoded by the parser, and is JSON-safe. + // Use it in most cases — if token is not marked as escaped, it's JSON-safe, so can skip JSON encoding. + // When `self.options.decode_identifier_escapes` is `true`, token `value` should *always* be + // the unescaped version, so can also use `name` from AST node and skip JSON encoding. + // Only fall back to raw source text when the token contains escapes *and* decoding is disabled, + // since escape sequences contain `\` which needs JSON escaping. + // Escaped identifiers are extremely rare, so handle them in `#[cold]` branch. + if self.is_js() || !token.escaped() { + self.serialize_safe_token(token, token_type, name); + } else { + #[cold] + #[inline(never)] + fn emit( + ctx: &mut JsonContext<'_, O, S>, + token: &Token, + token_type: TokenType, + ) { + ctx.emit_unsafe_token(token, token_type); + } + emit(self, token, token_type); + } + } + + /// Emit the `this` keyword at `start` as `Identifier`. + /// Used for `this` in TS type queries and TS `this` parameters. + fn emit_this_identifier_at(&mut self, start: u32) { + self.emit_safe_token_at(start, TokenType::new("Identifier"), "this"); + } + + /// Emit the token at `start` as `JSXIdentifier`. + /// JSX identifier names are guaranteed JSON-safe (no unicode escapes, no special characters). 
+ fn emit_jsx_identifier_at(&mut self, start: u32, name: &str) { + self.emit_safe_token_at(start, TokenType::new("JSXIdentifier"), name); + } + + /// Emit the token at `start` as `PrivateIdentifier`. + fn emit_private_identifier_at(&mut self, start: u32, name: &str) { + let token = self.advance_to(start); + + // `identifier.name` has `#` stripped and escapes decoded by the parser, and is JSON-safe. + // Use it in most cases — if token is not marked as escaped, it's JSON-safe, so can skip JSON encoding. + // When `self.is_js()` is `true`, token `value` should *always* be the unescaped version, + // so can also use `name` from AST node and skip JSON encoding. + // Only fall back to raw source text when the token contains escapes *and* decoding is disabled, + // since escape sequences contain `\` which needs JSON escaping. + // Escaped identifiers are extremely rare, so handle them in `#[cold]` branch. + if self.is_js() || !token.escaped() { + self.serialize_safe_token(token, TokenType::new("PrivateIdentifier"), name); + } else { + #[cold] + #[inline(never)] + fn emit( + ctx: &mut JsonContext<'_, O, S>, + token: &Token, + ) { + // Strip leading `#` + let value = &ctx.source_text[token.start() as usize + 1..token.end() as usize]; + ctx.serialize_unsafe_token(token, TokenType::new("PrivateIdentifier"), value); + } + emit(self, token); + } + } + + /// Emit the token at `start` as `JSXText`. + fn emit_jsx_text_at(&mut self, start: u32) { + let token = self.advance_to(start); + self.emit_unsafe_token(token, TokenType::new("JSXText")); + } + + /// Emit the token at `start` as the specified token type, + /// where the token's `value` may not be JSON-safe. + fn emit_unsafe_token_at(&mut self, start: u32, token_type: TokenType) { + let token = self.advance_to(start); + self.emit_unsafe_token(token, token_type); + } + + /// Emit token for `RegExpLiteral`. 
+ fn emit_regexp(&mut self, regexp: &RegExpLiteral<'_>) { + let token = self.advance_to(regexp.span.start); + + let value = regexp.raw.as_deref().unwrap(); + let pattern = regexp.regex.pattern.text.as_str(); + + // Flags start after opening `/`, pattern, and closing `/` + let flags = &value[pattern.len() + 2..]; + let regex = RegExpData { pattern, flags }; + + let span = self.get_utf16_span(token); + self.seq.serialize_element(&ESTreeRegExpToken { value, regex, span }); + } + + /// Emit template quasis interleaved with their interpolated parts (expressions or TS types). + /// + /// `TemplateElement.span` excludes delimiters (parser adjusts `start + 1`), + /// so subtract 1 to get the token start position. + fn walk_template_quasis_interleaved( + visitor: &mut Visitor, + quasis: &[TemplateElement<'_>], + mut visit_interpolation: impl FnMut(&mut Visitor, &I), + interpolations: &[I], + ) { + // Quasis and interpolations must be walked in interleaved source order, + // because `advance_to` consumes tokens sequentially. + // The default `walk_template_literal` visits all quasis first, then all expressions, + // which would break source-order token consumption. + let mut quasis = quasis.iter(); + + // First quasi (TemplateHead or NoSubstitutionTemplate). + // `TemplateElement.span` excludes delimiters (parser adjusts `start + 1`), + // so subtract 1 to get the token start position. 
+ if let Some(quasi) = quasis.next() { + visitor.ctx.emit_unsafe_token_at(quasi.span.start - 1, TokenType::new("Template")); + } + + // Remaining quasis interleaved with interpolations + for (interpolation, quasi) in interpolations.iter().zip(quasis) { + visit_interpolation(visitor, interpolation); + visitor.ctx.emit_unsafe_token_at(quasi.span.start - 1, TokenType::new("Template")); + } + } +} + +// ============================================================================================== +// Token serialization structs (used only by `JsonContext`) +// ============================================================================================== + /// Token whose `value` is guaranteed JSON-safe. /// /// Used for identifiers, keywords, punctuators, numbers, booleans, `null` — @@ -191,182 +609,181 @@ impl ESTree for RegExpData<'_> { } } -/// Visitor that walks the AST and serializes tokens as it encounters them. -/// -/// Tokens are consumed from `tokens` iterator in source order. When a visitor method encounters -/// an AST node that requires a token type override, all preceding tokens are serialized -/// with their default types, then the overridden token is serialized with its corrected type. -/// After the AST walk, any remaining tokens are serialized with default types. +// ============================================================================================== +// `UpdateContext` — in-place token `Kind` mutation +// ============================================================================================== + +/// In-place kind update context. /// -/// This works because AST visitation occurs in source order, so same order as tokens in the iterator. -struct ESTreeTokenContext<'b, O: ESTreeTokenConfig, S: SequenceSerializer> { - /// JSON sequence serializer. - /// Tokens are serialized into this serializer. 
- seq: S, - /// Tokens iterator - tokens: Iter<'b, Token>, - /// Source text (for extracting token values) - source_text: &'b str, +/// Updates token kinds so that `get_token_type(token.kind())` returns +/// the correct ESTree token type without needing AST context. +/// Also converts token spans from UTF-8 byte offsets to UTF-16 offsets. +struct UpdateContext<'b, O: ESTreeTokenConfig> { + /// Mutable tokens iterator + tokens: IterMut<'b, Token>, /// Span converter for UTF-8 to UTF-16 conversion. /// `None` if source is ASCII-only. span_converter: Option>, - /// Options + /// Options controlling JS/TS style differences options: O, - // JSX state. Used when outputting tokens in TS style. + /// JSX state tracking jsx_state: O::JSXState, } -impl<'b, O: ESTreeTokenConfig, S: SequenceSerializer> ESTreeTokenContext<'b, O, S> { - /// Emit the token at `start` as `Identifier`, unless it's a legacy keyword and serializing in JS style - /// (in which case it gets `Keyword` type). - /// - /// `name` is the decoded identifier name from the AST node. - /// When the token has no escapes, `name` points into the source text, same as slicing it. - /// When the token has escapes and using JS style, `name` provides the decoded value. - /// Only when escapes are present but decoding is disabled (TS style) do we need to fall back to - /// slicing the raw source text (preserving the escape sequences in the output). - fn emit_identifier_at(&mut self, start: u32, name: &str) { - let token = self.advance_to(start); - let token_type = if self.options.is_js() - && matches!(token.kind(), Kind::Yield | Kind::Let | Kind::Static) - { - TokenType::new("Keyword") - } else { - TokenType::new("Identifier") - }; +impl UpdateContext<'_, O> { + /// Advance iterator to the token at `start`, converting spans along the way. + /// Skipped tokens are not modified (they already have the correct kind), + /// but their spans are converted from UTF-8 to UTF-16. 
+ fn advance_to(&mut self, start: u32) -> &mut Token { + let Self { tokens, span_converter, .. } = self; + for token in &mut *tokens { + debug_assert!( + token.start() <= start, + "Expected token at position {start}, found token at position {}", + token.start(), + ); + + let is_target = token.start() == start; + + // Convert span from UTF-8 byte offsets to UTF-16 offsets + if let Some(converter) = span_converter { + let mut span = token.span(); + converter.convert_span(&mut span); + token.set_span(span); + } - // `identifier.name` has escapes decoded by the parser, and is JSON-safe. - // Use it in most cases — if token is not marked as escaped, it's JSON-safe, so can skip JSON encoding. - // When `self.options.decode_identifier_escapes` is `true`, token `value` should *always* be - // the unescaped version, so can also use `name` from AST node and skip JSON encoding. - // Only fall back to raw source text when the token contains escapes *and* decoding is disabled, - // since escape sequences contain `\` which needs JSON escaping. - // Escaped identifiers are extremely rare, so handle them in `#[cold]` branch. - if self.options.is_js() || !token.escaped() { - self.serialize_safe_token(token, token_type, name); - } else { - #[cold] - #[inline(never)] - fn emit( - ctx: &mut ESTreeTokenContext<'_, O, S>, - token: &Token, - token_type: TokenType, - ) { - ctx.emit_unsafe_token(token, token_type); + if is_target { + return token; } - emit(self, token, token_type); } + unreachable!("Expected token at position {start}"); } - /// Emit the `this` keyword at `start` as `Identifier`. - /// Used for `this` in TS type queries and TS `this` parameters. - fn emit_this_identifier_at(&mut self, start: u32) { - self.emit_safe_token_at(start, TokenType::new("Identifier"), "this"); + /// Convert remaining token spans from UTF-8 byte offsets to UTF-16 offsets. 
+ fn finish(self) { + if let Some(mut converter) = self.span_converter { + for token in self.tokens { + let mut span = token.span(); + converter.convert_span(&mut span); + token.set_span(span); + } + } } +} - /// Emit the token at `start` as `JSXIdentifier`. - /// JSX identifier names are guaranteed JSON-safe (no unicode escapes, no special characters). - fn emit_jsx_identifier_at(&mut self, start: u32, name: &str) { - self.emit_safe_token_at(start, TokenType::new("JSXIdentifier"), name); +impl Context for UpdateContext<'_, O> { + /// JSX state type for tracking when to emit JSX identifiers. + /// Inherited from config. + type JSXState = O::JSXState; + + /// Returns `true` if serializing in TS style. + #[expect(clippy::inline_always)] + #[inline(always)] + fn is_ts(&self) -> bool { + self.options.is_ts() } - /// Emit the token at `start` as the specified token type, - /// where the token's `value` is guaranteed JSON-safe. - fn emit_safe_token_at(&mut self, start: u32, token_type: TokenType, value: &str) { - let token = self.advance_to(start); - self.serialize_safe_token(token, token_type, value); + /// Get reference to [`JSXState`] for the serializer/updater. + #[expect(clippy::inline_always)] + #[inline(always)] + fn jsx_state(&self) -> &Self::JSXState { + &self.jsx_state } - /// Emit the token at `start` as the specified token type, - /// where the token's `value` may not be JSON-safe. - fn emit_unsafe_token_at(&mut self, start: u32, token_type: TokenType) { - let token = self.advance_to(start); - self.emit_unsafe_token(token, token_type); + /// Get mutable reference to [`JSXState`] for the serializer/updater. + #[expect(clippy::inline_always)] + #[inline(always)] + fn jsx_state_mut(&mut self) -> &mut Self::JSXState { + &mut self.jsx_state } - /// Consume all tokens before `start` (emitting them with default types), - /// and return the token at `start`. - /// - /// The returned token does not have its `Span` converted to UTF-16 before returning it. 
- /// - /// Tokens emitted here are guaranteed JSON-safe because all non-JSON-safe token types - /// (strings, templates, regexes, JSXText) are dealt with by their own visitors. - fn advance_to(&mut self, start: u32) -> &'b Token { - while let Some(token) = self.tokens.next() { - if token.start() < start { - self.emit_safe_token(token); - } else { - debug_assert_eq!( - token.start(), - start, - "Expected token at position {start}, found token at position {}", - token.start(), - ); - return token; - } + /// Set `Kind` of the token at `start` to `Identifier`. + /// In JS mode, if it's a `yield`, `let`, or `static` keyword, leave it as a `Keyword` token. + fn emit_identifier_at(&mut self, start: u32, _name: &str) { + let is_ts = self.is_ts(); + + let token = self.advance_to(start); + + // In JS style, `yield` / `let` / `static` used as identifiers should remain as keywords + if is_ts || !matches!(token.kind(), Kind::Yield | Kind::Let | Kind::Static) { + token.set_kind(Kind::Ident); } - unreachable!("Expected token at position {start}"); } - /// Serialize a single token using its raw source text, skipping JSON encoding. - /// - /// Used for tokens whose values are guaranteed JSON-safe - /// (punctuators, keywords, numbers, booleans, `null`). - /// - /// The token's type is determined by its `Kind`. - fn emit_safe_token(&mut self, token: &Token) { - let token_type = get_token_type(token.kind()); - let value = &self.source_text[token.start() as usize..token.end() as usize]; - self.serialize_safe_token(token, token_type, value); + /// Set `Kind` of the token at `start` to `Identifier`. + fn emit_this_identifier_at(&mut self, start: u32) { + let token = self.advance_to(start); + token.set_kind(Kind::Ident); } - /// Serialize a single token using its raw source text, with JSON encoding. - /// Used for tokens whose values may contain backslashes, quotes, or control characters - /// (escaped identifiers, string literals, template literals, JSXText). 
- fn emit_unsafe_token(&mut self, token: &Token, token_type: TokenType) { - let value = &self.source_text[token.start() as usize..token.end() as usize]; - self.serialize_unsafe_token(token, token_type, value); + /// Set `Kind` of the token at `start` to `JSXIdentifier`. + fn emit_jsx_identifier_at(&mut self, start: u32, _name: &str) { + let token = self.advance_to(start); + token.set_kind(Kind::JSXIdentifier); } - /// Serialize a token whose value is guaranteed JSON-safe, skipping JSON-encoding. - fn serialize_safe_token(&mut self, token: &Token, token_type: TokenType, value: &str) { - let span = self.get_utf16_span(token); - self.seq.serialize_element(&ESTreeSafeToken { token_type, value, span }); + /// Handle `PrivateIdentifier` token at `start` (no-op). + #[inline(always)] + fn emit_private_identifier_at(&mut self, _start: u32, _name: &str) { + // No-op: token already has `Kind::PrivateIdentifier`. + // The iterator will skip past this token on the next `advance_to` call. } - /// Serialize a token whose value may not be JSON-safe. - fn serialize_unsafe_token(&mut self, token: &Token, token_type: TokenType, value: &str) { - let span = self.get_utf16_span(token); - self.seq.serialize_element(&ESTreeUnsafeToken { token_type, value, span }); + /// Set `Kind` of the token at `start` to `JSXText`. + fn emit_jsx_text_at(&mut self, start: u32) { + let token = self.advance_to(start); + token.set_kind(Kind::JSXText); } - /// Get UTF-16 span for a token. - fn get_utf16_span(&mut self, token: &Token) -> Span { - let mut span = Span::new(token.start(), token.end()); - if let Some(converter) = self.span_converter.as_mut() { - converter.convert_span(&mut span); - } - span - } + /// Handle token at `start` (no-op). + #[inline(always)] + fn emit_unsafe_token_at(&mut self, _start: u32, _token_type: TokenType) {} - /// Serialize all remaining tokens and close the sequence. 
- /// - /// Tokens emitted here are guaranteed JSON-safe because all non-JSON-safe token types - /// (escaped identifiers, strings, templates, regexes, JSXText) are consumed by their own visitors. - fn finish(mut self) { - while let Some(token) = self.tokens.next() { - self.emit_safe_token(token); + /// Handle `RegExpLiteral` (no-op). + #[inline(always)] + fn emit_regexp(&mut self, _regexp: &RegExpLiteral<'_>) {} + + /// Walk template interpolations (expressions or TS types). + fn walk_template_quasis_interleaved( + visitor: &mut Visitor, + _quasis: &[TemplateElement<'_>], + mut visit_interpolation: impl FnMut(&mut Visitor, &I), + interpolations: &[I], + ) { + // Quasis don't need kind changes, so skip them and only visit interpolations + for interpolation in interpolations { + visit_interpolation(visitor, interpolation); } - self.seq.end(); } } -impl<'a, O: ESTreeTokenConfig, S: SequenceSerializer> Visit<'a> for ESTreeTokenContext<'_, O, S> { +// ============================================================================================== +// `Visitor` — the visitor +// ============================================================================================== + +/// Visitor that walks the AST and delegates token processing to a [`Context`]. +/// +/// AST visitation is in source order, matching the order of tokens in the iterator. +/// +/// Tokens are consumed from `tokens` iterator in source order. When visitor method encounters +/// an AST node that requires a token type override, all preceding tokens are emitted +/// with their default types, then the overridden token is emitted with its corrected type. +/// After the AST walk, any remaining tokens are emitted with default types. +/// +/// This wrapper is needed because Rust's orphan rules prevent implementing the foreign [`Visit`] trait +/// directly on [`Context`] implementors (which are generic over `O: ESTreeTokenConfig`). +/// `Visitor` is a local type, so it can implement [`Visit`]. 
+#[repr(transparent)] +struct Visitor { + ctx: C, +} + +impl<'a, C: Context> Visit<'a> for Visitor { fn visit_ts_type_name(&mut self, type_name: &TSTypeName<'a>) { // `this` is emitted as `Identifier` token instead of `Keyword` match type_name { TSTypeName::ThisExpression(this_expr) => { - self.emit_this_identifier_at(this_expr.span.start); + self.ctx.emit_this_identifier_at(this_expr.span.start); } TSTypeName::IdentifierReference(ident) => { self.visit_identifier_reference(ident); @@ -380,7 +797,7 @@ impl<'a, O: ESTreeTokenConfig, S: SequenceSerializer> Visit<'a> for ESTreeTokenC fn visit_ts_import_type(&mut self, import_type: &TSImportType<'a>) { // Manual walk. // * `source` is a `StringLiteral` — visit to ensure it's emitted with JSON encoding - // (string values are not JSON-safe). + // (string values are not JSON-safe). No-op in update mode. // * `options` is an `ObjectExpression`. Manually walk each property, but don't visit the key if it's `with`, // as it needs to remain a `Keyword` token, not get converted to `Identifier`. // * `qualifier` and `type_arguments` are visited as usual. 
@@ -416,72 +833,39 @@ impl<'a, O: ESTreeTokenConfig, S: SequenceSerializer> Visit<'a> for ESTreeTokenC } fn visit_identifier_name(&mut self, identifier: &IdentifierName<'a>) { - if self.options.is_ts() && self.jsx_state.should_emit_jsx_identifier() { - self.emit_jsx_identifier_at(identifier.span.start, &identifier.name); + if self.ctx.is_ts() && self.ctx.jsx_state().should_emit_jsx_identifier() { + self.ctx.emit_jsx_identifier_at(identifier.span.start, &identifier.name); } else { - self.emit_identifier_at(identifier.span.start, &identifier.name); + self.ctx.emit_identifier_at(identifier.span.start, &identifier.name); } } fn visit_identifier_reference(&mut self, identifier: &IdentifierReference<'a>) { - if self.options.is_ts() && self.jsx_state.should_emit_jsx_identifier() { - self.emit_jsx_identifier_at(identifier.span.start, &identifier.name); + if self.ctx.is_ts() && self.ctx.jsx_state().should_emit_jsx_identifier() { + self.ctx.emit_jsx_identifier_at(identifier.span.start, &identifier.name); } else { - self.emit_identifier_at(identifier.span.start, &identifier.name); + self.ctx.emit_identifier_at(identifier.span.start, &identifier.name); } } fn visit_binding_identifier(&mut self, identifier: &BindingIdentifier<'a>) { - self.emit_identifier_at(identifier.span.start, &identifier.name); + self.ctx.emit_identifier_at(identifier.span.start, &identifier.name); } fn visit_label_identifier(&mut self, identifier: &LabelIdentifier<'a>) { - self.emit_identifier_at(identifier.span.start, &identifier.name); + self.ctx.emit_identifier_at(identifier.span.start, &identifier.name); } fn visit_private_identifier(&mut self, identifier: &PrivateIdentifier<'a>) { - let token = self.advance_to(identifier.span.start); - - // `identifier.name` has `#` stripped and escapes decoded by the parser, and is JSON-safe. - // Use it in most cases — if token is not marked as escaped, it's JSON-safe, so can skip JSON encoding. 
- // When `self.is_js()` is `true`, token `value` should *always* be the unescaped version, - // so can also use `name` from AST node and skip JSON encoding. - // Only fall back to raw source text when the token contains escapes *and* decoding is disabled, - // since escape sequences contain `\` which needs JSON escaping. - // Escaped identifiers are extremely rare, so handle them in `#[cold]` branch. - if self.options.is_js() || !token.escaped() { - self.serialize_safe_token(token, TokenType::new("PrivateIdentifier"), &identifier.name); - } else { - #[cold] - #[inline(never)] - fn emit( - ctx: &mut ESTreeTokenContext<'_, O, S>, - token: &Token, - ) { - // Strip leading `#` - let value = &ctx.source_text[token.start() as usize + 1..token.end() as usize]; - ctx.serialize_unsafe_token(token, TokenType::new("PrivateIdentifier"), value); - } - emit(self, token); - } + self.ctx.emit_private_identifier_at(identifier.span.start, &identifier.name); } fn visit_reg_exp_literal(&mut self, regexp: &RegExpLiteral<'a>) { - let token = self.advance_to(regexp.span.start); - - let value = regexp.raw.as_deref().unwrap(); - let pattern = regexp.regex.pattern.text.as_str(); - - // Flags start after opening `/`, pattern, and closing `/` - let flags = &value[pattern.len() + 2..]; - let regex = RegExpData { pattern, flags }; - - let span = self.get_utf16_span(token); - self.seq.serialize_element(&ESTreeRegExpToken { value, regex, span }); + self.ctx.emit_regexp(regexp); } fn visit_ts_this_parameter(&mut self, parameter: &TSThisParameter<'a>) { - self.emit_this_identifier_at(parameter.this_span.start); + self.ctx.emit_this_identifier_at(parameter.this_span.start); walk::walk_ts_this_parameter(self, parameter); } @@ -528,12 +912,12 @@ impl<'a, O: ESTreeTokenConfig, S: SequenceSerializer> Visit<'a> for ESTreeTokenC } fn visit_jsx_identifier(&mut self, identifier: &JSXIdentifier<'a>) { - self.emit_jsx_identifier_at(identifier.span.start, &identifier.name); + 
self.ctx.emit_jsx_identifier_at(identifier.span.start, &identifier.name); } fn visit_jsx_element_name(&mut self, name: &JSXElementName<'a>) { if let JSXElementName::IdentifierReference(identifier) = name { - self.emit_jsx_identifier_at(identifier.span.start, &identifier.name); + self.ctx.emit_jsx_identifier_at(identifier.span.start, &identifier.name); } else { walk::walk_jsx_element_name(self, name); } @@ -541,51 +925,54 @@ impl<'a, O: ESTreeTokenConfig, S: SequenceSerializer> Visit<'a> for ESTreeTokenC fn visit_jsx_member_expression_object(&mut self, object: &JSXMemberExpressionObject<'a>) { if let JSXMemberExpressionObject::IdentifierReference(identifier) = object { - self.emit_jsx_identifier_at(identifier.span.start, &identifier.name); + self.ctx.emit_jsx_identifier_at(identifier.span.start, &identifier.name); } else { walk::walk_jsx_member_expression_object(self, object); } } fn visit_jsx_namespaced_name(&mut self, name: &JSXNamespacedName<'a>) { - if self.options.is_js() { - self.emit_jsx_identifier_at(name.namespace.span.start, &name.namespace.name); - self.emit_jsx_identifier_at(name.name.span.start, &name.name.name); + if self.ctx.is_js() { + self.ctx.emit_jsx_identifier_at(name.namespace.span.start, &name.namespace.name); + self.ctx.emit_jsx_identifier_at(name.name.span.start, &name.name.name); } else { // In TS mode, these tokens retain their default type (`Identifier`) } } fn visit_jsx_expression_container(&mut self, container: &JSXExpressionContainer<'a>) { - self.jsx_state.enter_jsx_expression(); + self.ctx.jsx_state_mut().enter_jsx_expression(); walk::walk_jsx_expression_container(self, container); - self.jsx_state.exit_jsx_expression(); + self.ctx.jsx_state_mut().exit_jsx_expression(); } fn visit_member_expression(&mut self, member_expr: &MemberExpression<'a>) { - self.jsx_state.enter_member_expression(member_expr); + self.ctx.jsx_state_mut().enter_member_expression(member_expr); walk::walk_member_expression(self, member_expr); - 
self.jsx_state.exit_member_expression(member_expr); + self.ctx.jsx_state_mut().exit_member_expression(member_expr); } fn visit_jsx_spread_attribute(&mut self, attribute: &JSXSpreadAttribute<'a>) { - self.jsx_state.enter_jsx_expression(); + self.ctx.jsx_state_mut().enter_jsx_expression(); walk::walk_jsx_spread_attribute(self, attribute); - self.jsx_state.exit_jsx_expression(); + self.ctx.jsx_state_mut().exit_jsx_expression(); } fn visit_jsx_spread_child(&mut self, spread_child: &JSXSpreadChild<'a>) { - self.jsx_state.enter_jsx_expression(); + self.ctx.jsx_state_mut().enter_jsx_expression(); walk::walk_jsx_spread_child(self, spread_child); - self.jsx_state.exit_jsx_expression(); + self.ctx.jsx_state_mut().exit_jsx_expression(); } fn visit_string_literal(&mut self, literal: &StringLiteral<'a>) { - self.emit_unsafe_token_at(literal.span.start, TokenType::new("String")); + // No-op in update mode - token's `Kind` is already `String` + self.ctx.emit_unsafe_token_at(literal.span.start, TokenType::new("String")); } fn visit_jsx_text(&mut self, text: &JSXText<'a>) { - self.emit_unsafe_token_at(text.span.start, TokenType::new("JSXText")); + // Use `emit_unsafe_token_at` not `emit_jsx_text_at`, as the token's `Kind` is already `JSXText`, + // so no-op in update mode + self.ctx.emit_unsafe_token_at(text.span.start, TokenType::new("JSXText")); } fn visit_jsx_attribute(&mut self, attribute: &JSXAttribute<'a>) { @@ -595,7 +982,9 @@ impl<'a, O: ESTreeTokenConfig, S: SequenceSerializer> Visit<'a> for ESTreeTokenC self.visit_jsx_attribute_name(&attribute.name); match &attribute.value { Some(JSXAttributeValue::StringLiteral(string_literal)) => { - self.emit_unsafe_token_at(string_literal.span.start, TokenType::new("JSXText")); + // Use `emit_jsx_text_at` not `emit_unsafe_token_at`, as the token `Kind` + // needs to be updated to `JSXText` in update mode + self.ctx.emit_jsx_text_at(string_literal.span.start); } Some(value) => self.visit_jsx_attribute_value(value), None => {} @@ 
-603,11 +992,8 @@ impl<'a, O: ESTreeTokenConfig, S: SequenceSerializer> Visit<'a> for ESTreeTokenC } fn visit_template_literal(&mut self, literal: &TemplateLiteral<'a>) { - // Manual walk. - // Interleave quasis and expressions in source order. - // The default walk visits all quasis first, then all expressions, - // which would break `advance_to`'s source-order token consumption. - self.emit_template_quasis_interleaved( + C::walk_template_quasis_interleaved( + self, &literal.quasis, Visit::visit_expression, &literal.expressions, @@ -615,66 +1001,11 @@ impl<'a, O: ESTreeTokenConfig, S: SequenceSerializer> Visit<'a> for ESTreeTokenC } fn visit_ts_template_literal_type(&mut self, literal: &TSTemplateLiteralType<'a>) { - // Same as `visit_template_literal` but with TS types instead of expressions - self.emit_template_quasis_interleaved( + C::walk_template_quasis_interleaved( + self, &literal.quasis, Visit::visit_ts_type, &literal.types, ); } } - -impl ESTreeTokenContext<'_, O, S> { - /// Emit template quasis interleaved with their interpolated parts (expressions or TS types). - /// - /// `TemplateElement.span` excludes delimiters (parser adjusts `start + 1`), - /// so subtract 1 to get the token start position. - fn emit_template_quasis_interleaved( - &mut self, - quasis: &[TemplateElement<'_>], - mut visit_interpolation: impl FnMut(&mut Self, &I), - interpolations: &[I], - ) { - let mut quasis = quasis.iter(); - - // First quasi (TemplateHead or NoSubstitutionTemplate) - if let Some(quasi) = quasis.next() { - self.emit_unsafe_token_at(quasi.span.start - 1, TokenType::new("Template")); - } - - // Remaining quasis interleaved with interpolations - for (interpolation, quasi) in interpolations.iter().zip(quasis) { - visit_interpolation(self, interpolation); - self.emit_unsafe_token_at(quasi.span.start - 1, TokenType::new("Template")); - } - } -} - -/// Get `TokenType` for a token `Kind`. 
-fn get_token_type(kind: Kind) -> TokenType { - // Tokens with these `Kind`s are always consumed by specific visitors and should never reach here - debug_assert!( - !matches!( - kind, - Kind::Str - | Kind::RegExp - | Kind::JSXText - | Kind::PrivateIdentifier - | Kind::NoSubstitutionTemplate - | Kind::TemplateHead - | Kind::TemplateMiddle - | Kind::TemplateTail - ), - "Token kind {kind:?} should be consumed by its visitor, and not reach `get_token_type`", - ); - - match kind { - Kind::Ident | Kind::Await => TokenType::new("Identifier"), - Kind::True | Kind::False => TokenType::new("Boolean"), - Kind::Null => TokenType::new("Null"), - _ if kind.is_number() => TokenType::new("Numeric"), - _ if kind.is_contextual_keyword() => TokenType::new("Identifier"), - _ if kind.is_any_keyword() => TokenType::new("Keyword"), - _ => TokenType::new("Punctuator"), - } -} diff --git a/crates/oxc_parser/src/lexer/kind.rs b/crates/oxc_parser/src/lexer/kind.rs index f2b5eb1a32400..994db643b4e6b 100644 --- a/crates/oxc_parser/src/lexer/kind.rs +++ b/crates/oxc_parser/src/lexer/kind.rs @@ -199,6 +199,9 @@ pub enum Kind { PrivateIdentifier, // JSX JSXText, + // `JSXIdentifier` tokens are never produced by lexer. + // Only used in ESTree conversion. 
+ JSXIdentifier, // Decorator At, } @@ -675,6 +678,7 @@ impl Kind { TemplateTail => "}", PrivateIdentifier => "#identifier", JSXText => "jsx", + JSXIdentifier => "jsx_identifier", At => "@", Assert => "assert", Any => "any", diff --git a/tasks/benchmark/benches/parser.rs b/tasks/benchmark/benches/parser.rs index a5931c2150f32..c7bfc5b6f7cc4 100644 --- a/tasks/benchmark/benches/parser.rs +++ b/tasks/benchmark/benches/parser.rs @@ -1,7 +1,7 @@ use oxc_allocator::Allocator; use oxc_ast_visit::utf8_to_utf16::Utf8ToUtf16; use oxc_benchmark::{BenchmarkId, Criterion, black_box, criterion_group, criterion_main}; -use oxc_estree_tokens::{ESTreeTokenOptionsJS, to_estree_tokens_json}; +use oxc_estree_tokens::{ESTreeTokenOptionsJS, to_estree_tokens_json, update_tokens}; use oxc_parser::{ParseOptions, Parser, ParserReturn, config::RuntimeParserConfig}; use oxc_tasks_common::TestFiles; @@ -186,23 +186,16 @@ fn bench_estree_tokens_raw(criterion: &mut Criterion) { }) .with_config(config) .parse(); - let ParserReturn { program, tokens, .. } = ret; + let ParserReturn { program, mut tokens, .. } = ret; // Creating span converter is not performed in measured section, as we only want to measure tokens. // Span converter needs to be created anyway for converting spans in AST. let span_converter = Utf8ToUtf16::new(program.source_text); runner.run(|| { - let tokens_json = to_estree_tokens_json( - &tokens, - &program, - program.source_text, - &span_converter, - ESTreeTokenOptionsJS, - ); - let tokens_json = black_box(tokens_json); - // Allocate tokens JSON into arena, same as linter and NAPI parser package do - let _tokens_json = allocator.alloc_str(&tokens_json); + update_tokens(&mut tokens, &program, &span_converter, ESTreeTokenOptionsJS); + + black_box(tokens); program });