From b3b2d305f99dff525eec436f46ee2c3993fdc861 Mon Sep 17 00:00:00 2001 From: overlookmotel <557937+overlookmotel@users.noreply.github.com> Date: Tue, 24 Feb 2026 02:54:46 +0000 Subject: [PATCH] perf(parser): introduce `ParserConfig` (#19637) ### What this PR does Introduce `ParserConfig` trait (another try at #16785). The aim is to remove the large performance regression in parser that #19497 created, by making whether the parser generates tokens or not a compile-time option. `ParserConfig::tokens` method replaces `ParseOptions::collect_tokens` property. The former can be const-folded at compile time, whereas the latter cannot. ### 3 options This PR also introduces 3 different config types that users can pass to the parser: * `NoTokensParserConfig` (default) * `TokensParserConfig` * `RuntimeParserConfig` The first 2 set whether tokens are collected or not at compile time. The last sets it at runtime. All 3 implement `ParserConfig`. `NoTokensParserConfig` is the default, and is what's used in the compiler pipeline. It switches tokens off in the parser, and makes all the tokens-related code dead code, which the compiler eliminates. This makes the ability of the parser to generate tokens zero cost when it's not used (in the compiler pipeline). `TokensParserConfig` is the one to use where you *always* want tokens. This is probably the config that the linter will use. `RuntimeParserConfig` is the one to use when an application decides whether to generate tokens or not at runtime. This config avoids compiling the parser twice, at the cost of runtime checks. This is what the NAPI parser package will use. ### Future extension #### Supporting additional features In future we intend to build the UTF-8 to UTF-16 offsets conversion table in the parser. This will be more performant than searching through the source text for unicode characters in a 2nd pass later on. 
But this feature is only required for uses of the parser where we're interacting with JS side (NAPI parser package, linter with JS plugins). `ParserConfig` can be extended to toggle this feature on/off at compile time or runtime, in the same way as you toggle on/off tokens. #### Options and configs This PR introduces `ParserConfig` but leaves `ParseOptions` as it is. So we now have 2 sets of options, passed to `Parser` with `with_options(...)` and `with_config(...)`. This is confusing. We could merge the 2 by making `ParseOptions` implement `ParserConfig`, so then you'd define all options with one `with_options` call. This would have the side effect of making all other parser options (e.g. `preserve_parens`) able to be set at either runtime or compile time, depending on the use case. For users consuming `oxc_parser` as a library, with specific needs, they could also configure `Parser` to their needs e.g. create a parser which only handles plain JS code with all the code paths for JSX and TS shaken out as dead code. This would likely improve parsing speed significantly for these use cases. ### Implementation details Why a trait instead of a cargo feature? IMO a trait is preferable for the following reasons: 1. We have 3 different use cases we need to support (the 3 provided configs). 3 different Cargo features would be unwieldy. 2. This situation would become far worse once we introduce more features e.g. UTF-8 -> UTF-16 conversion. 3. No problems around feature unification. We found Cargo features caused headaches when we used them in linter for toggling on/off JS plugins support. 4. No clippy errors which only appear when the feature is/isn't disabled, requiring complex `#[cfg_attr(feature = "whatever", expect(clippy::unused_async))]` etc. 
The introduction of a trait does not seem to significantly affect compile time: ``` # Before this PR cargo build -p oxc_parser 15.33s user 3.27s system 254% cpu 7.316 total cargo build -p oxc_parser --release 17.36s user 2.26s system 231% cpu 8.477 total cargo build -p oxc_parser --example parser 18.43s user 3.75s system 271% cpu 8.156 total cargo build -p oxc_parser --example parser --release 32.52s user 2.59s system 180% cpu 19.454 total # After this PR cargo build -p oxc_parser 15.00s user 3.24s system 272% cpu 6.692 total cargo build -p oxc_parser --release 16.71s user 2.12s system 287% cpu 6.539 total cargo build -p oxc_parser --example parser 18.50s user 3.91s system 285% cpu 7.845 total cargo build -p oxc_parser --example parser --release 33.48s user 2.63s system 169% cpu 21.263 total ``` Measured on Mac Mini M4 Pro, `cargo clean` run before each. The difference appears to be mostly within the noise threshold. --- crates/oxc_formatter/src/service/mod.rs | 1 - crates/oxc_parser/src/config.rs | 166 +++++++++++++++++++ crates/oxc_parser/src/cursor.rs | 8 +- crates/oxc_parser/src/error_handler.rs | 6 +- crates/oxc_parser/src/js/arrow.rs | 4 +- crates/oxc_parser/src/js/binding.rs | 4 +- crates/oxc_parser/src/js/class.rs | 4 +- crates/oxc_parser/src/js/declaration.rs | 4 +- crates/oxc_parser/src/js/expression.rs | 4 +- crates/oxc_parser/src/js/function.rs | 4 +- crates/oxc_parser/src/js/grammar.rs | 36 ++-- crates/oxc_parser/src/js/module.rs | 4 +- crates/oxc_parser/src/js/object.rs | 4 +- crates/oxc_parser/src/js/statement.rs | 4 +- crates/oxc_parser/src/jsx/mod.rs | 4 +- crates/oxc_parser/src/lexer/byte_handlers.rs | 81 +++++---- crates/oxc_parser/src/lexer/comment.rs | 4 +- crates/oxc_parser/src/lexer/identifier.rs | 4 +- crates/oxc_parser/src/lexer/jsx.rs | 4 +- crates/oxc_parser/src/lexer/mod.rs | 20 ++- crates/oxc_parser/src/lexer/numeric.rs | 4 +- crates/oxc_parser/src/lexer/punctuation.rs | 5 +- crates/oxc_parser/src/lexer/regex.rs | 4 +- 
crates/oxc_parser/src/lexer/string.rs | 4 +- crates/oxc_parser/src/lexer/template.rs | 15 +- crates/oxc_parser/src/lexer/typescript.rs | 4 +- crates/oxc_parser/src/lexer/unicode.rs | 4 +- crates/oxc_parser/src/lexer/whitespace.rs | 4 +- crates/oxc_parser/src/lib.rs | 48 ++++-- crates/oxc_parser/src/modifiers.rs | 8 +- crates/oxc_parser/src/ts/statement.rs | 4 +- crates/oxc_parser/src/ts/types.rs | 4 +- napi/playground/src/lib.rs | 1 - tasks/benchmark/benches/lexer.rs | 12 +- tasks/coverage/src/tools.rs | 16 +- 35 files changed, 361 insertions(+), 146 deletions(-) create mode 100644 crates/oxc_parser/src/config.rs diff --git a/crates/oxc_formatter/src/service/mod.rs b/crates/oxc_formatter/src/service/mod.rs index 010ca3527a65e..2ade92da4fccd 100644 --- a/crates/oxc_formatter/src/service/mod.rs +++ b/crates/oxc_formatter/src/service/mod.rs @@ -11,7 +11,6 @@ pub fn get_parse_options() -> ParseOptions { allow_v8_intrinsics: true, // `oxc_formatter` expects this to be `false`, otherwise panics preserve_parens: false, - collect_tokens: false, } } diff --git a/crates/oxc_parser/src/config.rs b/crates/oxc_parser/src/config.rs new file mode 100644 index 0000000000000..e13b09eb5dfa3 --- /dev/null +++ b/crates/oxc_parser/src/config.rs @@ -0,0 +1,166 @@ +// All methods are `#[inline(always)]` to ensure compiler removes dead code resulting from static values +#![expect(clippy::inline_always)] + +use std::ops::Index; + +use crate::lexer::{ByteHandler, ByteHandlers, byte_handler_tables}; + +/// Parser config. +/// +/// The purpose of parser config (as opposed to `ParseOptions`) is to allow setting options at either +/// compile time or runtime. 
+/// +/// 3 configs are provided: +/// * [`NoTokensParserConfig`]: Parse without tokens, static (default) +/// * [`TokensParserConfig`]: Parse with tokens, static +/// * [`RuntimeParserConfig`]: Parse with or without tokens, decided at runtime +/// +/// The trade-off is: +/// +/// * The 2 static configs will produce better performance, because compiler can remove code that relates +/// to the other option as dead code, and remove branches. +/// +/// * The runtime config will produce a smaller binary than using 2 different configs in the same application, +/// which would cause 2 polymorphic variants of the parser to be compiled. +/// +/// Advised usage: +/// * If your application uses only a specific set of options, use a static config. +/// * If your application uses multiple sets of options, probably a runtime config is preferable. +/// +/// At present the only option controlled by `ParserConfig` is whether to parse with or without tokens. +/// Other options will be added in future. +/// +/// You can also create your own config by implementing [`ParserConfig`] on a type. +pub trait ParserConfig: Default { + type LexerConfig: LexerConfig; + + fn lexer_config(&self) -> Self::LexerConfig; +} + +/// Parser config for parsing without tokens (default). +/// +/// See [`ParserConfig`] for more details. +#[derive(Copy, Clone, Default)] +pub struct NoTokensParserConfig; + +impl ParserConfig for NoTokensParserConfig { + type LexerConfig = NoTokensLexerConfig; + + #[inline(always)] + fn lexer_config(&self) -> NoTokensLexerConfig { + NoTokensLexerConfig + } +} + +/// Parser config for parsing with tokens. +/// +/// See [`ParserConfig`] for more details. +#[derive(Copy, Clone, Default)] +pub struct TokensParserConfig; + +impl ParserConfig for TokensParserConfig { + type LexerConfig = TokensLexerConfig; + + #[inline(always)] + fn lexer_config(&self) -> TokensLexerConfig { + TokensLexerConfig + } +} + +/// Parser config for parsing with/without tokens, decided at runtime. 
+/// +/// See [`ParserConfig`] for more details. +#[derive(Copy, Clone, Default)] +#[repr(transparent)] +pub struct RuntimeParserConfig { + lexer_config: RuntimeLexerConfig, +} + +impl RuntimeParserConfig { + #[inline(always)] + pub fn new(tokens: bool) -> Self { + Self { lexer_config: RuntimeLexerConfig::new(tokens) } + } +} + +impl ParserConfig for RuntimeParserConfig { + type LexerConfig = RuntimeLexerConfig; + + #[inline(always)] + fn lexer_config(&self) -> RuntimeLexerConfig { + self.lexer_config + } +} + +/// Lexer config. +pub trait LexerConfig: Default { + type ByteHandlers: Index<usize, Output = ByteHandler<Self>>; + + fn tokens(&self) -> bool; + + fn byte_handlers(&self) -> &Self::ByteHandlers; +} + +/// Lexer config for lexing without tokens. +#[derive(Copy, Clone, Default)] +pub struct NoTokensLexerConfig; + +impl LexerConfig for NoTokensLexerConfig { + type ByteHandlers = ByteHandlers<Self>; + + #[inline(always)] + fn tokens(&self) -> bool { + false + } + + #[inline(always)] + fn byte_handlers(&self) -> &Self::ByteHandlers { + &byte_handler_tables::NO_TOKENS + } +} + +/// Lexer config for lexing with tokens. +#[derive(Copy, Clone, Default)] +pub struct TokensLexerConfig; + +impl LexerConfig for TokensLexerConfig { + type ByteHandlers = ByteHandlers<Self>; + + #[inline(always)] + fn tokens(&self) -> bool { + true + } + + #[inline(always)] + fn byte_handlers(&self) -> &Self::ByteHandlers { + &byte_handler_tables::WITH_TOKENS + } +} + +/// Lexer config for lexing with/without tokens, decided at runtime. 
+#[derive(Copy, Clone, Default)] +#[repr(transparent)] +pub struct RuntimeLexerConfig { + tokens: bool, +} + +impl RuntimeLexerConfig { + #[inline(always)] + pub fn new(tokens: bool) -> Self { + Self { tokens } + } +} + +impl LexerConfig for RuntimeLexerConfig { + type ByteHandlers = ByteHandlers<Self>; + + #[inline(always)] + fn tokens(&self) -> bool { + self.tokens + } + + #[inline(always)] + fn byte_handlers(&self) -> &Self::ByteHandlers { + &byte_handler_tables::RUNTIME_TOKENS + } +} diff --git a/crates/oxc_parser/src/cursor.rs b/crates/oxc_parser/src/cursor.rs index 48b500a213272..df7189dba0965 100644 --- a/crates/oxc_parser/src/cursor.rs +++ b/crates/oxc_parser/src/cursor.rs @@ -6,7 +6,7 @@ use oxc_diagnostics::OxcDiagnostic; use oxc_span::{GetSpan, Span}; use crate::{ - Context, ParserImpl, diagnostics, + Context, ParserConfig as Config, ParserImpl, diagnostics, error_handler::FatalError, lexer::{Kind, LexerCheckpoint, LexerContext, Token}, }; @@ -20,7 +20,7 @@ pub struct ParserCheckpoint<'a> { fatal_error: Option<FatalError>, } -impl<'a> ParserImpl<'a> { +impl<'a, C: Config> ParserImpl<'a, C> { #[inline] pub(crate) fn start_span(&self) -> u32 { self.token.start() @@ -327,7 +327,7 @@ impl<'a> ParserImpl<'a> { pub(crate) fn try_parse<T>( &mut self, - func: impl FnOnce(&mut ParserImpl<'a>) -> T, + func: impl FnOnce(&mut ParserImpl<'a, C>) -> T, ) -> Option<T> { let checkpoint = self.checkpoint_with_error_recovery(); let ctx = self.ctx; @@ -341,7 +341,7 @@ impl<'a> ParserImpl<'a> { } } - pub(crate) fn lookahead<U>(&mut self, predicate: impl Fn(&mut ParserImpl<'a>) -> U) -> U { + pub(crate) fn lookahead<U>(&mut self, predicate: impl Fn(&mut ParserImpl<'a, C>) -> U) -> U { let checkpoint = self.checkpoint(); let answer = predicate(self); self.rewind(checkpoint); 
+4,7 @@ use oxc_allocator::Dummy; use oxc_diagnostics::OxcDiagnostic; use oxc_span::Span; -use crate::{ParserImpl, diagnostics, lexer::Kind}; +use crate::{ParserConfig as Config, ParserImpl, diagnostics, lexer::Kind}; /// Fatal parsing error. #[derive(Debug, Clone)] @@ -15,7 +15,7 @@ pub struct FatalError { pub errors_len: usize, } -impl<'a> ParserImpl<'a> { +impl<'a, C: Config> ParserImpl<'a, C> { #[cold] pub(crate) fn set_unexpected(&mut self) { // The lexer should have reported a more meaningful diagnostic @@ -105,7 +105,7 @@ impl<'a> ParserImpl<'a> { // error, we detect these patterns and provide helpful guidance on how to resolve the conflict. // // Inspired by rust-lang/rust#106242 -impl ParserImpl<'_> { +impl ParserImpl<'_, C> { /// Check if the current position looks like a merge conflict marker. /// /// Detects the following Git conflict markers: diff --git a/crates/oxc_parser/src/js/arrow.rs b/crates/oxc_parser/src/js/arrow.rs index 0ef09bf6993fa..b7419df8e7fb6 100644 --- a/crates/oxc_parser/src/js/arrow.rs +++ b/crates/oxc_parser/src/js/arrow.rs @@ -4,7 +4,7 @@ use oxc_span::{FileExtension, GetSpan}; use oxc_syntax::precedence::Precedence; use super::{FunctionKind, Tristate}; -use crate::{Context, ParserImpl, diagnostics, lexer::Kind}; +use crate::{Context, ParserConfig as Config, ParserImpl, diagnostics, lexer::Kind}; struct ArrowFunctionHead<'a> { type_parameters: Option>>, @@ -14,7 +14,7 @@ struct ArrowFunctionHead<'a> { span: u32, } -impl<'a> ParserImpl<'a> { +impl<'a, C: Config> ParserImpl<'a, C> { pub(super) fn try_parse_parenthesized_arrow_function_expression( &mut self, allow_return_type_in_arrow_function: bool, diff --git a/crates/oxc_parser/src/js/binding.rs b/crates/oxc_parser/src/js/binding.rs index 0a227336abcd8..24d2670ada0c6 100644 --- a/crates/oxc_parser/src/js/binding.rs +++ b/crates/oxc_parser/src/js/binding.rs @@ -2,9 +2,9 @@ use oxc_allocator::Box; use oxc_ast::ast::*; use oxc_span::GetSpan; -use crate::{Context, ParserImpl, 
diagnostics, lexer::Kind}; +use crate::{Context, ParserConfig as Config, ParserImpl, diagnostics, lexer::Kind}; -impl<'a> ParserImpl<'a> { +impl<'a, C: Config> ParserImpl<'a, C> { /// `BindingElement` /// `SingleNameBinding` /// `BindingPattern`[?Yield, ?Await] `Initializer`[+In, ?Yield, ?Await]opt diff --git a/crates/oxc_parser/src/js/class.rs b/crates/oxc_parser/src/js/class.rs index 97f4bff014fcb..33d4b4600d3fa 100644 --- a/crates/oxc_parser/src/js/class.rs +++ b/crates/oxc_parser/src/js/class.rs @@ -4,7 +4,7 @@ use oxc_ecmascript::PropName; use oxc_span::{GetSpan, Span}; use crate::{ - Context, ParserImpl, StatementContext, diagnostics, + Context, ParserConfig as Config, ParserImpl, StatementContext, diagnostics, lexer::Kind, modifiers::{ModifierFlags, ModifierKind, Modifiers}, }; @@ -14,7 +14,7 @@ use super::FunctionKind; type ImplementsWithKeywordSpan<'a> = (Span, Vec<'a, TSClassImplements<'a>>); /// Section 15.7 Class Definitions -impl<'a> ParserImpl<'a> { +impl<'a, C: Config> ParserImpl<'a, C> { // `start_span` points at the start of all decoractors and `class` keyword. 
pub(crate) fn parse_class_statement( &mut self, diff --git a/crates/oxc_parser/src/js/declaration.rs b/crates/oxc_parser/src/js/declaration.rs index 3b7323379e59e..de18134af89e7 100644 --- a/crates/oxc_parser/src/js/declaration.rs +++ b/crates/oxc_parser/src/js/declaration.rs @@ -3,9 +3,9 @@ use oxc_ast::ast::*; use oxc_span::GetSpan; use super::VariableDeclarationParent; -use crate::{ParserImpl, StatementContext, diagnostics, lexer::Kind}; +use crate::{ParserConfig as Config, ParserImpl, StatementContext, diagnostics, lexer::Kind}; -impl<'a> ParserImpl<'a> { +impl<'a, C: Config> ParserImpl<'a, C> { pub(crate) fn parse_let(&mut self, stmt_ctx: StatementContext) -> Statement<'a> { let span = self.start_span(); diff --git a/crates/oxc_parser/src/js/expression.rs b/crates/oxc_parser/src/js/expression.rs index e89b0db64e5cb..1d61b87bf02e5 100644 --- a/crates/oxc_parser/src/js/expression.rs +++ b/crates/oxc_parser/src/js/expression.rs @@ -17,12 +17,12 @@ use super::{ }, }; use crate::{ - Context, ParserImpl, diagnostics, + Context, ParserConfig as Config, ParserImpl, diagnostics, lexer::{Kind, parse_big_int, parse_float, parse_int}, modifiers::Modifiers, }; -impl<'a> ParserImpl<'a> { +impl<'a, C: Config> ParserImpl<'a, C> { pub(crate) fn parse_paren_expression(&mut self) -> Expression<'a> { let opening_span = self.cur_token().span(); self.expect(Kind::LParen); diff --git a/crates/oxc_parser/src/js/function.rs b/crates/oxc_parser/src/js/function.rs index 940014f4a41c6..624537c500f8c 100644 --- a/crates/oxc_parser/src/js/function.rs +++ b/crates/oxc_parser/src/js/function.rs @@ -4,7 +4,7 @@ use oxc_span::{GetSpan, Span}; use super::FunctionKind; use crate::{ - Context, ParserImpl, StatementContext, diagnostics, + Context, ParserConfig as Config, ParserImpl, StatementContext, diagnostics, lexer::Kind, modifiers::{ModifierFlags, ModifierKind, Modifiers}, }; @@ -19,7 +19,7 @@ impl FunctionKind { } } -impl<'a> ParserImpl<'a> { +impl<'a, C: Config> ParserImpl<'a, C> { 
pub(crate) fn at_function_with_async(&mut self) -> bool { self.at(Kind::Function) || self.at(Kind::Async) && { diff --git a/crates/oxc_parser/src/js/grammar.rs b/crates/oxc_parser/src/js/grammar.rs index 0692416a1e366..0a1fa549dbddd 100644 --- a/crates/oxc_parser/src/js/grammar.rs +++ b/crates/oxc_parser/src/js/grammar.rs @@ -3,14 +3,14 @@ use oxc_ast::ast::*; use oxc_span::GetSpan; -use crate::{ParserImpl, diagnostics}; +use crate::{ParserConfig as Config, ParserImpl, diagnostics}; -pub trait CoverGrammar<'a, T>: Sized { - fn cover(value: T, p: &mut ParserImpl<'a>) -> Self; +pub trait CoverGrammar<'a, T, C: Config>: Sized { + fn cover(value: T, p: &mut ParserImpl<'a, C>) -> Self; } -impl<'a> CoverGrammar<'a, Expression<'a>> for AssignmentTarget<'a> { - fn cover(expr: Expression<'a>, p: &mut ParserImpl<'a>) -> Self { +impl<'a, C: Config> CoverGrammar<'a, Expression<'a>, C> for AssignmentTarget<'a> { + fn cover(expr: Expression<'a>, p: &mut ParserImpl<'a, C>) -> Self { match expr { Expression::ArrayExpression(array_expr) => { let pat = ArrayAssignmentTarget::cover(array_expr.unbox(), p); @@ -25,8 +25,8 @@ impl<'a> CoverGrammar<'a, Expression<'a>> for AssignmentTarget<'a> { } } -impl<'a> CoverGrammar<'a, Expression<'a>> for SimpleAssignmentTarget<'a> { - fn cover(expr: Expression<'a>, p: &mut ParserImpl<'a>) -> Self { +impl<'a, C: Config> CoverGrammar<'a, Expression<'a>, C> for SimpleAssignmentTarget<'a> { + fn cover(expr: Expression<'a>, p: &mut ParserImpl<'a, C>) -> Self { match expr { Expression::Identifier(ident) => { SimpleAssignmentTarget::AssignmentTargetIdentifier(ident) @@ -90,8 +90,8 @@ impl<'a> CoverGrammar<'a, Expression<'a>> for SimpleAssignmentTarget<'a> { } } -impl<'a> CoverGrammar<'a, ArrayExpression<'a>> for ArrayAssignmentTarget<'a> { - fn cover(expr: ArrayExpression<'a>, p: &mut ParserImpl<'a>) -> Self { +impl<'a, C: Config> CoverGrammar<'a, ArrayExpression<'a>, C> for ArrayAssignmentTarget<'a> { + fn cover(expr: ArrayExpression<'a>, p: &mut 
ParserImpl<'a, C>) -> Self { let mut elements = p.ast.vec(); let mut rest = None; @@ -136,8 +136,8 @@ impl<'a> CoverGrammar<'a, ArrayExpression<'a>> for ArrayAssignmentTarget<'a> { } } -impl<'a> CoverGrammar<'a, Expression<'a>> for AssignmentTargetMaybeDefault<'a> { - fn cover(expr: Expression<'a>, p: &mut ParserImpl<'a>) -> Self { +impl<'a, C: Config> CoverGrammar<'a, Expression<'a>, C> for AssignmentTargetMaybeDefault<'a> { + fn cover(expr: Expression<'a>, p: &mut ParserImpl<'a, C>) -> Self { match expr { Expression::AssignmentExpression(assignment_expr) => { if assignment_expr.operator != AssignmentOperator::Assign { @@ -156,14 +156,16 @@ impl<'a> CoverGrammar<'a, Expression<'a>> for AssignmentTargetMaybeDefault<'a> { } } -impl<'a> CoverGrammar<'a, AssignmentExpression<'a>> for AssignmentTargetWithDefault<'a> { - fn cover(expr: AssignmentExpression<'a>, p: &mut ParserImpl<'a>) -> Self { +impl<'a, C: Config> CoverGrammar<'a, AssignmentExpression<'a>, C> + for AssignmentTargetWithDefault<'a> +{ + fn cover(expr: AssignmentExpression<'a>, p: &mut ParserImpl<'a, C>) -> Self { p.ast.assignment_target_with_default(expr.span, expr.left, expr.right) } } -impl<'a> CoverGrammar<'a, ObjectExpression<'a>> for ObjectAssignmentTarget<'a> { - fn cover(expr: ObjectExpression<'a>, p: &mut ParserImpl<'a>) -> Self { +impl<'a, C: Config> CoverGrammar<'a, ObjectExpression<'a>, C> for ObjectAssignmentTarget<'a> { + fn cover(expr: ObjectExpression<'a>, p: &mut ParserImpl<'a, C>) -> Self { let mut properties = p.ast.vec(); let mut rest = None; @@ -203,8 +205,8 @@ impl<'a> CoverGrammar<'a, ObjectExpression<'a>> for ObjectAssignmentTarget<'a> { } } -impl<'a> CoverGrammar<'a, ObjectProperty<'a>> for AssignmentTargetProperty<'a> { - fn cover(property: ObjectProperty<'a>, p: &mut ParserImpl<'a>) -> Self { +impl<'a, C: Config> CoverGrammar<'a, ObjectProperty<'a>, C> for AssignmentTargetProperty<'a> { + fn cover(property: ObjectProperty<'a>, p: &mut ParserImpl<'a, C>) -> Self { if 
property.shorthand { let binding = match property.key { PropertyKey::StaticIdentifier(ident) => { diff --git a/crates/oxc_parser/src/js/module.rs b/crates/oxc_parser/src/js/module.rs index 40c68e68d6e33..428b49ba5b08c 100644 --- a/crates/oxc_parser/src/js/module.rs +++ b/crates/oxc_parser/src/js/module.rs @@ -5,7 +5,7 @@ use rustc_hash::FxHashMap; use super::FunctionKind; use crate::{ - ParserImpl, diagnostics, + ParserConfig as Config, ParserImpl, diagnostics, lexer::Kind, modifiers::{Modifier, ModifierFlags, ModifierKind, Modifiers}, }; @@ -26,7 +26,7 @@ enum ImportOrExportSpecifier<'a> { Export(ExportSpecifier<'a>), } -impl<'a> ParserImpl<'a> { +impl<'a, C: Config> ParserImpl<'a, C> { /// [Import Call](https://tc39.es/ecma262/#sec-import-calls) /// `ImportCall` : import ( `AssignmentExpression` ) pub(crate) fn parse_import_expression( diff --git a/crates/oxc_parser/src/js/object.rs b/crates/oxc_parser/src/js/object.rs index 983f39ef3dc61..d5f6da9219081 100644 --- a/crates/oxc_parser/src/js/object.rs +++ b/crates/oxc_parser/src/js/object.rs @@ -3,14 +3,14 @@ use oxc_ast::ast::*; use oxc_syntax::operator::AssignmentOperator; use crate::{ - Context, ParserImpl, diagnostics, + Context, ParserConfig as Config, ParserImpl, diagnostics, lexer::Kind, modifiers::{ModifierFlags, Modifiers}, }; use super::FunctionKind; -impl<'a> ParserImpl<'a> { +impl<'a, C: Config> ParserImpl<'a, C> { /// [Object Expression](https://tc39.es/ecma262/#sec-object-initializer) /// `ObjectLiteral`[Yield, Await] : /// { } diff --git a/crates/oxc_parser/src/js/statement.rs b/crates/oxc_parser/src/js/statement.rs index bcd3c07860970..42b2a4a6bcbda 100644 --- a/crates/oxc_parser/src/js/statement.rs +++ b/crates/oxc_parser/src/js/statement.rs @@ -4,12 +4,12 @@ use oxc_span::{Atom, GetSpan, Span}; use super::{VariableDeclarationParent, grammar::CoverGrammar}; use crate::{ - Context, ParserImpl, StatementContext, diagnostics, + Context, ParserConfig as Config, ParserImpl, StatementContext, 
diagnostics, lexer::Kind, modifiers::{Modifier, ModifierFlags, ModifierKind, Modifiers}, }; -impl<'a> ParserImpl<'a> { +impl<'a, C: Config> ParserImpl<'a, C> { // Section 12 // The InputElementHashbangOrRegExp goal is used at the start of a Script // or Module. diff --git a/crates/oxc_parser/src/jsx/mod.rs b/crates/oxc_parser/src/jsx/mod.rs index da2ab25d520cf..d6ed161b47738 100644 --- a/crates/oxc_parser/src/jsx/mod.rs +++ b/crates/oxc_parser/src/jsx/mod.rs @@ -4,7 +4,7 @@ use oxc_allocator::{Allocator, Box, Dummy, Vec}; use oxc_ast::ast::*; use oxc_span::{Atom, GetSpan, Span}; -use crate::{ParserImpl, diagnostics, lexer::Kind}; +use crate::{ParserConfig as Config, ParserImpl, diagnostics, lexer::Kind}; /// Represents either a closing JSX element or fragment. enum JSXClosing<'a> { @@ -20,7 +20,7 @@ impl<'a> Dummy<'a> for JSXClosing<'a> { } } -impl<'a> ParserImpl<'a> { +impl<'a, C: Config> ParserImpl<'a, C> { pub(crate) fn parse_jsx_expression(&mut self) -> Expression<'a> { let span = self.start_span(); self.bump_any(); // bump `<` diff --git a/crates/oxc_parser/src/lexer/byte_handlers.rs b/crates/oxc_parser/src/lexer/byte_handlers.rs index fa88842794504..ff3efe1668d57 100644 --- a/crates/oxc_parser/src/lexer/byte_handlers.rs +++ b/crates/oxc_parser/src/lexer/byte_handlers.rs @@ -1,50 +1,67 @@ use oxc_data_structures::assert_unchecked; -use crate::diagnostics; +use crate::{ + config::{LexerConfig as Config, NoTokensLexerConfig, RuntimeLexerConfig, TokensLexerConfig}, + diagnostics, +}; use super::{Kind, Lexer}; -impl Lexer<'_> { +impl Lexer<'_, C> { /// Handle next byte of source. /// /// # SAFETY /// /// * Lexer must not be at end of file. /// * `byte` must be next byte of source code, corresponding to current position of `lexer.source`. - /// * Only `BYTE_HANDLERS` for ASCII characters may use the `ascii_byte_handler!()` macro. + /// * Only byte handlers for ASCII characters may use the `ascii_byte_handler!()` macro. 
// `#[inline(always)]` to ensure is inlined into `read_next_token` #[expect(clippy::inline_always)] #[inline(always)] pub(super) unsafe fn handle_byte(&mut self, byte: u8) -> Kind { + let byte_handlers = self.config.byte_handlers(); // SAFETY: Caller guarantees to uphold safety invariants - unsafe { BYTE_HANDLERS[byte as usize](self) } + unsafe { byte_handlers[byte as usize](self) } } } -type ByteHandler = unsafe fn(&mut Lexer<'_>) -> Kind; +pub type ByteHandler = unsafe fn(&mut Lexer<'_, C>) -> Kind; +pub type ByteHandlers = [ByteHandler; 256]; -/// Lookup table mapping any incoming byte to a handler function defined below. +/// Macro to create a lookup table mapping any incoming byte to a handler function defined below. /// #[rustfmt::skip] -static BYTE_HANDLERS: [ByteHandler; 256] = [ -// 0 1 2 3 4 5 6 7 8 9 A B C D E F // - ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, SPS, LIN, ISP, ISP, LIN, ERR, ERR, // 0 - ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, // 1 - SPS, EXL, QOD, HAS, IDT, PRC, AMP, QOS, PNO, PNC, ATR, PLS, COM, MIN, PRD, SLH, // 2 - ZER, DIG, DIG, DIG, DIG, DIG, DIG, DIG, DIG, DIG, COL, SEM, LSS, EQL, GTR, QST, // 3 - AT_, IDT, IDT, IDT, IDT, IDT, IDT, IDT, IDT, IDT, IDT, IDT, IDT, IDT, IDT, IDT, // 4 - IDT, IDT, IDT, IDT, IDT, IDT, IDT, IDT, IDT, IDT, IDT, BTO, ESC, BTC, CRT, IDT, // 5 - TPL, L_A, L_B, L_C, L_D, L_E, L_F, L_G, IDT, L_I, IDT, L_K, L_L, L_M, L_N, L_O, // 6 - L_P, IDT, L_R, L_S, L_T, L_U, L_V, L_W, IDT, L_Y, IDT, BEO, PIP, BEC, TLD, ERR, // 7 - UER, UER, UER, UER, UER, UER, UER, UER, UER, UER, UER, UER, UER, UER, UER, UER, // 8 - UER, UER, UER, UER, UER, UER, UER, UER, UER, UER, UER, UER, UER, UER, UER, UER, // 9 - UER, UER, UER, UER, UER, UER, UER, UER, UER, UER, UER, UER, UER, UER, UER, UER, // A - UER, UER, UER, UER, UER, UER, UER, UER, UER, UER, UER, UER, UER, UER, UER, UER, // B - UER, UER, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, // C - UNI, UNI, UNI, UNI, UNI, 
UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, // D - UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, // E - UNI, UNI, UNI, UNI, UNI, UER, UER, UER, UER, UER, UER, UER, UER, UER, UER, UER, // F -]; +macro_rules! byte_handlers { + () => { + [ + // 0 1 2 3 4 5 6 7 8 9 A B C D E F // + ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, SPS, LIN, ISP, ISP, LIN, ERR, ERR, // 0 + ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, // 1 + SPS, EXL, QOD, HAS, IDT, PRC, AMP, QOS, PNO, PNC, ATR, PLS, COM, MIN, PRD, SLH, // 2 + ZER, DIG, DIG, DIG, DIG, DIG, DIG, DIG, DIG, DIG, COL, SEM, LSS, EQL, GTR, QST, // 3 + AT_, IDT, IDT, IDT, IDT, IDT, IDT, IDT, IDT, IDT, IDT, IDT, IDT, IDT, IDT, IDT, // 4 + IDT, IDT, IDT, IDT, IDT, IDT, IDT, IDT, IDT, IDT, IDT, BTO, ESC, BTC, CRT, IDT, // 5 + TPL, L_A, L_B, L_C, L_D, L_E, L_F, L_G, IDT, L_I, IDT, L_K, L_L, L_M, L_N, L_O, // 6 + L_P, IDT, L_R, L_S, L_T, L_U, L_V, L_W, IDT, L_Y, IDT, BEO, PIP, BEC, TLD, ERR, // 7 + UER, UER, UER, UER, UER, UER, UER, UER, UER, UER, UER, UER, UER, UER, UER, UER, // 8 + UER, UER, UER, UER, UER, UER, UER, UER, UER, UER, UER, UER, UER, UER, UER, UER, // 9 + UER, UER, UER, UER, UER, UER, UER, UER, UER, UER, UER, UER, UER, UER, UER, UER, // A + UER, UER, UER, UER, UER, UER, UER, UER, UER, UER, UER, UER, UER, UER, UER, UER, // B + UER, UER, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, // C + UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, // D + UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, // E + UNI, UNI, UNI, UNI, UNI, UER, UER, UER, UER, UER, UER, UER, UER, UER, UER, UER, // F + ] + }; +} + +pub mod byte_handler_tables { + use super::*; + + pub static NO_TOKENS: ByteHandlers = byte_handlers!(); + pub static WITH_TOKENS: ByteHandlers = byte_handlers!(); + pub static RUNTIME_TOKENS: ByteHandlers = byte_handlers!(); +} /// Macro for defining byte handler for 
an ASCII character. /// @@ -55,7 +72,7 @@ static BYTE_HANDLERS: [ByteHandler; 256] = [ /// next char is ASCII, and it uses that information to optimize the rest of the handler. /// e.g. `lexer.consume_char()` becomes just a single assembly instruction. /// Without the assertions, the compiler is unable to deduce the next char is ASCII, due to -/// the indirection of the `BYTE_HANDLERS` jump table. +/// the indirection of the byte handlers jump table. /// /// These assertions are unchecked (i.e. won't panic) and will cause UB if they're incorrect. /// @@ -73,7 +90,7 @@ static BYTE_HANDLERS: [ByteHandler; 256] = [ /// /// ``` /// #[expect(non_snake_case)] -/// fn SPS(lexer: &mut Lexer) { +/// fn SPS(lexer: &mut Lexer<'_, C>) -> Kind { /// // SAFETY: This macro is only used for ASCII characters /// unsafe { /// assert_unchecked!(!lexer.source.is_eof()); @@ -88,7 +105,7 @@ static BYTE_HANDLERS: [ByteHandler; 256] = [ macro_rules! ascii_byte_handler { ($id:ident($lex:ident) $body:expr) => { #[expect(non_snake_case)] - fn $id($lex: &mut Lexer) -> Kind { + fn $id($lex: &mut Lexer<'_, C>) -> Kind { // SAFETY: This macro is only used for ASCII characters unsafe { assert_unchecked!(!$lex.source.is_eof()); @@ -123,7 +140,7 @@ macro_rules! ascii_byte_handler { /// /// ``` /// #[expect(non_snake_case)] -/// fn L_G(lexer: &mut Lexer) -> Kind { +/// fn L_G(lexer: &mut Lexer<'_, C>) -> Kind { /// // SAFETY: This macro is only used for ASCII characters /// let id_without_first_char = unsafe { lexer.identifier_name_handler() }; /// match id_without_first_char { @@ -136,7 +153,7 @@ macro_rules! ascii_byte_handler { macro_rules! 
ascii_identifier_handler { ($id:ident($str:ident) $body:expr) => { #[expect(non_snake_case)] - fn $id(lexer: &mut Lexer) -> Kind { + fn $id(lexer: &mut Lexer<'_, C>) -> Kind { // SAFETY: This macro is only used for ASCII characters let $str = unsafe { lexer.identifier_name_handler() }; $body @@ -653,7 +670,7 @@ ascii_identifier_handler!(L_Y(id_without_first_char) match id_without_first_char // // Note: Must not use `ascii_byte_handler!` macro, as this handler is for non-ASCII chars. #[expect(non_snake_case)] -fn UNI(lexer: &mut Lexer) -> Kind { +fn UNI(lexer: &mut Lexer<'_, C>) -> Kind { lexer.unicode_char_handler() } @@ -665,6 +682,6 @@ fn UNI(lexer: &mut Lexer) -> Kind { // // Note: Must not use `ascii_byte_handler!` macro, as this handler is for non-ASCII bytes. #[expect(non_snake_case)] -fn UER(_lexer: &mut Lexer) -> Kind { +fn UER(_lexer: &mut Lexer<'_, C>) -> Kind { unreachable!(); } diff --git a/crates/oxc_parser/src/lexer/comment.rs b/crates/oxc_parser/src/lexer/comment.rs index aad0e5a1d503e..6cd49f47eb6a7 100644 --- a/crates/oxc_parser/src/lexer/comment.rs +++ b/crates/oxc_parser/src/lexer/comment.rs @@ -3,7 +3,7 @@ use memchr::memmem::Finder; use oxc_ast::CommentKind; use oxc_syntax::line_terminator::is_line_terminator; -use crate::diagnostics; +use crate::{config::LexerConfig as Config, diagnostics}; use super::{ Kind, Lexer, cold_branch, @@ -22,7 +22,7 @@ static LINE_BREAK_TABLE: SafeByteMatchTable = static MULTILINE_COMMENT_START_TABLE: SafeByteMatchTable = safe_byte_match_table!(|b| matches!(b, b'*' | b'\r' | b'\n' | LS_OR_PS_FIRST)); -impl<'a> Lexer<'a> { +impl<'a, C: Config> Lexer<'a, C> { /// Section 12.4 Single Line Comment pub(super) fn skip_single_line_comment(&mut self) -> Kind { byte_search! 
{ diff --git a/crates/oxc_parser/src/lexer/identifier.rs b/crates/oxc_parser/src/lexer/identifier.rs index c08957cddede4..6c1149718bae1 100644 --- a/crates/oxc_parser/src/lexer/identifier.rs +++ b/crates/oxc_parser/src/lexer/identifier.rs @@ -6,7 +6,7 @@ use oxc_syntax::identifier::{ is_identifier_part, is_identifier_part_unicode, is_identifier_start_unicode, }; -use crate::diagnostics; +use crate::{config::LexerConfig as Config, diagnostics}; use super::{ Kind, Lexer, SourcePosition, cold_branch, @@ -26,7 +26,7 @@ fn is_identifier_start_ascii_byte(byte: u8) -> bool { ASCII_ID_START_TABLE.matches(byte) } -impl<'a> Lexer<'a> { +impl<'a, C: Config> Lexer<'a, C> { /// Handle identifier with ASCII start character. /// Returns text of the identifier, minus its first char. /// diff --git a/crates/oxc_parser/src/lexer/jsx.rs b/crates/oxc_parser/src/lexer/jsx.rs index db828ceb3f08b..eb2f08b58a3b1 100644 --- a/crates/oxc_parser/src/lexer/jsx.rs +++ b/crates/oxc_parser/src/lexer/jsx.rs @@ -3,7 +3,7 @@ use memchr::memchr; use oxc_span::Span; use oxc_syntax::identifier::is_identifier_part; -use crate::diagnostics; +use crate::{config::LexerConfig as Config, diagnostics}; use super::{ Kind, Lexer, Token, cold_branch, @@ -26,7 +26,7 @@ static JSX_CHILD_END_TABLE: SafeByteMatchTable = /// `JSXStringCharacter` but not ' /// `JSXStringCharacter` :: /// `SourceCharacter` but not one of `HTMLCharacterReference` -impl Lexer<'_> { +impl Lexer<'_, C> { /// Read JSX string literal. /// # SAFETY /// * `delimiter` must be an ASCII character. 
diff --git a/crates/oxc_parser/src/lexer/mod.rs b/crates/oxc_parser/src/lexer/mod.rs index f963a3518fef5..0f56ad7d16706 100644 --- a/crates/oxc_parser/src/lexer/mod.rs +++ b/crates/oxc_parser/src/lexer/mod.rs @@ -12,7 +12,7 @@ use oxc_ast::ast::RegExpFlags; use oxc_diagnostics::OxcDiagnostic; use oxc_span::{SourceType, Span}; -use crate::{UniquePromise, diagnostics}; +use crate::{UniquePromise, config::LexerConfig as Config, diagnostics}; mod byte_handlers; mod comment; @@ -33,6 +33,7 @@ mod typescript; mod unicode; mod whitespace; +pub(crate) use byte_handlers::{ByteHandler, ByteHandlers, byte_handler_tables}; pub use kind::Kind; pub use number::{parse_big_int, parse_float, parse_int}; pub use token::Token; @@ -64,7 +65,7 @@ pub enum LexerContext { JsxAttributeValue, } -pub struct Lexer<'a> { +pub struct Lexer<'a, C: Config> { allocator: &'a Allocator, // Wrapper around source text. Must not be changed after initialization. @@ -100,11 +101,11 @@ pub struct Lexer<'a> { /// Collected tokens in source order. tokens: ArenaVec<'a, Token>, - /// Whether to collect tokens. - collect_tokens: bool, + /// Config + pub(crate) config: C, } -impl<'a> Lexer<'a> { +impl<'a, C: Config> Lexer<'a, C> { /// Create new `Lexer`. 
/// /// Requiring a `UniquePromise` to be provided guarantees only 1 `Lexer` can exist @@ -113,7 +114,7 @@ impl<'a> Lexer<'a> { allocator: &'a Allocator, source_text: &'a str, source_type: SourceType, - collect_tokens: bool, + config: C, unique: UniquePromise, ) -> Self { let source = Source::new(source_text, unique); @@ -133,7 +134,7 @@ impl<'a> Lexer<'a> { escaped_templates: FxHashMap::default(), multi_line_comment_end_finder: None, tokens: ArenaVec::new_in(allocator), - collect_tokens, + config, } } @@ -144,9 +145,10 @@ impl<'a> Lexer<'a> { allocator: &'a Allocator, source_text: &'a str, source_type: SourceType, + config: C, ) -> Self { let unique = UniquePromise::new_for_tests_and_benchmarks(); - Self::new(allocator, source_text, source_type, false, unique) + Self::new(allocator, source_text, source_type, config, unique) } /// Get errors. @@ -268,7 +270,7 @@ impl<'a> Lexer<'a> { self.token.set_kind(kind); self.token.set_end(self.offset()); let token = self.token; - if self.collect_tokens && !matches!(token.kind(), Kind::Eof | Kind::HashbangComment) { + if self.config.tokens() && !matches!(token.kind(), Kind::Eof | Kind::HashbangComment) { if REPLACE_SAME_START { debug_assert!(self.tokens.last().is_some_and(|last| last.start() == token.start())); let last = self.tokens.last_mut().unwrap(); diff --git a/crates/oxc_parser/src/lexer/numeric.rs b/crates/oxc_parser/src/lexer/numeric.rs index 1c2bedd66cb31..351b8b406d599 100644 --- a/crates/oxc_parser/src/lexer/numeric.rs +++ b/crates/oxc_parser/src/lexer/numeric.rs @@ -1,10 +1,10 @@ use oxc_syntax::identifier::{is_identifier_part_ascii, is_identifier_start}; -use crate::diagnostics; +use crate::{config::LexerConfig as Config, diagnostics}; use super::{Kind, Lexer, Span}; -impl Lexer<'_> { +impl<C: Config> Lexer<'_, C> { /// 12.9.3 Numeric Literals with `0` prefix pub(super) fn read_zero(&mut self) -> Kind { match self.peek_byte() { diff --git a/crates/oxc_parser/src/lexer/punctuation.rs
b/crates/oxc_parser/src/lexer/punctuation.rs index a6586edc7d30c..99033fb3cd3f5 100644 --- a/crates/oxc_parser/src/lexer/punctuation.rs +++ b/crates/oxc_parser/src/lexer/punctuation.rs @@ -1,9 +1,10 @@ use oxc_span::Span; +use crate::{config::LexerConfig as Config, diagnostics}; + use super::{Kind, Lexer, Token}; -use crate::diagnostics; -impl Lexer<'_> { +impl<C: Config> Lexer<'_, C> { /// Section 12.8 Punctuators pub(super) fn read_dot(&mut self) -> Kind { if self.peek_2_bytes() == Some([b'.', b'.']) { diff --git a/crates/oxc_parser/src/lexer/regex.rs b/crates/oxc_parser/src/lexer/regex.rs index 2e19bad160cda..b0e5cb7f51b62 100644 --- a/crates/oxc_parser/src/lexer/regex.rs +++ b/crates/oxc_parser/src/lexer/regex.rs @@ -1,10 +1,10 @@ use oxc_syntax::line_terminator::is_line_terminator; -use crate::diagnostics; +use crate::{config::LexerConfig as Config, diagnostics}; use super::{Kind, Lexer, RegExpFlags, Token}; -impl Lexer<'_> { +impl<C: Config> Lexer<'_, C> { /// Re-tokenize the current `/` or `/=` and return `RegExp` /// See Section 12: /// The `InputElementRegExp` goal symbol is used in all syntactic grammar contexts diff --git a/crates/oxc_parser/src/lexer/string.rs b/crates/oxc_parser/src/lexer/string.rs index 74a8b7aed16bb..cf4b7c37d9f05 100644 --- a/crates/oxc_parser/src/lexer/string.rs +++ b/crates/oxc_parser/src/lexer/string.rs @@ -2,7 +2,7 @@ use std::cmp::max; use oxc_allocator::StringBuilder; -use crate::diagnostics; +use crate::{config::LexerConfig as Config, diagnostics}; use super::{ Kind, Lexer, LexerContext, Span, Token, cold_branch, @@ -202,7 +202,7 @@ macro_rules! handle_string_literal_escape { } /// 12.9.4 String Literals -impl<'a> Lexer<'a> { +impl<'a, C: Config> Lexer<'a, C> { /// Read string literal delimited with `"`. /// # SAFETY /// Next character must be `"`.
diff --git a/crates/oxc_parser/src/lexer/template.rs b/crates/oxc_parser/src/lexer/template.rs index b8cc64edcb7ec..0deee04066fc7 100644 --- a/crates/oxc_parser/src/lexer/template.rs +++ b/crates/oxc_parser/src/lexer/template.rs @@ -2,7 +2,7 @@ use std::{cmp::max, str}; use oxc_allocator::StringBuilder; -use crate::diagnostics; +use crate::{config::LexerConfig as Config, diagnostics}; use super::{ Kind, Lexer, SourcePosition, Token, cold_branch, @@ -33,7 +33,7 @@ static TEMPLATE_LITERAL_ESCAPED_MATCH_TABLE: SafeByteMatchTable = safe_byte_matc ); /// 12.8.6 Template Literal Lexical Components -impl<'a> Lexer<'a> { +impl<'a, C: Config> Lexer<'a, C> { /// Read template literal component. /// /// This function handles the common case where template contains no escapes or `\r` characters @@ -409,6 +409,8 @@ mod test { use oxc_allocator::Allocator; use oxc_span::SourceType; + use crate::config::NoTokensLexerConfig; + use super::super::{Kind, Lexer, UniquePromise}; #[test] @@ -442,8 +444,13 @@ mod test { fn run_test(source_text: String, expected_escaped: String, is_only_part: bool) { let allocator = Allocator::default(); let unique = UniquePromise::new_for_tests_and_benchmarks(); - let mut lexer = - Lexer::new(&allocator, &source_text, SourceType::default(), false, unique); + let mut lexer = Lexer::new( + &allocator, + &source_text, + SourceType::default(), + NoTokensLexerConfig, + unique, + ); let token = lexer.next_token(); assert_eq!( token.kind(), diff --git a/crates/oxc_parser/src/lexer/typescript.rs b/crates/oxc_parser/src/lexer/typescript.rs index a04731ce08875..7595964f1b07c 100644 --- a/crates/oxc_parser/src/lexer/typescript.rs +++ b/crates/oxc_parser/src/lexer/typescript.rs @@ -1,6 +1,8 @@ +use crate::config::LexerConfig as Config; + use super::{Kind, Lexer, Token}; -impl Lexer<'_> { +impl<C: Config> Lexer<'_, C> { /// Re-tokenize '<<' or '<=' or '<<=' to '<' pub(crate) fn re_lex_as_typescript_l_angle(&mut self, offset: u32) -> Token { self.token.set_start(self.offset() -
offset); diff --git a/crates/oxc_parser/src/lexer/unicode.rs b/crates/oxc_parser/src/lexer/unicode.rs index 15894b5f290dc..dabf07ac04910 100644 --- a/crates/oxc_parser/src/lexer/unicode.rs +++ b/crates/oxc_parser/src/lexer/unicode.rs @@ -2,7 +2,7 @@ use std::{borrow::Cow, fmt::Write}; use cow_utils::CowUtils; -use crate::diagnostics; +use crate::{config::LexerConfig as Config, diagnostics}; use oxc_allocator::StringBuilder; use oxc_syntax::{ identifier::{ @@ -29,7 +29,7 @@ enum UnicodeEscape { LoneSurrogate(u32), } -impl<'a> Lexer<'a> { +impl<'a, C: Config> Lexer<'a, C> { pub(super) fn unicode_char_handler(&mut self) -> Kind { let c = self.peek_char().unwrap(); match c { diff --git a/crates/oxc_parser/src/lexer/whitespace.rs b/crates/oxc_parser/src/lexer/whitespace.rs index a770362a37b18..bc73e88580d19 100644 --- a/crates/oxc_parser/src/lexer/whitespace.rs +++ b/crates/oxc_parser/src/lexer/whitespace.rs @@ -1,3 +1,5 @@ +use crate::config::LexerConfig as Config; + use super::{ Kind, Lexer, search::{SafeByteMatchTable, byte_search, safe_byte_match_table}, @@ -6,7 +8,7 @@ use super::{ static NOT_REGULAR_WHITESPACE_OR_LINE_BREAK_TABLE: SafeByteMatchTable = safe_byte_match_table!(|b| !matches!(b, b' ' | b'\t' | b'\r' | b'\n')); -impl Lexer<'_> { +impl<C: Config> Lexer<'_, C> { pub(super) fn line_break_handler(&mut self) -> Kind { self.token.set_is_on_new_line(true); self.trivia_builder.handle_newline(); diff --git a/crates/oxc_parser/src/lib.rs b/crates/oxc_parser/src/lib.rs index 9b8b91fa7317e..320ee6b6725ae 100644 --- a/crates/oxc_parser/src/lib.rs +++ b/crates/oxc_parser/src/lib.rs @@ -64,6 +64,7 @@ //! //!
See [full linter example](https://github.com/Boshen/oxc/blob/ab2ef4f89ba3ca50c68abb2ca43e36b7793f3673/crates/oxc_linter/examples/linter.rs#L38-L39) +pub mod config; mod context; mod cursor; mod error_handler; @@ -95,6 +96,7 @@ use oxc_syntax::module_record::ModuleRecord; pub use crate::lexer::{Kind, Token}; use crate::{ + config::{LexerConfig, NoTokensParserConfig, ParserConfig}, context::{Context, StatementContext}, error_handler::FatalError, lexer::Lexer, @@ -169,7 +171,7 @@ pub struct ParserReturn<'a> { /// Lexed tokens in source order. /// - /// Tokens are only collected when [`ParseOptions::collect_tokens`] is enabled. + /// Tokens are only collected when tokens are enabled in [`ParserConfig`]. pub tokens: oxc_allocator::Vec<'a, Token>, /// Whether the parser panicked and terminated early. @@ -225,11 +227,6 @@ pub struct ParseOptions { /// /// [`V8IntrinsicExpression`]: oxc_ast::ast::V8IntrinsicExpression pub allow_v8_intrinsics: bool, - - /// Collect lexer tokens and return them in [`ParserReturn::tokens`]. - /// - /// Default: `false` - pub collect_tokens: bool, } impl Default for ParseOptions { @@ -240,7 +237,6 @@ impl Default for ParseOptions { allow_return_outside_function: false, preserve_parens: true, allow_v8_intrinsics: false, - collect_tokens: false, } } } @@ -248,11 +244,12 @@ impl Default for ParseOptions { /// Recursive Descent Parser for ECMAScript and TypeScript /// /// See [`Parser::parse`] for entry function. -pub struct Parser<'a> { +pub struct Parser<'a, C: ParserConfig = NoTokensParserConfig> { allocator: &'a Allocator, source_text: &'a str, source_type: SourceType, options: ParseOptions, + config: C, } impl<'a> Parser<'a> { @@ -264,15 +261,31 @@ impl<'a> Parser<'a> { /// - `source_type`: Source type (e.g. 
JavaScript, TypeScript, JSX, ESM Module, Script) pub fn new(allocator: &'a Allocator, source_text: &'a str, source_type: SourceType) -> Self { let options = ParseOptions::default(); - Self { allocator, source_text, source_type, options } + Self { allocator, source_text, source_type, options, config: NoTokensParserConfig } } +} +impl<'a, C: ParserConfig> Parser<'a, C> { /// Set parse options #[must_use] pub fn with_options(mut self, options: ParseOptions) -> Self { self.options = options; self } + + /// Set parser config. + /// + /// See [`ParserConfig`] for more details. + #[must_use] + pub fn with_config<Config: ParserConfig>(self, config: Config) -> Parser<'a, Config> { + Parser { + allocator: self.allocator, + source_text: self.source_text, + source_type: self.source_type, + options: self.options, + config, + } + } } mod parser_parse { @@ -309,7 +322,7 @@ mod parser_parse { } } - impl<'a> Parser<'a> { + impl<'a, C: ParserConfig> Parser<'a, C> { /// Main entry point /// /// Returns an empty `Program` on unrecoverable error, @@ -323,6 +336,7 @@ mod parser_parse { self.source_text, self.source_type, self.options, + self.config, unique, ); parser.parse() @@ -354,6 +368,7 @@ mod parser_parse { self.source_text, self.source_type, self.options, + self.config, unique, ); parser.parse_expression() @@ -364,10 +379,11 @@ use parser_parse::UniquePromise; /// Implementation of parser. /// `Parser` is just a public wrapper, the guts of the implementation is in this type. -struct ParserImpl<'a> { +struct ParserImpl<'a, C: ParserConfig> { + /// Options options: ParseOptions, - pub(crate) lexer: Lexer<'a>, + pub(crate) lexer: Lexer<'a, C::LexerConfig>, /// SourceType: JavaScript or TypeScript, Script or Module, jsx support? source_type: SourceType, @@ -410,22 +426,24 @@ struct ParserImpl<'a> { is_ts: bool, } -impl<'a> ParserImpl<'a> { +impl<'a, C: ParserConfig> ParserImpl<'a, C> { /// Create a new `ParserImpl`.
/// /// Requiring a `UniquePromise` to be provided guarantees only 1 `ParserImpl` can exist /// on a single thread at one time. #[inline] + #[expect(clippy::needless_pass_by_value)] pub fn new( allocator: &'a Allocator, source_text: &'a str, source_type: SourceType, options: ParseOptions, + config: C, unique: UniquePromise, ) -> Self { Self { options, - lexer: Lexer::new(allocator, source_text, source_type, options.collect_tokens, unique), + lexer: Lexer::new(allocator, source_text, source_type, config.lexer_config(), unique), source_type, source_text, errors: vec![], @@ -580,7 +598,7 @@ impl<'a> ParserImpl<'a> { // Token stream is already complete from the first parse. // Reparsing here is only to patch AST nodes, so keep the original token stream. let original_tokens = - if self.options.collect_tokens { Some(self.lexer.take_tokens()) } else { None }; + if self.lexer.config.tokens() { Some(self.lexer.take_tokens()) } else { None }; let checkpoints = std::mem::take(&mut self.state.potential_await_reparse); for (stmt_index, checkpoint) in checkpoints { diff --git a/crates/oxc_parser/src/modifiers.rs b/crates/oxc_parser/src/modifiers.rs index fa29b164b3345..bbeec83d0a236 100644 --- a/crates/oxc_parser/src/modifiers.rs +++ b/crates/oxc_parser/src/modifiers.rs @@ -8,7 +8,7 @@ use oxc_diagnostics::OxcDiagnostic; use oxc_span::Span; use crate::{ - ParserImpl, diagnostics, + ParserConfig as Config, ParserImpl, diagnostics, lexer::{Kind, Token}, }; @@ -313,7 +313,7 @@ impl std::fmt::Display for ModifierKind { } } -impl<'a> ParserImpl<'a> { +impl<'a, C: Config> ParserImpl<'a, C> { pub(crate) fn eat_modifiers_before_declaration(&mut self) -> Modifiers<'a> { if !self.at_modifier() { return Modifiers::empty(); @@ -624,8 +624,8 @@ impl<'a> ParserImpl<'a> { // Also `#[inline(never)]` to help `verify_modifiers` to get inlined. 
#[cold] #[inline(never)] - fn report<'a, F>( - parser: &mut ParserImpl<'a>, + fn report<'a, C: Config, F>( + parser: &mut ParserImpl<'a, C>, modifiers: &Modifiers<'a>, allowed: ModifierFlags, strict: bool, diff --git a/crates/oxc_parser/src/ts/statement.rs b/crates/oxc_parser/src/ts/statement.rs index b9c9e549d981f..0629d7145d271 100644 --- a/crates/oxc_parser/src/ts/statement.rs +++ b/crates/oxc_parser/src/ts/statement.rs @@ -3,7 +3,7 @@ use oxc_ast::ast::*; use oxc_span::{FileExtension, GetSpan}; use crate::{ - Context, ParserImpl, diagnostics, + Context, ParserConfig as Config, ParserImpl, diagnostics, js::{FunctionKind, VariableDeclarationParent}, lexer::Kind, modifiers::{ModifierFlags, ModifierKind, Modifiers}, @@ -15,7 +15,7 @@ pub(super) enum CallOrConstructorSignature { Constructor, } -impl<'a> ParserImpl<'a> { +impl<'a, C: Config> ParserImpl<'a, C> { /* ------------------- Enum ------------------ */ /// `https://www.typescriptlang.org/docs/handbook/enums.html` pub(crate) fn parse_ts_enum_declaration( diff --git a/crates/oxc_parser/src/ts/types.rs b/crates/oxc_parser/src/ts/types.rs index d7ee5e40148b5..cc9c560cc5967 100644 --- a/crates/oxc_parser/src/ts/types.rs +++ b/crates/oxc_parser/src/ts/types.rs @@ -4,14 +4,14 @@ use oxc_span::GetSpan; use oxc_syntax::operator::UnaryOperator; use crate::{ - Context, ParserImpl, diagnostics, + Context, ParserConfig as Config, ParserImpl, diagnostics, lexer::Kind, modifiers::{ModifierFlags, ModifierKind, Modifiers}, }; use super::{super::js::FunctionKind, statement::CallOrConstructorSignature}; -impl<'a> ParserImpl<'a> { +impl<'a, C: Config> ParserImpl<'a, C> { pub(crate) fn parse_ts_type(&mut self) -> TSType<'a> { if self.is_start_of_function_type_or_constructor_type() { return self.parse_function_or_constructor_type(); diff --git a/napi/playground/src/lib.rs b/napi/playground/src/lib.rs index 35209f60c29dd..260236f3158f4 100644 --- a/napi/playground/src/lib.rs +++ b/napi/playground/src/lib.rs @@ -222,7 +222,6 @@ impl 
Oxc { allow_return_outside_function: parser_options.allow_return_outside_function, preserve_parens: parser_options.preserve_parens, allow_v8_intrinsics: parser_options.allow_v8_intrinsics, - collect_tokens: false, }; let ParserReturn { program, errors, module_record, .. } = Parser::new(allocator, source_text, source_type).with_options(parser_options).parse(); diff --git a/tasks/benchmark/benches/lexer.rs b/tasks/benchmark/benches/lexer.rs index ed85f2922d972..c32befeaa51b4 100644 --- a/tasks/benchmark/benches/lexer.rs +++ b/tasks/benchmark/benches/lexer.rs @@ -8,6 +8,7 @@ use oxc_ast_visit::Visit; use oxc_benchmark::{BenchmarkId, Criterion, criterion_group, criterion_main}; use oxc_parser::{ Parser, + config::{LexerConfig, NoTokensLexerConfig}, lexer::{Kind, Lexer}, }; use oxc_span::SourceType; @@ -51,7 +52,7 @@ fn bench_lexer(criterion: &mut Criterion) { // so we do the same here. let mut allocator = Allocator::default(); b.iter(|| { - lex_whole_file(&allocator, source_text, source_type); + lex_whole_file(&allocator, source_text, source_type, NoTokensLexerConfig); allocator.reset(); }); }); @@ -66,12 +67,13 @@ criterion_main!(lexer); // It's also used in `SourceCleaner` below. #[expect(clippy::inline_always)] #[inline(always)] -fn lex_whole_file<'a>( +fn lex_whole_file<'a, C: LexerConfig>( allocator: &'a Allocator, source_text: &'a str, source_type: SourceType, -) -> Lexer<'a> { - let mut lexer = Lexer::new_for_benchmarks(allocator, source_text, source_type); + config: C, +) -> Lexer<'a, C> { + let mut lexer = Lexer::new_for_benchmarks(allocator, source_text, source_type, config); if lexer.first_token().kind() != Kind::Eof { // Use `next_token_for_benchmarks` instead of `next_token`, to work around problem // where `next_token` wasn't inlined here. 
@@ -119,7 +121,7 @@ fn clean<'a>(source_text: &'a str, source_type: SourceType, allocator: &'a Alloc clean_source_text.push_str(&source_text[last_index..]); // Check lexer can lex it without any errors - let lexer = lex_whole_file(allocator, &clean_source_text, source_type); + let lexer = lex_whole_file(allocator, &clean_source_text, source_type, NoTokensLexerConfig); assert!(lexer.errors().is_empty()); clean_source_text diff --git a/tasks/coverage/src/tools.rs b/tasks/coverage/src/tools.rs index d5fe0dd277eb2..370d6baf9db4b 100644 --- a/tasks/coverage/src/tools.rs +++ b/tasks/coverage/src/tools.rs @@ -7,7 +7,7 @@ use oxc::{ ast_visit::utf8_to_utf16::Utf8ToUtf16, diagnostics::{GraphicalReportHandler, GraphicalTheme, NamedSource, OxcDiagnostic}, minifier::CompressOptions, - parser::{ParseOptions, Parser, ParserReturn}, + parser::{ParseOptions, Parser, ParserReturn, config::RuntimeParserConfig}, span::{ModuleKind, SourceType, Span}, transformer::{JsxOptions, JsxRuntime, TransformOptions}, }; @@ -838,8 +838,9 @@ pub fn run_estree_test262_tokens(files: &[Test262File]) -> Vec { let is_module = f.meta.flags.contains(&TestFlag::Module); let source_type = SourceType::script().with_module(is_module); let allocator = Allocator::new(); - let options = ParseOptions { collect_tokens: true, ..ParseOptions::default() }; - let ret = Parser::new(&allocator, &f.code, source_type).with_options(options).parse(); + let ret = Parser::new(&allocator, &f.code, source_type) + .with_config(RuntimeParserConfig::new(true)) + .parse(); if ret.panicked || !ret.errors.is_empty() { let error = @@ -887,7 +888,7 @@ pub fn run_estree_acorn_jsx_tokens(files: &[AcornJsxFile]) -> Vec Vec