diff --git a/apps/oxlint/src/js_plugins/parse.rs b/apps/oxlint/src/js_plugins/parse.rs index f1281d1af47c6..3624aa887afff 100644 --- a/apps/oxlint/src/js_plugins/parse.rs +++ b/apps/oxlint/src/js_plugins/parse.rs @@ -162,6 +162,7 @@ unsafe fn parse_raw_impl( .with_options(ParseOptions { parse_regular_expression: true, allow_return_outside_function: true, + collect_tokens: true, ..ParseOptions::default() }) .parse(); diff --git a/crates/oxc_formatter/src/service/mod.rs b/crates/oxc_formatter/src/service/mod.rs index 2ade92da4fccd..010ca3527a65e 100644 --- a/crates/oxc_formatter/src/service/mod.rs +++ b/crates/oxc_formatter/src/service/mod.rs @@ -11,6 +11,7 @@ pub fn get_parse_options() -> ParseOptions { allow_v8_intrinsics: true, // `oxc_formatter` expects this to be `false`, otherwise panics preserve_parens: false, + collect_tokens: false, } } diff --git a/crates/oxc_parser/src/lexer/mod.rs b/crates/oxc_parser/src/lexer/mod.rs index ce91d1b015dbb..6887ec8afd0a6 100644 --- a/crates/oxc_parser/src/lexer/mod.rs +++ b/crates/oxc_parser/src/lexer/mod.rs @@ -6,8 +6,9 @@ //! * [v8](https://v8.dev/blog/scanner) use rustc_hash::FxHashMap; +use std::fmt::Debug; -use oxc_allocator::Allocator; +use oxc_allocator::{Allocator, Vec as ArenaVec}; use oxc_ast::ast::RegExpFlags; use oxc_diagnostics::OxcDiagnostic; use oxc_span::{SourceType, Span}; @@ -45,6 +46,7 @@ pub struct LexerCheckpoint<'a> { source_position: SourcePosition<'a>, token: Token, errors_snapshot: ErrorSnapshot, + tokens_len: usize, } #[derive(Debug, Clone)] @@ -86,6 +88,11 @@ pub struct Lexer<'a> { /// `memchr` Finder for end of multi-line comments. Created lazily when first used. multi_line_comment_end_finder: Option<memchr::memmem::Finder<'static>>, + + /// Tokens collected from the lexer. + /// + /// If `collect_tokens` is `false`, this will be `None`.
+ tokens: Option<ArenaVec<'a, Token>>, } impl<'a> Lexer<'a> { @@ -97,6 +104,7 @@ impl<'a> Lexer<'a> { allocator: &'a Allocator, source_text: &'a str, source_type: SourceType, + collect_tokens: bool, unique: UniquePromise, ) -> Self { let source = Source::new(source_text, unique); @@ -114,6 +122,7 @@ impl<'a> Lexer<'a> { escaped_strings: FxHashMap::default(), escaped_templates: FxHashMap::default(), multi_line_comment_end_finder: None, + tokens: if collect_tokens { Some(ArenaVec::new_in(allocator)) } else { None }, } } @@ -126,7 +135,7 @@ impl<'a> Lexer<'a> { source_type: SourceType, ) -> Self { let unique = UniquePromise::new_for_tests_and_benchmarks(); - Self::new(allocator, source_text, source_type, unique) + Self::new(allocator, source_text, source_type, false, unique) } /// Get errors. @@ -149,10 +158,15 @@ impl<'a> Lexer<'a> { } else { ErrorSnapshot::Count(self.errors.len()) }; + let tokens_len = match &self.tokens { + Some(tokens) => tokens.len(), + None => 0, + }; LexerCheckpoint { source_position: self.source.position(), token: self.token, errors_snapshot, + tokens_len, } } @@ -168,6 +182,10 @@ impl<'a> Lexer<'a> { source_position: self.source.position(), token: self.token, errors_snapshot, + tokens_len: match &self.tokens { + Some(tokens) => tokens.len(), + None => 0, + }, } } @@ -180,6 +198,9 @@ impl<'a> Lexer<'a> { } self.source.set_position(checkpoint.source_position); self.token = checkpoint.token; + if let Some(tokens) = self.tokens.as_mut() { + tokens.truncate(checkpoint.tokens_len); + } } pub fn peek_token(&mut self) -> Token { @@ -229,6 +250,9 @@ impl<'a> Lexer<'a> { self.token.set_end(self.offset()); let token = self.token; self.trivia_builder.handle_token(token); + if let Some(tokens) = self.tokens.as_mut() { + tokens.push(token); + } self.token = Token::default(); token } @@ -239,6 +263,19 @@ impl<'a> Lexer<'a> { self.source.advance_to_end(); } + /// Retrieve collected tokens. + /// This should only be called once per source text, as it consumes the tokens vec.
+ /// Panics if the lexer was not configured to collect tokens. + #[inline] + pub fn tokens(&mut self) -> ArenaVec<'a, Token> { + if let Some(tokens) = self.tokens.take() { + tokens + } else { + let backtrace = std::backtrace::Backtrace::capture(); + panic!("Can't retrieve tokens because they were not collected\n{backtrace}"); + } + } + // ---------- Private Methods ---------- // fn error(&mut self, error: OxcDiagnostic) { self.errors.push(error); diff --git a/crates/oxc_parser/src/lexer/template.rs b/crates/oxc_parser/src/lexer/template.rs index 41d93e8d300d8..3147685c6a6f1 100644 --- a/crates/oxc_parser/src/lexer/template.rs +++ b/crates/oxc_parser/src/lexer/template.rs @@ -442,7 +442,8 @@ mod test { fn run_test(source_text: String, expected_escaped: String, is_only_part: bool) { let allocator = Allocator::default(); let unique = UniquePromise::new_for_tests_and_benchmarks(); - let mut lexer = Lexer::new(&allocator, &source_text, SourceType::default(), unique); + let mut lexer = + Lexer::new(&allocator, &source_text, SourceType::default(), false, unique); let token = lexer.next_token(); assert_eq!( token.kind(), diff --git a/crates/oxc_parser/src/lib.rs b/crates/oxc_parser/src/lib.rs index e10e84355b328..6f449a910b3e2 100644 --- a/crates/oxc_parser/src/lib.rs +++ b/crates/oxc_parser/src/lib.rs @@ -79,6 +79,8 @@ mod ts; mod diagnostics; +pub use lexer::{Kind, Token}; + // Expose lexer only in benchmarks #[cfg(not(feature = "benchmarking"))] mod lexer; @@ -86,7 +88,7 @@ mod lexer; #[doc(hidden)] pub mod lexer; -use oxc_allocator::{Allocator, Box as ArenaBox, Dummy}; +use oxc_allocator::{Allocator, Box as ArenaBox, Dummy, Vec as ArenaVec}; use oxc_ast::{ AstBuilder, ast::{Expression, Program}, @@ -98,7 +100,7 @@ use oxc_syntax::module_record::ModuleRecord; use crate::{ context::{Context, StatementContext}, error_handler::FatalError, - lexer::{Lexer, Token}, + lexer::Lexer, module_record::ModuleRecordBuilder, state::ParserState, }; @@ -180,6 +182,11 @@ pub struct 
ParserReturn<'a> { /// Whether the file is [flow](https://flow.org). pub is_flow_language: bool, + + /// All tokens collected from the lexer, in source order. + /// + /// If `ParseOptions::collect_tokens` is set to `false`, this will be `None`. + pub tokens: Option<ArenaVec<'a, Token>>, } /// Parse options @@ -221,6 +228,13 @@ pub struct ParseOptions { /// /// [`V8IntrinsicExpression`]: oxc_ast::ast::V8IntrinsicExpression pub allow_v8_intrinsics: bool, + + /// Whether the parser should collect all tokens from the lexer and return them in the `tokens` field of the `ParserReturn` struct. + /// + /// If this option is set to `false`, `ParserReturn::tokens` will be `None`. + /// + /// Default: `false` + pub collect_tokens: bool, } impl Default for ParseOptions { @@ -231,6 +245,7 @@ allow_return_outside_function: false, preserve_parens: true, allow_v8_intrinsics: false, + collect_tokens: false, } } } @@ -408,7 +423,7 @@ impl<'a> ParserImpl<'a> { ) -> Self { Self { options, - lexer: Lexer::new(allocator, source_text, source_type, unique), + lexer: Lexer::new(allocator, source_text, source_type, options.collect_tokens, unique), source_type, source_text, errors: vec![], @@ -465,6 +480,7 @@ impl<'a> ParserImpl<'a> { errors.push(error); } let (module_record, module_record_errors) = self.module_record_builder.build(); + let tokens = if self.options.collect_tokens { Some(self.lexer.tokens()) } else { None }; if errors.len() != 1 { errors.reserve(self.lexer.errors.len() + self.errors.len()); errors.extend(self.lexer.errors); @@ -493,6 +509,7 @@ impl<'a> ParserImpl<'a> { irregular_whitespaces, panicked, is_flow_language, + tokens, } } diff --git a/napi/playground/src/lib.rs b/napi/playground/src/lib.rs index a482d316b9d34..d916e26de7113 100644 --- a/napi/playground/src/lib.rs +++ b/napi/playground/src/lib.rs @@ -218,6 +218,7 @@ impl Oxc { ) -> (Program<'a>, oxc::syntax::module_record::ModuleRecord<'a>) { let parser_options = ParseOptions { parse_regular_expression: true, +
collect_tokens: false, allow_return_outside_function: parser_options.allow_return_outside_function, preserve_parens: parser_options.preserve_parens, allow_v8_intrinsics: parser_options.allow_v8_intrinsics, diff --git a/tasks/benchmark/benches/parser.rs b/tasks/benchmark/benches/parser.rs index d0a83dc22a749..7ef65f8b4a0a6 100644 --- a/tasks/benchmark/benches/parser.rs +++ b/tasks/benchmark/benches/parser.rs @@ -44,6 +44,7 @@ fn bench_parser_with_tokens(criterion: &mut Criterion) { Parser::new(&allocator, source_text, source_type) .with_options(ParseOptions { parse_regular_expression: true, + collect_tokens: true, ..ParseOptions::default() }) .parse();