From b2b7a552f317fa00c75fd78354b11ee61022675e Mon Sep 17 00:00:00 2001 From: overlookmotel <557937+overlookmotel@users.noreply.github.com> Date: Tue, 24 Feb 2026 13:43:04 +0000 Subject: [PATCH] fix(estree/tokens): generate tokens for files with BOM (#19535) Fix a bug where we didn't produce any tokens for files which start with a BOM. As a side effect, re-use the existing `Utf8ToUtf16` span converter created for converting spans in the AST again for tokens, rather than creating another one. --- apps/oxlint/src/js_plugins/parse.rs | 31 +++-- apps/oxlint/test/fixtures/tokens/files/bom.js | 1 + .../test/fixtures/tokens/files/unicode.js | 3 + .../test/fixtures/tokens/output.snap.md | 107 +++++++++++++++++- apps/oxlint/test/fixtures/tokens/plugin.ts | 3 + crates/oxc_estree_tokens/src/lib.rs | 23 ++-- crates/oxc_linter/src/lib.rs | 30 +++-- tasks/benchmark/benches/parser.rs | 2 + tasks/coverage/src/tools.rs | 22 +++- 9 files changed, 173 insertions(+), 49 deletions(-) create mode 100644 apps/oxlint/test/fixtures/tokens/files/bom.js create mode 100644 apps/oxlint/test/fixtures/tokens/files/unicode.js diff --git a/apps/oxlint/src/js_plugins/parse.rs b/apps/oxlint/src/js_plugins/parse.rs index bbce63c434a27..8fd9772b2f985 100644 --- a/apps/oxlint/src/js_plugins/parse.rs +++ b/apps/oxlint/src/js_plugins/parse.rs @@ -197,7 +197,8 @@ unsafe fn parse_raw_impl( const BOM: &str = "\u{feff}"; const BOM_LEN: usize = BOM.len(); - let mut source_text = program.source_text; + let original_source_text = program.source_text; + let mut source_text = original_source_text; let has_bom = source_text.starts_with(BOM); if has_bom { source_text = &source_text[BOM_LEN..]; @@ -216,22 +217,18 @@ unsafe fn parse_raw_impl( span_converter.convert_program(program); span_converter.convert_comments(&mut program.comments); - let (tokens_offset, tokens_len) = if has_bom { - // Fallback to TypeScript token parsing in JS for BOM files. - (0, 0) - } else { - let tokens_json = to_estree_tokens_json( - &tokens, - program, - EstreeTokenOptions::linter(), - &allocator, - ); - let tokens_json = allocator.alloc_str(&tokens_json); - let tokens_offset = tokens_json.as_ptr() as u32; - #[expect(clippy::cast_possible_truncation)] - let tokens_len = tokens_json.len() as u32; - (tokens_offset, tokens_len) - }; + let tokens_json = to_estree_tokens_json( + &tokens, + program, + original_source_text, + &span_converter, + EstreeTokenOptions::linter(), + &allocator, + ); + let tokens_json = allocator.alloc_str(&tokens_json); + let tokens_offset = tokens_json.as_ptr() as u32; + #[expect(clippy::cast_possible_truncation)] + let tokens_len = tokens_json.len() as u32; // Return offset of `Program` within buffer (bottom 32 bits of pointer) let program_offset = ptr::from_ref(program) as u32; diff --git a/apps/oxlint/test/fixtures/tokens/files/bom.js b/apps/oxlint/test/fixtures/tokens/files/bom.js new file mode 100644 index 0000000000000..3a48f8e325ea0 --- /dev/null +++ b/apps/oxlint/test/fixtures/tokens/files/bom.js @@ -0,0 +1 @@ +a = b; diff --git a/apps/oxlint/test/fixtures/tokens/files/unicode.js b/apps/oxlint/test/fixtures/tokens/files/unicode.js new file mode 100644 index 0000000000000..08dfdc8cc6c45 --- /dev/null +++ b/apps/oxlint/test/fixtures/tokens/files/unicode.js @@ -0,0 +1,3 @@ +a; +// 😀🤪😆😎🤮 +b; diff --git a/apps/oxlint/test/fixtures/tokens/output.snap.md b/apps/oxlint/test/fixtures/tokens/output.snap.md index 2b34d967c66c3..69bf23f01de4b 100644 --- a/apps/oxlint/test/fixtures/tokens/output.snap.md +++ b/apps/oxlint/test/fixtures/tokens/output.snap.md @@ -3,6 +3,50 @@ # stdout ``` + x tokens-plugin(tokens): Identifier ("a") + ,-[files/bom.js:1:4] + 1 | a = b; + : ^ + `---- + + x tokens-plugin(tokens): Tokens and comments: + | Identifier loc= 1:0 - 1:1 range= 0-1 "a" + | Punctuator loc= 1:2 - 1:3 range= 2-3 "=" + | Identifier loc= 1:4 - 1:5 range= 4-5 "b" + | Punctuator loc= 1:5 - 1:6 range= 5-6 ";" + ,-[files/bom.js:1:4] + 1 | a = b; + : ^^^^^^^ + `---- + + x tokens-plugin(tokens): Tokens: + | Identifier loc= 1:0 - 1:1 range= 0-1 "a" + | Punctuator loc= 1:2 - 1:3 range= 2-3 "=" + | Identifier loc= 1:4 - 1:5 range= 4-5 "b" + | Punctuator loc= 1:5 - 1:6 range= 5-6 ";" + ,-[files/bom.js:1:4] + 1 | a = b; + : ^^^^^^^ + `---- + + x tokens-plugin(tokens): Punctuator ("=") + ,-[files/bom.js:1:6] + 1 | a = b; + : ^ + `---- + + x tokens-plugin(tokens): Identifier ("b") + ,-[files/bom.js:1:8] + 1 | a = b; + : ^ + `---- + + x tokens-plugin(tokens): Punctuator (";") + ,-[files/bom.js:1:9] + 1 | a = b; + : ^ + `---- + x tokens-plugin(tokens): Keyword ("const") ,-[files/generic_arrow.ts:1:1] 1 | const obj = { @@ -1071,8 +1115,67 @@ : ^ `---- -Found 0 warnings and 109 errors. -Finished in Xms on 4 files with 1 rules using X threads. + x tokens-plugin(tokens): Identifier ("a") + ,-[files/unicode.js:1:1] + 1 | a; + : ^ + 2 | // 😀🤪😆😎🤮 + `---- + + x tokens-plugin(tokens): Tokens and comments: + | Identifier loc= 1:0 - 1:1 range= 0-1 "a" + | Punctuator loc= 1:1 - 1:2 range= 1-2 ";" + | Line loc= 2:0 - 2:13 range= 3-16 " 😀🤪😆😎🤮" + | Identifier loc= 3:0 - 3:1 range= 17-18 "b" + | Punctuator loc= 3:1 - 3:2 range= 18-19 ";" + ,-[files/unicode.js:1:1] + 1 | ,-> a; + 2 | | // 😀🤪😆😎🤮 + 3 | `-> b; + `---- + + x tokens-plugin(tokens): Tokens: + | Identifier loc= 1:0 - 1:1 range= 0-1 "a" + | Punctuator loc= 1:1 - 1:2 range= 1-2 ";" + | Identifier loc= 3:0 - 3:1 range= 17-18 "b" + | Punctuator loc= 3:1 - 3:2 range= 18-19 ";" + ,-[files/unicode.js:1:1] + 1 | ,-> a; + 2 | | // 😀🤪😆😎🤮 + 3 | `-> b; + `---- + + x tokens-plugin(tokens): Punctuator (";") + ,-[files/unicode.js:1:2] + 1 | a; + : ^ + 2 | // 😀🤪😆😎🤮 + `---- + + x tokens-plugin(tokens): Line (" 😀🤪😆😎🤮") + ,-[files/unicode.js:2:1] + 1 | a; + 2 | // 😀🤪😆😎🤮 + : ^^^^^^^^^^^^^ + 3 | b; + `---- + + x tokens-plugin(tokens): Identifier ("b") + ,-[files/unicode.js:3:1] + 2 | // 😀🤪😆😎🤮 + 3 | b; + : ^ + `---- + + x tokens-plugin(tokens): Punctuator (";") + ,-[files/unicode.js:3:2] + 2 | // 😀🤪😆😎🤮 + 3 | b; + : ^ + `---- + +Found 0 warnings and 122 errors. +Finished in Xms on 6 files with 1 rules using X threads. ``` # stderr diff --git a/apps/oxlint/test/fixtures/tokens/plugin.ts b/apps/oxlint/test/fixtures/tokens/plugin.ts index 8fab8aa3084c2..92d9b0d2ce1d0 100644 --- a/apps/oxlint/test/fixtures/tokens/plugin.ts +++ b/apps/oxlint/test/fixtures/tokens/plugin.ts @@ -13,6 +13,9 @@ const rule: Rule = { const { ast } = sourceCode; + // Ensure that `bom.js` does have a BOM (guarding against it being accidentally removed by e.g. formatting) + if (context.filename.endsWith("bom.js")) assert(sourceCode.hasBOM); + for (const tokenOrComment of tokensAndComments) { // Check getting `range` / `loc` properties twice results in same objects const { range, loc } = tokenOrComment; diff --git a/crates/oxc_estree_tokens/src/lib.rs b/crates/oxc_estree_tokens/src/lib.rs index 58b90cf7d74f2..a8dcbc81b2698 100644 --- a/crates/oxc_estree_tokens/src/lib.rs +++ b/crates/oxc_estree_tokens/src/lib.rs @@ -69,13 +69,19 @@ impl EstreeTokenOptions { } /// Serialize tokens to JSON. +/// +/// `source_text` must be the original source text, prior to BOM removal. +/// i.e. BOM must be present on start of `source_text`, if the file has a BOM. pub fn to_estree_tokens_json( tokens: &[Token], program: &Program<'_>, + source_text: &str, + span_converter: &Utf8ToUtf16, options: EstreeTokenOptions, allocator: &Allocator, ) -> String { - let estree_tokens = to_estree_tokens(tokens, program, options, allocator); + let estree_tokens = + to_estree_tokens(tokens, program, source_text, span_converter, options, allocator); serde_json::to_string_pretty(&estree_tokens).unwrap_or_default() } @@ -83,6 +89,8 @@ pub fn to_estree_tokens_json( fn to_estree_tokens<'a>( tokens: &[Token], program: &Program<'a>, + source_text: &'a str, + span_converter: &Utf8ToUtf16, options: EstreeTokenOptions, allocator: &'a Allocator, ) -> ArenaVec<'a, EstreeToken<'a>> { @@ -95,12 +103,9 @@ fn to_estree_tokens<'a>( }; context.visit_program(program); - // Create UTF-8 to UTF-16 conversion table - let source_text = program.source_text; - let utf8_to_utf16 = Utf8ToUtf16::new(source_text); - let mut converter = utf8_to_utf16.converter(); - // Convert tokens to `EstreeToken`s + let mut span_converter = span_converter.converter(); + let mut estree_tokens = ArenaVec::with_capacity_in(tokens.len(), allocator); for token in tokens { let kind = token.kind(); @@ -108,9 +113,9 @@ fn to_estree_tokens<'a>( let mut start = token.start(); let mut end = token.end(); - if let Some(converter) = converter.as_mut() { - converter.convert_offset(&mut start); - converter.convert_offset(&mut end); + if let Some(span_converter) = span_converter.as_mut() { + span_converter.convert_offset(&mut start); + span_converter.convert_offset(&mut end); } let span_utf16 = Span::new(start, end); diff --git a/crates/oxc_linter/src/lib.rs b/crates/oxc_linter/src/lib.rs index 54aa546f5e574..8875d5a6efd9b 100644 --- a/crates/oxc_linter/src/lib.rs +++ b/crates/oxc_linter/src/lib.rs @@ -572,28 +572,24 @@ impl Linter { span_converter.convert_program(program); span_converter.convert_comments(&mut program.comments); - let (tokens_offset, tokens_len) = if has_bom { - // Keep JS fallback path for BOM sources. - (0, 0) - } else if let Some(parser_tokens) = ctx_host.current_sub_host().parser_tokens() { - let tokens_json = to_estree_tokens_json( - parser_tokens, - program, - EstreeTokenOptions::linter(), - allocator, - ); - if tokens_json.is_empty() { - (0, 0) - } else { + let (tokens_offset, tokens_len) = + if let Some(tokens) = ctx_host.current_sub_host().parser_tokens() { + let tokens_json = to_estree_tokens_json( + tokens, + program, + original_source_text, + &span_converter, + EstreeTokenOptions::linter(), + allocator, + ); let tokens_json = allocator.alloc_str(&tokens_json); let tokens_offset = tokens_json.as_ptr() as u32; #[expect(clippy::cast_possible_truncation)] let tokens_len = tokens_json.len() as u32; (tokens_offset, tokens_len) - } - } else { - (0, 0) - }; + } else { + (0, 0) + }; // Get offset of `Program` within buffer (bottom 32 bits of pointer) let program_offset = ptr::from_ref(program) as u32; diff --git a/tasks/benchmark/benches/parser.rs b/tasks/benchmark/benches/parser.rs index b5d7cb7681045..700d2b3380846 100644 --- a/tasks/benchmark/benches/parser.rs +++ b/tasks/benchmark/benches/parser.rs @@ -143,6 +143,8 @@ fn bench_estree_tokens(criterion: &mut Criterion) { let tokens_json = to_estree_tokens_json( &tokens, &program, + program.source_text, + &span_converter, EstreeTokenOptions::test262(), &allocator, ); diff --git a/tasks/coverage/src/tools.rs b/tasks/coverage/src/tools.rs index 500a3fa18ef10..d5b9b417fec5b 100644 --- a/tasks/coverage/src/tools.rs +++ b/tasks/coverage/src/tools.rs @@ -854,8 +854,14 @@ pub fn run_estree_test262_tokens(files: &[Test262File]) -> Vec { let span_converter = Utf8ToUtf16::new(source_text); span_converter.convert_program_with_ascending_order_checks(&mut program); - let oxc_tokens_json = - to_estree_tokens_json(&tokens, &program, EstreeTokenOptions::test262(), &allocator); + let oxc_tokens_json = to_estree_tokens_json( + &tokens, + &program, + source_text, + &span_converter, + EstreeTokenOptions::test262(), + &allocator, + ); let token_path = workspace_root() .join("estree-conformance/tests/test262-tokens") @@ -898,8 +904,14 @@ pub fn run_estree_acorn_jsx_tokens(files: &[AcornJsxFile]) -> Vec Vec