From b2b7a552f317fa00c75fd78354b11ee61022675e Mon Sep 17 00:00:00 2001
From: overlookmotel <557937+overlookmotel@users.noreply.github.com>
Date: Tue, 24 Feb 2026 13:43:04 +0000
Subject: [PATCH] fix(estree/tokens): generate tokens for files with BOM
 (#19535)

Fix a bug where we didn't produce any tokens for files which start with a BOM.

As a side effect, the existing `Utf8ToUtf16` span converter (already created to convert spans in the AST) is now re-used to convert token spans, rather than creating a second one.
---
 apps/oxlint/src/js_plugins/parse.rs           |  31 +++--
 apps/oxlint/test/fixtures/tokens/files/bom.js |   1 +
 .../test/fixtures/tokens/files/unicode.js     |   3 +
 .../test/fixtures/tokens/output.snap.md       | 107 +++++++++++++++++-
 apps/oxlint/test/fixtures/tokens/plugin.ts    |   3 +
 crates/oxc_estree_tokens/src/lib.rs           |  23 ++--
 crates/oxc_linter/src/lib.rs                  |  30 +++--
 tasks/benchmark/benches/parser.rs             |   2 +
 tasks/coverage/src/tools.rs                   |  22 +++-
 9 files changed, 173 insertions(+), 49 deletions(-)
 create mode 100644 apps/oxlint/test/fixtures/tokens/files/bom.js
 create mode 100644 apps/oxlint/test/fixtures/tokens/files/unicode.js

diff --git a/apps/oxlint/src/js_plugins/parse.rs b/apps/oxlint/src/js_plugins/parse.rs
index bbce63c434a27..8fd9772b2f985 100644
--- a/apps/oxlint/src/js_plugins/parse.rs
+++ b/apps/oxlint/src/js_plugins/parse.rs
@@ -197,7 +197,8 @@ unsafe fn parse_raw_impl(
             const BOM: &str = "\u{feff}";
             const BOM_LEN: usize = BOM.len();
 
-            let mut source_text = program.source_text;
+            let original_source_text = program.source_text;
+            let mut source_text = original_source_text;
             let has_bom = source_text.starts_with(BOM);
             if has_bom {
                 source_text = &source_text[BOM_LEN..];
@@ -216,22 +217,18 @@ unsafe fn parse_raw_impl(
             span_converter.convert_program(program);
             span_converter.convert_comments(&mut program.comments);
 
-            let (tokens_offset, tokens_len) = if has_bom {
-                // Fallback to TypeScript token parsing in JS for BOM files.
-                (0, 0)
-            } else {
-                let tokens_json = to_estree_tokens_json(
-                    &tokens,
-                    program,
-                    EstreeTokenOptions::linter(),
-                    &allocator,
-                );
-                let tokens_json = allocator.alloc_str(&tokens_json);
-                let tokens_offset = tokens_json.as_ptr() as u32;
-                #[expect(clippy::cast_possible_truncation)]
-                let tokens_len = tokens_json.len() as u32;
-                (tokens_offset, tokens_len)
-            };
+            let tokens_json = to_estree_tokens_json(
+                &tokens,
+                program,
+                original_source_text,
+                &span_converter,
+                EstreeTokenOptions::linter(),
+                &allocator,
+            );
+            let tokens_json = allocator.alloc_str(&tokens_json);
+            let tokens_offset = tokens_json.as_ptr() as u32;
+            #[expect(clippy::cast_possible_truncation)]
+            let tokens_len = tokens_json.len() as u32;
 
             // Return offset of `Program` within buffer (bottom 32 bits of pointer)
             let program_offset = ptr::from_ref(program) as u32;
diff --git a/apps/oxlint/test/fixtures/tokens/files/bom.js b/apps/oxlint/test/fixtures/tokens/files/bom.js
new file mode 100644
index 0000000000000..3a48f8e325ea0
--- /dev/null
+++ b/apps/oxlint/test/fixtures/tokens/files/bom.js
@@ -0,0 +1 @@
+﻿a = b;
diff --git a/apps/oxlint/test/fixtures/tokens/files/unicode.js b/apps/oxlint/test/fixtures/tokens/files/unicode.js
new file mode 100644
index 0000000000000..08dfdc8cc6c45
--- /dev/null
+++ b/apps/oxlint/test/fixtures/tokens/files/unicode.js
@@ -0,0 +1,3 @@
+a;
+// 😀🤪😆😎🤮
+b;
diff --git a/apps/oxlint/test/fixtures/tokens/output.snap.md b/apps/oxlint/test/fixtures/tokens/output.snap.md
index 2b34d967c66c3..69bf23f01de4b 100644
--- a/apps/oxlint/test/fixtures/tokens/output.snap.md
+++ b/apps/oxlint/test/fixtures/tokens/output.snap.md
@@ -3,6 +3,50 @@
 
 # stdout
 ```
+  x tokens-plugin(tokens): Identifier ("a")
+   ,-[files/bom.js:1:4]
+ 1 | ﻿a = b;
+   : ^
+   `----
+
+  x tokens-plugin(tokens): Tokens and comments:
+  | Identifier        loc= 1:0 - 1:1    range= 0-1     "a"
+  | Punctuator        loc= 1:2 - 1:3    range= 2-3     "="
+  | Identifier        loc= 1:4 - 1:5    range= 4-5     "b"
+  | Punctuator        loc= 1:5 - 1:6    range= 5-6     ";"
+   ,-[files/bom.js:1:4]
+ 1 | ﻿a = b;
+   : ^^^^^^^
+   `----
+
+  x tokens-plugin(tokens): Tokens:
+  | Identifier        loc= 1:0 - 1:1    range= 0-1     "a"
+  | Punctuator        loc= 1:2 - 1:3    range= 2-3     "="
+  | Identifier        loc= 1:4 - 1:5    range= 4-5     "b"
+  | Punctuator        loc= 1:5 - 1:6    range= 5-6     ";"
+   ,-[files/bom.js:1:4]
+ 1 | ﻿a = b;
+   : ^^^^^^^
+   `----
+
+  x tokens-plugin(tokens): Punctuator ("=")
+   ,-[files/bom.js:1:6]
+ 1 | ﻿a = b;
+   :   ^
+   `----
+
+  x tokens-plugin(tokens): Identifier ("b")
+   ,-[files/bom.js:1:8]
+ 1 | ﻿a = b;
+   :     ^
+   `----
+
+  x tokens-plugin(tokens): Punctuator (";")
+   ,-[files/bom.js:1:9]
+ 1 | ﻿a = b;
+   :      ^
+   `----
+
   x tokens-plugin(tokens): Keyword ("const")
    ,-[files/generic_arrow.ts:1:1]
  1 | const obj = {
@@ -1071,8 +1115,67 @@
    :  ^
    `----
 
-Found 0 warnings and 109 errors.
-Finished in Xms on 4 files with 1 rules using X threads.
+  x tokens-plugin(tokens): Identifier ("a")
+   ,-[files/unicode.js:1:1]
+ 1 | a;
+   : ^
+ 2 | // 😀🤪😆😎🤮
+   `----
+
+  x tokens-plugin(tokens): Tokens and comments:
+  | Identifier        loc= 1:0 - 1:1    range= 0-1     "a"
+  | Punctuator        loc= 1:1 - 1:2    range= 1-2     ";"
+  | Line              loc= 2:0 - 2:13   range= 3-16    " 😀🤪😆😎🤮"
+  | Identifier        loc= 3:0 - 3:1    range= 17-18   "b"
+  | Punctuator        loc= 3:1 - 3:2    range= 18-19   ";"
+   ,-[files/unicode.js:1:1]
+ 1 | ,-> a;
+ 2 | |   // 😀🤪😆😎🤮
+ 3 | `-> b;
+   `----
+
+  x tokens-plugin(tokens): Tokens:
+  | Identifier        loc= 1:0 - 1:1    range= 0-1     "a"
+  | Punctuator        loc= 1:1 - 1:2    range= 1-2     ";"
+  | Identifier        loc= 3:0 - 3:1    range= 17-18   "b"
+  | Punctuator        loc= 3:1 - 3:2    range= 18-19   ";"
+   ,-[files/unicode.js:1:1]
+ 1 | ,-> a;
+ 2 | |   // 😀🤪😆😎🤮
+ 3 | `-> b;
+   `----
+
+  x tokens-plugin(tokens): Punctuator (";")
+   ,-[files/unicode.js:1:2]
+ 1 | a;
+   :  ^
+ 2 | // 😀🤪😆😎🤮
+   `----
+
+  x tokens-plugin(tokens): Line (" 😀🤪😆😎🤮")
+   ,-[files/unicode.js:2:1]
+ 1 | a;
+ 2 | // 😀🤪😆😎🤮
+   : ^^^^^^^^^^^^^
+ 3 | b;
+   `----
+
+  x tokens-plugin(tokens): Identifier ("b")
+   ,-[files/unicode.js:3:1]
+ 2 | // 😀🤪😆😎🤮
+ 3 | b;
+   : ^
+   `----
+
+  x tokens-plugin(tokens): Punctuator (";")
+   ,-[files/unicode.js:3:2]
+ 2 | // 😀🤪😆😎🤮
+ 3 | b;
+   :  ^
+   `----
+
+Found 0 warnings and 122 errors.
+Finished in Xms on 6 files with 1 rules using X threads.
 ```
 
 # stderr
diff --git a/apps/oxlint/test/fixtures/tokens/plugin.ts b/apps/oxlint/test/fixtures/tokens/plugin.ts
index 8fab8aa3084c2..92d9b0d2ce1d0 100644
--- a/apps/oxlint/test/fixtures/tokens/plugin.ts
+++ b/apps/oxlint/test/fixtures/tokens/plugin.ts
@@ -13,6 +13,9 @@ const rule: Rule = {
 
     const { ast } = sourceCode;
 
+    // Ensure that `bom.js` does have a BOM (guarding against it being accidentally removed by e.g. formatting)
+    if (context.filename.endsWith("bom.js")) assert(sourceCode.hasBOM);
+
     for (const tokenOrComment of tokensAndComments) {
       // Check getting `range` / `loc` properties twice results in same objects
       const { range, loc } = tokenOrComment;
diff --git a/crates/oxc_estree_tokens/src/lib.rs b/crates/oxc_estree_tokens/src/lib.rs
index 58b90cf7d74f2..a8dcbc81b2698 100644
--- a/crates/oxc_estree_tokens/src/lib.rs
+++ b/crates/oxc_estree_tokens/src/lib.rs
@@ -69,13 +69,19 @@ impl EstreeTokenOptions {
 }
 
 /// Serialize tokens to JSON.
+///
+/// `source_text` must be the original source text, prior to BOM removal.
+/// i.e. the BOM must be present at the start of `source_text`, if the file has a BOM.
 pub fn to_estree_tokens_json(
     tokens: &[Token],
     program: &Program<'_>,
+    source_text: &str,
+    span_converter: &Utf8ToUtf16,
     options: EstreeTokenOptions,
     allocator: &Allocator,
 ) -> String {
-    let estree_tokens = to_estree_tokens(tokens, program, options, allocator);
+    let estree_tokens =
+        to_estree_tokens(tokens, program, source_text, span_converter, options, allocator);
     serde_json::to_string_pretty(&estree_tokens).unwrap_or_default()
 }
 
@@ -83,6 +89,8 @@ pub fn to_estree_tokens_json(
 fn to_estree_tokens<'a>(
     tokens: &[Token],
     program: &Program<'a>,
+    source_text: &'a str,
+    span_converter: &Utf8ToUtf16,
     options: EstreeTokenOptions,
     allocator: &'a Allocator,
 ) -> ArenaVec<'a, EstreeToken<'a>> {
@@ -95,12 +103,9 @@ fn to_estree_tokens<'a>(
     };
     context.visit_program(program);
 
-    // Create UTF-8 to UTF-16 conversion table
-    let source_text = program.source_text;
-    let utf8_to_utf16 = Utf8ToUtf16::new(source_text);
-    let mut converter = utf8_to_utf16.converter();
-
     // Convert tokens to `EstreeToken`s
+    let mut span_converter = span_converter.converter();
+
     let mut estree_tokens = ArenaVec::with_capacity_in(tokens.len(), allocator);
     for token in tokens {
         let kind = token.kind();
@@ -108,9 +113,9 @@ fn to_estree_tokens<'a>(
 
         let mut start = token.start();
         let mut end = token.end();
-        if let Some(converter) = converter.as_mut() {
-            converter.convert_offset(&mut start);
-            converter.convert_offset(&mut end);
+        if let Some(span_converter) = span_converter.as_mut() {
+            span_converter.convert_offset(&mut start);
+            span_converter.convert_offset(&mut end);
         }
         let span_utf16 = Span::new(start, end);
 
diff --git a/crates/oxc_linter/src/lib.rs b/crates/oxc_linter/src/lib.rs
index 54aa546f5e574..8875d5a6efd9b 100644
--- a/crates/oxc_linter/src/lib.rs
+++ b/crates/oxc_linter/src/lib.rs
@@ -572,28 +572,24 @@ impl Linter {
         span_converter.convert_program(program);
         span_converter.convert_comments(&mut program.comments);
 
-        let (tokens_offset, tokens_len) = if has_bom {
-            // Keep JS fallback path for BOM sources.
-            (0, 0)
-        } else if let Some(parser_tokens) = ctx_host.current_sub_host().parser_tokens() {
-            let tokens_json = to_estree_tokens_json(
-                parser_tokens,
-                program,
-                EstreeTokenOptions::linter(),
-                allocator,
-            );
-            if tokens_json.is_empty() {
-                (0, 0)
-            } else {
+        let (tokens_offset, tokens_len) =
+            if let Some(tokens) = ctx_host.current_sub_host().parser_tokens() {
+                let tokens_json = to_estree_tokens_json(
+                    tokens,
+                    program,
+                    original_source_text,
+                    &span_converter,
+                    EstreeTokenOptions::linter(),
+                    allocator,
+                );
                 let tokens_json = allocator.alloc_str(&tokens_json);
                 let tokens_offset = tokens_json.as_ptr() as u32;
                 #[expect(clippy::cast_possible_truncation)]
                 let tokens_len = tokens_json.len() as u32;
                 (tokens_offset, tokens_len)
-            }
-        } else {
-            (0, 0)
-        };
+            } else {
+                (0, 0)
+            };
 
         // Get offset of `Program` within buffer (bottom 32 bits of pointer)
         let program_offset = ptr::from_ref(program) as u32;
diff --git a/tasks/benchmark/benches/parser.rs b/tasks/benchmark/benches/parser.rs
index b5d7cb7681045..700d2b3380846 100644
--- a/tasks/benchmark/benches/parser.rs
+++ b/tasks/benchmark/benches/parser.rs
@@ -143,6 +143,8 @@ fn bench_estree_tokens(criterion: &mut Criterion) {
                     let tokens_json = to_estree_tokens_json(
                         &tokens,
                         &program,
+                        program.source_text,
+                        &span_converter,
                         EstreeTokenOptions::test262(),
                         &allocator,
                     );
diff --git a/tasks/coverage/src/tools.rs b/tasks/coverage/src/tools.rs
index 500a3fa18ef10..d5b9b417fec5b 100644
--- a/tasks/coverage/src/tools.rs
+++ b/tasks/coverage/src/tools.rs
@@ -854,8 +854,14 @@ pub fn run_estree_test262_tokens(files: &[Test262File]) -> Vec<CoverageResult> {
             let span_converter = Utf8ToUtf16::new(source_text);
             span_converter.convert_program_with_ascending_order_checks(&mut program);
 
-            let oxc_tokens_json =
-                to_estree_tokens_json(&tokens, &program, EstreeTokenOptions::test262(), &allocator);
+            let oxc_tokens_json = to_estree_tokens_json(
+                &tokens,
+                &program,
+                source_text,
+                &span_converter,
+                EstreeTokenOptions::test262(),
+                &allocator,
+            );
 
             let token_path = workspace_root()
                 .join("estree-conformance/tests/test262-tokens")
@@ -898,8 +904,14 @@ pub fn run_estree_acorn_jsx_tokens(files: &[AcornJsxFile]) -> Vec<CoverageResult
             let span_converter = Utf8ToUtf16::new(source_text);
             span_converter.convert_program_with_ascending_order_checks(&mut program);
 
-            let oxc_tokens_json =
-                to_estree_tokens_json(&tokens, &program, EstreeTokenOptions::test262(), &allocator);
+            let oxc_tokens_json = to_estree_tokens_json(
+                &tokens,
+                &program,
+                source_text,
+                &span_converter,
+                EstreeTokenOptions::test262(),
+                &allocator,
+            );
 
             let token_path = workspace_root().join(f.path.with_extension("tokens.json"));
             let expected_tokens_json = fs::read_to_string(&token_path).unwrap_or_default();
@@ -1078,6 +1090,8 @@ pub fn run_estree_typescript_tokens(files: &[TypeScriptFile]) -> Vec<CoverageRes
                 let oxc_tokens_json = to_estree_tokens_json(
                     &tokens,
                     &program,
+                    source_text,
+                    &span_converter,
                     EstreeTokenOptions::typescript(),
                     &allocator,
                 );