diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 06b25ba60fff9..dd73420399875 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -204,6 +204,7 @@ jobs: name: Run tests in workspace env: RUN_RAW_RANGE_TESTS: "true" + RUN_RAW_TOKENS_TESTS: "true" run: | rustup target add wasm32-wasip1-threads pnpm run build-test diff --git a/Cargo.lock b/Cargo.lock index d2bd722365cc4..d1d67c2386fe3 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2200,6 +2200,7 @@ dependencies = [ "oxc", "oxc_ast_macros", "oxc_estree", + "oxc_estree_tokens", "oxc_napi", "rustc-hash", ] diff --git a/napi/parser/Cargo.toml b/napi/parser/Cargo.toml index dada9a0e29b83..421b0876abc84 100644 --- a/napi/parser/Cargo.toml +++ b/napi/parser/Cargo.toml @@ -25,6 +25,7 @@ doctest = false oxc = { workspace = true, features = ["ast_visit", "regular_expression", "semantic", "serialize"] } oxc_ast_macros = { workspace = true } oxc_estree = { workspace = true } +oxc_estree_tokens = { workspace = true } oxc_napi = { workspace = true } rustc-hash = { workspace = true } diff --git a/napi/parser/src-js/raw-transfer/eager.js b/napi/parser/src-js/raw-transfer/eager.js index 2c635e3bbf6cc..c9a9428064d09 100644 --- a/napi/parser/src-js/raw-transfer/eager.js +++ b/napi/parser/src-js/raw-transfer/eager.js @@ -1,4 +1,5 @@ import { createRequire } from "node:module"; +import { TOKENS_OFFSET_POS_32, TOKENS_LEN_POS_32 } from "../generated/constants.js"; import { isJsAst, parseAsyncRawImpl, parseSyncRawImpl, returnBufferToCache } from "./common.js"; const require = createRequire(import.meta.url); @@ -97,11 +98,34 @@ function deserialize(buffer, sourceText, sourceByteLen, options) { } } + // Deserialize tokens + const tokens = options.experimentalTokens ? deserializeTokens(buffer, sourceText, isJs) : null; + // Return buffer to cache, to be reused returnBufferToCache(buffer); // We cannot lazily deserialize in the getters, because the buffer might be re-used to parse // another file before the getter is called + if (tokens !== null) { + return { + get program() { + return data.program; + }, + get module() { + return data.module; + }, + get comments() { + return data.comments; + }, + get tokens() { + return tokens; + }, + get errors() { + return data.errors; + }, + }; + } + return { get program() { return data.program; @@ -117,3 +141,109 @@ function deserialize(buffer, sourceText, sourceByteLen, options) { }, }; } + +// `ESTreeKind` discriminants (set by Rust side) +const PRIVATE_IDENTIFIER_KIND = 2; +const REGEXP_KIND = 8; + +// Indexed by `ESTreeKind` discriminant (matches `ESTreeKind` enum in `estree_kind.rs`) +const TOKEN_TYPES = [ + "Identifier", + "Keyword", + "PrivateIdentifier", + "Punctuator", + "Numeric", + "String", + "Boolean", + "Null", + "RegularExpression", + "Template", + "JSXText", + "JSXIdentifier", +]; + +// Details of Rust `Token` type +const TOKEN_SIZE = 16; +const KIND_FIELD_OFFSET = 8; +const IS_ESCAPED_FIELD_OFFSET = 10; + +/** + * Deserialize tokens from buffer. + * @param {Uint8Array} buffer - Buffer containing AST in raw form + * @param {string} sourceText - Source for the file + * @param {boolean} isJs - `true` if parsing in JS mode + * @returns {Object[]} - Array of token objects + */ +function deserializeTokens(buffer, sourceText, isJs) { + const { uint32 } = buffer; + + let pos = uint32[TOKENS_OFFSET_POS_32]; + const len = uint32[TOKENS_LEN_POS_32]; + const endPos = pos + len * TOKEN_SIZE; + + const tokens = []; + while (pos < endPos) { + tokens.push(deserializeToken(pos, buffer, sourceText, isJs)); + pos += TOKEN_SIZE; + } + return tokens; +} + +/** + * Deserialize a token from buffer at position `pos`. + * @param {number} pos - Position in buffer containing Rust `Token` type + * @param {Uint8Array} buffer - Buffer containing AST in raw form + * @param {string} sourceText - Source for the file + * @param {boolean} isJs - `true` if parsing in JS mode + * @returns {Object} - Token object + */ +function deserializeToken(pos, buffer, sourceText, isJs) { + const { uint32 } = buffer; + + const pos32 = pos >> 2; + const start = uint32[pos32], + end = uint32[pos32 + 1]; + + let value = sourceText.slice(start, end); + + const kind = buffer[pos + KIND_FIELD_OFFSET]; + + if (kind === REGEXP_KIND) { + const patternEnd = value.lastIndexOf("/"); + return { + type: "RegularExpression", + value, + regex: { + pattern: value.slice(1, patternEnd), + flags: value.slice(patternEnd + 1), + }, + start, + end, + }; + } + + // Strip leading `#` from private identifiers + if (kind === PRIVATE_IDENTIFIER_KIND) value = value.slice(1); + + // Unescape identifiers, keywords, and private identifiers in JS mode + if (isJs && kind <= PRIVATE_IDENTIFIER_KIND && buffer[pos + IS_ESCAPED_FIELD_OFFSET] === 1) { + value = unescapeIdentifier(value); + } + + return { type: TOKEN_TYPES[kind], value, start, end }; +} + +/** + * Unescape an identifier. + * + * We do this on JS side, because escaped identifiers are so extremely rare that this function + * is never called in practice anyway. + * + * @param {string} name - Identifier name to unescape + * @returns {string} - Unescaped identifier name + */ +function unescapeIdentifier(name) { + return name.replace(/\\u(?:\{([0-9a-fA-F]+)\}|([0-9a-fA-F]{4}))/g, (_, hex1, hex2) => + String.fromCodePoint(parseInt(hex1 ?? hex2, 16)), + ); +} diff --git a/napi/parser/src/lib.rs b/napi/parser/src/lib.rs index 8eea50f084509..fc7d02d2cc294 100644 --- a/napi/parser/src/lib.rs +++ b/napi/parser/src/lib.rs @@ -5,7 +5,7 @@ use napi_derive::napi; use oxc::{ allocator::Allocator, - parser::{ParseOptions, Parser, ParserReturn}, + parser::{ParseOptions, Parser, ParserReturn, config::RuntimeParserConfig}, semantic::SemanticBuilder, span::SourceType, }; @@ -85,6 +85,7 @@ fn parse_impl<'a>( preserve_parens: options.preserve_parens.unwrap_or(true), ..ParseOptions::default() }) + .with_config(RuntimeParserConfig::new(options.tokens.unwrap_or(false))) .parse() } diff --git a/napi/parser/src/raw_transfer.rs b/napi/parser/src/raw_transfer.rs index aff87c3810607..461aebf590775 100644 --- a/napi/parser/src/raw_transfer.rs +++ b/napi/parser/src/raw_transfer.rs @@ -15,6 +15,7 @@ use oxc::{ ast_visit::utf8_to_utf16::Utf8ToUtf16, semantic::SemanticBuilder, }; +use oxc_estree_tokens::{ESTreeTokenOptions, update_tokens}; use oxc_napi::get_source_type; use crate::{ @@ -214,9 +215,9 @@ unsafe fn parse_raw_impl( let options = options.unwrap_or_default(); let source_type = get_source_type(filename, options.lang.as_deref(), options.source_type.as_deref()); - let ast_type = get_ast_type(source_type, &options); + let is_ts = get_ast_type(source_type, &options) == AstType::TypeScript; - let data_ptr = { + let (data_offset, tokens_offset, tokens_len) = { // SAFETY: We checked above that `source_len` does not exceed length of buffer let source_text = unsafe { buffer.get_unchecked(..source_len) }; // SAFETY: Caller guarantees source occupies this region of the buffer and is valid UTF-8 @@ -251,8 +252,22 @@ unsafe fn parse_raw_impl( ArenaVec::new_in(&allocator) }; - // Convert spans to UTF-16 + // Convert tokens let span_converter = Utf8ToUtf16::new(source_text); + + let (tokens_offset, tokens_len) = if options.tokens == Some(true) { + let mut tokens = ret.tokens; + update_tokens(&mut tokens, &program, &span_converter, ESTreeTokenOptions::new(is_ts)); + + let tokens_offset = tokens.as_ptr() as u32; + #[expect(clippy::cast_possible_truncation)] + let tokens_len = tokens.len() as u32; + (tokens_offset, tokens_len) + } else { + (0, 0) + }; + + // Convert spans to UTF-16 span_converter.convert_program(&mut program); span_converter.convert_comments(&mut comments); span_converter.convert_module_record(&mut module_record); @@ -270,12 +285,13 @@ unsafe fn parse_raw_impl( // Write `RawTransferData` to arena, and return pointer to it let data = RawTransferData { program, comments, module, errors }; let data = allocator.alloc(data); - ptr::from_ref(data).cast::() + let data_offset = ptr::from_ref(data).cast::() as u32; + + (data_offset, tokens_offset, tokens_len) }; // Write metadata into end of buffer - #[allow(clippy::cast_possible_truncation)] - let metadata = RawTransferMetadata::new(data_ptr as u32, ast_type == AstType::TypeScript); + let metadata = RawTransferMetadata::new(data_offset, is_ts, tokens_offset, tokens_len); const RAW_METADATA_OFFSET: usize = BUFFER_SIZE - RAW_METADATA_SIZE; const _: () = assert!(RAW_METADATA_OFFSET.is_multiple_of(BUMP_ALIGN)); // SAFETY: `RAW_METADATA_OFFSET` is less than length of `buffer`. diff --git a/napi/parser/src/raw_transfer_types.rs b/napi/parser/src/raw_transfer_types.rs index 6a7e6f28f3a15..1475f03d33051 100644 --- a/napi/parser/src/raw_transfer_types.rs +++ b/napi/parser/src/raw_transfer_types.rs @@ -49,8 +49,8 @@ pub struct RawTransferMetadata { } impl RawTransferMetadata { - pub fn new(data_offset: u32, is_ts: bool) -> Self { - Self { data_offset, is_ts, is_jsx: false, has_bom: false, tokens_offset: 0, tokens_len: 0 } + pub fn new(data_offset: u32, is_ts: bool, tokens_offset: u32, tokens_len: u32) -> Self { + Self { data_offset, is_ts, is_jsx: false, has_bom: false, tokens_offset, tokens_len } } } diff --git a/napi/parser/src/types.rs b/napi/parser/src/types.rs index ec9f402fff2dc..443dd954286c0 100644 --- a/napi/parser/src/types.rs +++ b/napi/parser/src/types.rs @@ -31,6 +31,14 @@ pub struct ParserOptions { #[napi(ts_type = "boolean")] pub range: Option, + /// Controls whether parser should return tokens. + /// + /// This option is not stable yet, and only available with experimental raw transfer. + /// + /// @default false + #[napi(skip_typescript, js_name = "experimentalTokens")] + pub tokens: Option, + /// Emit `ParenthesizedExpression` and `TSParenthesizedType` in AST. /// /// If this option is true, parenthesized expressions are represented by diff --git a/napi/parser/test/parse-raw-common.ts b/napi/parser/test/parse-raw-common.ts index 392697ac25a8e..c46be27b52b94 100644 --- a/napi/parser/test/parse-raw-common.ts +++ b/napi/parser/test/parse-raw-common.ts @@ -10,8 +10,9 @@ export const TEST_TYPE_INLINE_FIXTURE = 4; export const TEST_TYPE_MAIN_MASK = 7; export const TEST_TYPE_RANGE_PARENT = 8; -export const TEST_TYPE_LAZY = 16; -export const TEST_TYPE_PRETTY = 32; +export const TEST_TYPE_TOKENS = 16; +export const TEST_TYPE_LAZY = 32; +export const TEST_TYPE_PRETTY = 64; export const ROOT_DIR_PATH = pathJoin(import.meta.dirname, "../../.."); export const TARGET_DIR_PATH = pathJoin(ROOT_DIR_PATH, "target"); @@ -23,6 +24,10 @@ export const ACORN_TEST262_DIR_PATH = pathJoin( ROOT_DIR_PATH, "tasks/coverage/estree-conformance/tests/test262/test", ); +export const ACORN_TEST262_TOKENS_DIR_PATH = pathJoin( + ROOT_DIR_PATH, + "tasks/coverage/estree-conformance/tests/test262-tokens/test", +); export const JSX_SHORT_DIR_PATH = "tasks/coverage/estree-conformance/tests/acorn-jsx/pass"; export const JSX_DIR_PATH = pathJoin(ROOT_DIR_PATH, JSX_SHORT_DIR_PATH); const TS_ESTREE_SHORT_DIR_PATH = "tasks/coverage/estree-conformance/tests/typescript"; diff --git a/napi/parser/test/parse-raw-worker.ts b/napi/parser/test/parse-raw-worker.ts index c9a0f2ca85050..413e54fe545d7 100644 --- a/napi/parser/test/parse-raw-worker.ts +++ b/napi/parser/test/parse-raw-worker.ts @@ -6,6 +6,7 @@ import { basename, join as pathJoin } from "node:path"; import { parseSync } from "./parser.ts"; import { ACORN_TEST262_DIR_PATH, + ACORN_TEST262_TOKENS_DIR_PATH, JSX_DIR_PATH, ROOT_DIR_PATH, TEST262_DIR_PATH, @@ -16,6 +17,7 @@ import { TEST_TYPE_MAIN_MASK, TEST_TYPE_PRETTY, TEST_TYPE_RANGE_PARENT, + TEST_TYPE_TOKENS, TEST_TYPE_TEST262, TEST_TYPE_TS, TS_DIR_PATH, @@ -46,19 +48,20 @@ export async function runCase( expect: ExpectFunction, ): Promise { const rangeParent = (type & TEST_TYPE_RANGE_PARENT) !== 0, + tokens = (type & TEST_TYPE_TOKENS) !== 0, lazy = (type & TEST_TYPE_LAZY) !== 0, pretty = (type & TEST_TYPE_PRETTY) !== 0; type &= TEST_TYPE_MAIN_MASK; switch (type) { case TEST_TYPE_TEST262: - await runTest262Case(props as string, rangeParent, lazy, expect); + await runTest262Case(props as string, rangeParent, tokens, lazy, expect); break; case TEST_TYPE_JSX: - await runJsxCase(props as string, rangeParent, lazy, expect); + await runJsxCase(props as string, rangeParent, tokens, lazy, expect); break; case TEST_TYPE_TS: - await runTsCase(props as string, rangeParent, lazy, expect); + await runTsCase(props as string, rangeParent, tokens, lazy, expect); break; case TEST_TYPE_FIXTURE: await runFixture(props as string, rangeParent, lazy, pretty, expect); @@ -81,6 +84,7 @@ export async function runCase( async function runTest262Case( path: string, rangeParent: boolean, + tokens: boolean, lazy: boolean, expect: ExpectFunction, ): Promise { @@ -96,6 +100,21 @@ async function runTest262Case( testRangeParent(filename, sourceText, { sourceType }, expect); return; } + + if (tokens) { + // Some fixtures have no tokens JSON file because Espree can't parse them. Skip them. + let expectedJson: string; + try { + expectedJson = await readFile(pathJoin(ACORN_TEST262_TOKENS_DIR_PATH, `${path}on`), "utf8"); + } catch (err) { + if ((err as NodeJS.ErrnoException)?.code !== "ENOENT") throw err; + return; + } + + testTokens(filename, sourceText, expectedJson, { sourceType }, expect); + return; + } + if (lazy) { testLazy(filename, sourceText, { sourceType }); return; @@ -113,6 +132,7 @@ async function runTest262Case( async function runJsxCase( filename: string, rangeParent: boolean, + tokens: boolean, lazy: boolean, expect: ExpectFunction, ): Promise { @@ -129,6 +149,14 @@ async function runJsxCase( testRangeParent(filename, sourceText, { sourceType }, expect); return; } + + if (tokens) { + const tokensJsonPath = sourcePath.slice(0, -3) + "tokens.json"; // `.jsx` -> `.tokens.json` + const expectedJson = await readFile(tokensJsonPath, "utf8"); + testTokens(filename, sourceText, expectedJson, { sourceType }, expect); + return; + } + if (lazy) { testLazy(filename, sourceText, { sourceType }); return; @@ -146,12 +174,15 @@ async function runJsxCase( const TS_CASE_HEADER = "__ESTREE_TEST__:"; const TS_CASE_HEADER_AST = "AST:\n```json\n"; const TS_CASE_HEADER_AST_LEN = TS_CASE_HEADER_AST.length; +const TS_CASE_HEADER_TOKENS = "TOKENS:\n```json\n"; +const TS_CASE_HEADER_TOKENS_LEN = TS_CASE_HEADER_TOKENS.length; const TS_CASE_FOOTER = "\n```\n"; const TS_CASE_FOOTER_LEN = TS_CASE_FOOTER.length; async function runTsCase( path: string, rangeParent: boolean, + tokens: boolean, lazy: boolean, expect: ExpectFunction, ): Promise { @@ -166,10 +197,15 @@ async function runTsCase( const { tests } = makeUnitsFromTest(tsPath, sourceText); - const estreeJsons = []; + const estreeJsons = [], + tokensJsons = []; for (const part of casesJson.split(TS_CASE_HEADER).slice(1)) { if (part.startsWith(TS_CASE_HEADER_AST)) { estreeJsons.push(part.slice(TS_CASE_HEADER_AST_LEN, -TS_CASE_FOOTER_LEN)); + } else if (part.startsWith(TS_CASE_HEADER_TOKENS)) { + tokensJsons.push(part.slice(TS_CASE_HEADER_TOKENS_LEN, -TS_CASE_FOOTER_LEN)); + } else { + throw new Error("Unexpected test type"); } } @@ -189,6 +225,35 @@ async function runTsCase( testRangeParent(filename, code, options, expect); continue; } + + if (tokens) { + // We can fail to match the TS-ESLint snapshots where there are syntax errors, + // because our parser is not recoverable. + // When fatal error, parser will return an empty program. + // If a test fails, check that a fatal parsing error is the cause, and ignore it if so. + try { + testTokens(filename, code, tokensJsons[i], options, expect); + } catch (err) { + const { program, errors } = parseSync(filename, code, { + ...options, + experimentalRawTransfer: false, + }); + + if ( + errors.length > 0 && + program.start === 0 && + program.end === 0 && + program.body.length === 0 + ) { + // Fatal error + continue; + } + + throw err; + } + continue; + } + if (lazy) { testLazy(filename, code, options); continue; @@ -321,6 +386,25 @@ function testRangeParent( walk(ret.program); } +// Test deserialized tokens match expected JSON. +function testTokens( + filename: string, + sourceText: string, + expectedJson: string, + options: ParserOptions | null, + expect: ExpectFunction, +): void { + const ret = parseSync(filename, sourceText, { + ...options, + experimentalRawTransfer: true, + experimentalTokens: true, + }); + + const { tokens } = ret as any; + const tokensJson = JSON.stringify(tokens, null, 2); + expect(tokensJson).toEqual(expectedJson); +} + // Test lazy deserialization does not throw an error. // We don't test the correctness of the output. function testLazy(filename: string, sourceText: string, options: ParserOptions | null): void { diff --git a/napi/parser/test/parse-raw.test.ts b/napi/parser/test/parse-raw.test.ts index 5da3663535592..8656e399f2dda 100644 --- a/napi/parser/test/parse-raw.test.ts +++ b/napi/parser/test/parse-raw.test.ts @@ -21,6 +21,7 @@ import { TEST_TYPE_LAZY, TEST_TYPE_PRETTY, TEST_TYPE_RANGE_PARENT, + TEST_TYPE_TOKENS, TEST_TYPE_TEST262, TEST_TYPE_TS, TS_ESTREE_DIR_PATH, @@ -38,6 +39,10 @@ const [describeRangeParent, itRangeParent] = isEnabled(env.RUN_RAW_RANGE_TESTS) ? [describe, it] : ((noop) => [noop, noop])(Object.assign(() => {}, { concurrent() {} })); +const [describeTokens, _itTokens] = isEnabled(env.RUN_RAW_TOKENS_TESTS) + ? [describe, it] + : ((noop) => [noop, noop])(Object.assign(() => {}, { concurrent() {} })); + const [describeLazy, itLazy] = isEnabled(env.RUN_LAZY_TESTS) ? [describe, it] : ((noop) => [noop, noop])(Object.assign(() => {}, { concurrent() {} })); @@ -133,6 +138,13 @@ describeRangeParent.concurrent("range & parent test262", () => { ); }); +describeTokens.concurrent("tokens test262", () => { + // oxlint-disable-next-line jest/expect-expect + it.each(test262FixturePaths)("%s", (path) => + runCaseInWorker(TEST_TYPE_TEST262 | TEST_TYPE_TOKENS, path), + ); +}); + // Check lazy deserialization doesn't throw describeLazy.concurrent("lazy test262", () => { // oxlint-disable-next-line jest/expect-expect @@ -162,6 +174,11 @@ describeRangeParent.concurrent("range & parent JSX", () => { ); }); +describeTokens.concurrent("tokens JSX", () => { + // oxlint-disable-next-line jest/expect-expect + it.each(jsxFixturePaths)("%s", (path) => runCaseInWorker(TEST_TYPE_JSX | TEST_TYPE_TOKENS, path)); +}); + // Check lazy deserialization doesn't throw describeLazy.concurrent("lazy JSX", () => { // oxlint-disable-next-line jest/expect-expect @@ -195,6 +212,11 @@ describeRangeParent.concurrent("range & parent TypeScript", () => { ); }); +describeTokens.concurrent("tokens TypeScript", () => { + // oxlint-disable-next-line jest/expect-expect + it.each(tsFixturePaths)("%s", (path) => runCaseInWorker(TEST_TYPE_TS | TEST_TYPE_TOKENS, path)); +}); + // Check lazy deserialization doesn't throw describeLazy.concurrent("lazy TypeScript", () => { // oxlint-disable-next-line jest/expect-expect diff --git a/napi/parser/test/parser.ts b/napi/parser/test/parser.ts index 2e124a072d3d2..8c0d4280113b6 100644 --- a/napi/parser/test/parser.ts +++ b/napi/parser/test/parser.ts @@ -12,6 +12,7 @@ export type * from "../src-js/index.js"; interface ExperimentalParserOptions { experimentalRawTransfer?: boolean; experimentalParent?: boolean; + experimentalTokens?: boolean; experimentalLazy?: boolean; } diff --git a/napi/parser/vitest.config.ts b/napi/parser/vitest.config.ts index fa78329b1f929..854766e3ad3ca 100644 --- a/napi/parser/vitest.config.ts +++ b/napi/parser/vitest.config.ts @@ -5,7 +5,10 @@ const isEnabled = (envValue) => envValue === "true" || envValue === "1"; const runLazyTests = isEnabled(env.RUN_LAZY_TESTS); let runRawTests = - runLazyTests || isEnabled(env.RUN_RAW_TESTS) || isEnabled(env.RUN_RAW_RANGE_TESTS); + runLazyTests || + isEnabled(env.RUN_RAW_TESTS) || + isEnabled(env.RUN_RAW_RANGE_TESTS) || + isEnabled(env.RUN_RAW_TOKENS_TESTS); // Raw tests use `tinypool`, which doesn't seem to work on Windows with Vitest // Ref: https://github.com/vitest-dev/vitest/issues/8201