From fdfec2112863ab2a2e994433f7be69886dad4bef Mon Sep 17 00:00:00 2001 From: overlookmotel <557937+overlookmotel@users.noreply.github.com> Date: Wed, 13 Aug 2025 15:33:20 +0000 Subject: [PATCH] refactor(lexer): simplify byte handler macros (#13057) Pure refactor. Simplify the byte handler macros in lexer in 3 ways: #### 1. Remove the `const BLAH: ByteHandler = { ... };` wrappers from generated code I don't think they're necessary, and I'm not sure why I put them there in the first place. Before: ```rs const UNI: ByteHandler = { #[expect(non_snake_case)] fn UNI(lexer: &mut Lexer) -> Kind { lexer.unicode_char_handler() } UNI }; ``` After: ```rs #[expect(non_snake_case)] fn UNI(lexer: &mut Lexer) -> Kind { lexer.unicode_char_handler() } ``` Each byte handler still has a useful name in CodSpeed flame graphs after this change. #### 2. Remove the `byte_handler!` macro. `ascii_byte_handler!` and `ascii_identifier_handler!` macros remain, but they no longer use `byte_handler!` macro internally. Removing the macro-within-macro pattern may make this code easier for AI to understand (and probably ditto for humans!). #### 3. Shorten macro expansion Hoist `use oxc_data_structures::assert_unchecked;` to top of file, so it doesn't need to be repeated in each macro expansion. --- crates/oxc_parser/src/lexer/byte_handlers.rs | 112 +++++++------------ 1 file changed, 38 insertions(+), 74 deletions(-) diff --git a/crates/oxc_parser/src/lexer/byte_handlers.rs b/crates/oxc_parser/src/lexer/byte_handlers.rs index e2c0b0a7853fa..901806bc7ad19 100644 --- a/crates/oxc_parser/src/lexer/byte_handlers.rs +++ b/crates/oxc_parser/src/lexer/byte_handlers.rs @@ -1,3 +1,5 @@ +use oxc_data_structures::assert_unchecked; + use crate::diagnostics; use super::{Kind, Lexer}; @@ -41,50 +43,14 @@ static BYTE_HANDLERS: [ByteHandler; 256] = [ UNI, UNI, UNI, UNI, UNI, UER, UER, UER, UER, UER, UER, UER, UER, UER, UER, UER, // F ]; -/// Macro for defining a byte handler. -/// -/// Use `ascii_byte_handler!` macro for ASCII characters, which adds optimizations for ASCII. -/// -/// Handlers are defined as functions instead of closures, so they have names in flame graphs. -/// -/// ``` -/// byte_handler!(UNI(lexer) { -/// lexer.unicode_char_handler() -/// }); -/// ``` -/// -/// expands to: -/// -/// ``` -/// const UNI: ByteHandler = { -/// #[expect(non_snake_case)] -/// fn UNI(lexer: &mut Lexer) -> Kind { -/// lexer.unicode_char_handler() -/// } -/// UNI -/// }; -/// ``` -macro_rules! byte_handler { - ($id:ident($lex:ident) $body:expr) => { - const $id: ByteHandler = { - #[expect(non_snake_case)] - fn $id($lex: &mut Lexer) -> Kind { - $body - } - $id - }; - }; -} - /// Macro for defining byte handler for an ASCII character. /// -/// In addition to defining a `const` for the handler, it also asserts that lexer -/// is not at end of file, and that next char is ASCII. +/// Asserts that lexer is not at end of file, and that next char is ASCII. /// Where the handler is for an ASCII character, these assertions are self-evidently true. /// /// These assertions produce no runtime code, but hint to the compiler that it can assume that /// next char is ASCII, and it uses that information to optimize the rest of the handler. -/// e.g. `lexer.consume_char()` becomes just a single assembler instruction. +/// e.g. `lexer.consume_char()` becomes just a single assembly instruction. /// Without the assertions, the compiler is unable to deduce the next char is ASCII, due to /// the indirection of the `BYTE_HANDLERS` jump table. /// @@ -95,42 +61,38 @@ macro_rules! byte_handler { /// /// ``` /// ascii_byte_handler!(SPS(lexer) { -/// lexer.consume_char(); -/// Kind::WhiteSpace +/// lexer.consume_char(); +/// Kind::WhiteSpace /// }); /// ``` /// /// expands to: /// /// ``` -/// const SPS: ByteHandler = { -/// #[expect(non_snake_case)] -/// fn SPS(lexer: &mut Lexer) { +/// #[expect(non_snake_case)] +/// fn SPS(lexer: &mut Lexer) { /// // SAFETY: This macro is only used for ASCII characters /// unsafe { -/// use oxc_data_structures::assert_unchecked; -/// assert_unchecked!(!lexer.source.is_eof()); -/// assert_unchecked!(lexer.source.peek_byte_unchecked() < 128); +/// assert_unchecked!(!lexer.source.is_eof()); +/// assert_unchecked!(lexer.source.peek_byte_unchecked() < 128); /// } /// { -/// lexer.consume_char(); -/// Kind::WhiteSpace +/// lexer.consume_char(); +/// Kind::WhiteSpace /// } -/// } -/// SPS -/// }; +/// } /// ``` macro_rules! ascii_byte_handler { ($id:ident($lex:ident) $body:expr) => { - byte_handler!($id($lex) { + #[expect(non_snake_case)] + fn $id($lex: &mut Lexer) -> Kind { // SAFETY: This macro is only used for ASCII characters unsafe { - use oxc_data_structures::assert_unchecked; assert_unchecked!(!$lex.source.is_eof()); assert_unchecked!($lex.source.peek_byte_unchecked() < 128); } $body - }); + } }; } @@ -148,36 +110,34 @@ macro_rules! ascii_byte_handler { /// /// ``` /// ascii_identifier_handler!(L_G(id_without_first_char) match id_without_first_char { -/// "et" => Kind::Get, -/// "lobal" => Kind::Global, -/// _ => Kind::Ident, +/// "et" => Kind::Get, +/// "lobal" => Kind::Global, +/// _ => Kind::Ident, /// }); /// ``` /// /// expands to: /// /// ``` -/// const L_G: ByteHandler = { -/// #[expect(non_snake_case)] -/// fn L_G(lexer: &mut Lexer) -> Kind { +/// #[expect(non_snake_case)] +/// fn L_G(lexer: &mut Lexer) -> Kind { /// // SAFETY: This macro is only used for ASCII characters /// let id_without_first_char = unsafe { lexer.identifier_name_handler() }; /// match id_without_first_char { -/// "et" => Kind::Get, -/// "lobal" => Kind::Global, -/// _ => Kind::Ident, +/// "et" => Kind::Get, +/// "lobal" => Kind::Global, +/// _ => Kind::Ident, /// } -/// } -/// L_G -/// }; +/// } /// ``` macro_rules! ascii_identifier_handler { ($id:ident($str:ident) $body:expr) => { - byte_handler!($id(lexer) { + #[expect(non_snake_case)] + fn $id(lexer: &mut Lexer) -> Kind { // SAFETY: This macro is only used for ASCII characters let $str = unsafe { lexer.identifier_name_handler() }; $body - }); + } }; } @@ -687,17 +647,21 @@ ascii_identifier_handler!(L_Y(id_without_first_char) match id_without_first_char }); // Non-ASCII characters. -// NB: Must not use `ascii_byte_handler!` macro, as this handler is for non-ASCII chars. -byte_handler!(UNI(lexer) { +// +// Note: Must not use `ascii_byte_handler!` macro, as this handler is for non-ASCII chars. +#[expect(non_snake_case)] +fn UNI(lexer: &mut Lexer) -> Kind { lexer.unicode_char_handler() -}); +} // UTF-8 continuation bytes (0x80 - 0xBF) (i.e. middle of a multi-byte UTF-8 sequence) // + and byte values which are not legal in UTF-8 strings (0xC0, 0xC1, 0xF5 - 0xFF). // `handle_byte()` should only be called with 1st byte of a valid UTF-8 character, // so something has gone wrong if we get here. // https://datatracker.ietf.org/doc/html/rfc3629 -// NB: Must not use `ascii_byte_handler!` macro, as this handler is for non-ASCII bytes. -byte_handler!(UER(_lexer) { +// +// Note: Must not use `ascii_byte_handler!` macro, as this handler is for non-ASCII bytes. +#[expect(non_snake_case)] +fn UER(_lexer: &mut Lexer) -> Kind { unreachable!(); -}); +}