diff --git a/crates/oxc_parser/src/lexer/identifier.rs b/crates/oxc_parser/src/lexer/identifier.rs index c08957cddede4..00352051df9dc 100644 --- a/crates/oxc_parser/src/lexer/identifier.rs +++ b/crates/oxc_parser/src/lexer/identifier.rs @@ -8,6 +8,8 @@ use oxc_syntax::identifier::{ use crate::diagnostics; +use oxc_span::IncrementalIdentHasher; + use super::{ Kind, Lexer, SourcePosition, cold_branch, search::{SafeByteMatchTable, byte_search, safe_byte_match_table}, @@ -48,16 +50,26 @@ impl<'a> Lexer<'a> { /// * `self.source` must not be exhausted (at least 1 char remaining). /// * Next char must be ASCII. pub(super) unsafe fn identifier_name_handler(&mut self) -> &'a str { + // Reset hasher and hash the first byte. + // SAFETY: Caller guarantees not at EOF. + let first_byte = unsafe { self.source.position().read() }; + self.identifier_hasher = IncrementalIdentHasher::new(); + self.identifier_hasher.write_byte(first_byte); + // Advance past 1st byte. // SAFETY: Caller guarantees not at EOF, and next byte is ASCII. let after_first = unsafe { self.source.position().add(1) }; - // Consume bytes which are part of identifier + // Consume bytes which are part of identifier, hashing as we go let next_byte = byte_search! { lexer: self, table: NOT_ASCII_ID_CONTINUE_TABLE, start: after_first, + hash_identifier: true, handle_eof: { + // Hash remaining bytes before returning + let remaining = self.source.str_from_pos_to_current(after_first); + self.identifier_hasher.write_bytes(remaining.as_bytes()); // Return identifier minus its first char. // SAFETY: `lexer.source` is positioned at EOF, so there is no valid value // of `after_first` which could be after current position. @@ -74,7 +86,7 @@ impl<'a> Lexer<'a> { // SAFETY: `after_first` is position after consuming 1 byte, so subtracting 1 // makes `start_pos` `source`'s position as it was at start of this function let start_pos = unsafe { after_first.sub(1) }; - &self.identifier_tail_unicode(start_pos)[1..] + self.identifier_tail_unicode_with_hash(start_pos) }); } if next_byte == b'\\' { @@ -82,7 +94,7 @@ impl<'a> Lexer<'a> { // SAFETY: `after_first` is position after consuming 1 byte, so subtracting 1 // makes `start_pos` `source`'s position as it was at start of this function let start_pos = unsafe { after_first.sub(1) }; - &self.identifier_backslash(start_pos, false)[1..] + self.identifier_backslash_with_hash(start_pos) }); } @@ -93,6 +105,29 @@ impl<'a> Lexer<'a> { unsafe { self.source.str_from_pos_to_current_unchecked(after_first) } } + /// Handle rest of identifier after first byte of a multi-byte Unicode char found. + /// Continues hashing from current position. Returns identifier minus its first char. + fn identifier_tail_unicode_with_hash(&mut self, start_pos: SourcePosition<'a>) -> &'a str { + // Save position - bytes before this were already hashed by byte_search + let hash_start = self.source.position(); + let id = self.identifier_tail_unicode(start_pos); + // Hash only the new bytes (from unicode char onwards) + let new_bytes = self.source.str_from_pos_to_current(hash_start); + self.identifier_hasher.write_bytes(new_bytes.as_bytes()); + &id[1..] + } + + /// Handle rest of identifier after a `\` escape is found. + /// Must recompute hash because escape sequences decode to different bytes. + /// Returns identifier minus its first char. + fn identifier_backslash_with_hash(&mut self, start_pos: SourcePosition<'a>) -> &'a str { + let id = self.identifier_backslash(start_pos, false); + // Must recompute: source has `\u0041` but string has `A` + self.identifier_hasher = IncrementalIdentHasher::new(); + self.identifier_hasher.write_bytes(id.as_bytes()); + &id[1..] + } + /// Handle rest of identifier after first byte of a multi-byte Unicode char found. /// Any number of characters can have already been consumed from `self.source` prior to it. /// `self.source` should be positioned at start of Unicode character. @@ -140,6 +175,11 @@ impl<'a> Lexer<'a> { // Process escape and get rest of identifier let id = self.identifier_on_backslash(str, true); + + // Hash the unescaped identifier + self.identifier_hasher = IncrementalIdentHasher::new(); + self.identifier_hasher.write_bytes(id.as_bytes()); + Kind::match_keyword(id) } diff --git a/crates/oxc_parser/src/lexer/mod.rs b/crates/oxc_parser/src/lexer/mod.rs index 3a98ade077660..c30311428944b 100644 --- a/crates/oxc_parser/src/lexer/mod.rs +++ b/crates/oxc_parser/src/lexer/mod.rs @@ -10,7 +10,7 @@ use rustc_hash::FxHashMap; use oxc_allocator::Allocator; use oxc_ast::ast::RegExpFlags; use oxc_diagnostics::OxcDiagnostic; -use oxc_span::{SourceType, Span}; +use oxc_span::{IncrementalIdentHasher, SourceType, Span}; use crate::{UniquePromise, diagnostics}; @@ -47,6 +47,7 @@ pub struct LexerCheckpoint<'a> { errors_snapshot: ErrorSnapshot, has_pure_comment: bool, has_no_side_effects_comment: bool, + identifier_hasher: IncrementalIdentHasher, } #[derive(Debug, Clone)] @@ -95,6 +96,10 @@ pub struct Lexer<'a> { /// `memchr` Finder for end of multi-line comments. Created lazily when first used. multi_line_comment_end_finder: Option>, + + /// Incremental hasher for current identifier. + /// Used to compute hash during lexing for efficient `Ident` creation. + pub(crate) identifier_hasher: IncrementalIdentHasher, } impl<'a> Lexer<'a> { @@ -124,6 +129,7 @@ impl<'a> Lexer<'a> { escaped_strings: FxHashMap::default(), escaped_templates: FxHashMap::default(), multi_line_comment_end_finder: None, + identifier_hasher: IncrementalIdentHasher::new(), } } @@ -165,6 +171,7 @@ impl<'a> Lexer<'a> { errors_snapshot, has_pure_comment: self.trivia_builder.has_pure_comment, has_no_side_effects_comment: self.trivia_builder.has_no_side_effects_comment, + identifier_hasher: self.identifier_hasher, } } @@ -182,6 +189,7 @@ impl<'a> Lexer<'a> { errors_snapshot, has_pure_comment: self.trivia_builder.has_pure_comment, has_no_side_effects_comment: self.trivia_builder.has_no_side_effects_comment, + identifier_hasher: self.identifier_hasher, } } @@ -196,6 +204,7 @@ impl<'a> Lexer<'a> { self.token = checkpoint.token; self.trivia_builder.has_pure_comment = checkpoint.has_pure_comment; self.trivia_builder.has_no_side_effects_comment = checkpoint.has_no_side_effects_comment; + self.identifier_hasher = checkpoint.identifier_hasher; } pub fn peek_token(&mut self) -> Token { diff --git a/crates/oxc_parser/src/lexer/search.rs b/crates/oxc_parser/src/lexer/search.rs index 1430e4c83c98d..a5d7d2bfcb115 100644 --- a/crates/oxc_parser/src/lexer/search.rs +++ b/crates/oxc_parser/src/lexer/search.rs @@ -414,6 +414,29 @@ macro_rules! byte_search { } }; + // With provided `start` position and identifier hashing. + // Delegates to main implementation, then hashes the scanned bytes. + ( + lexer: $lexer:ident, + table: $table:ident, + start: $start:ident, + hash_identifier: true, + handle_eof: $eof_handler:expr, + ) => {{ + let hash_start = $start; + let result = byte_search! { + lexer: $lexer, + table: $table, + start: $start, + continue_if: (byte, pos) false, + handle_eof: $eof_handler, + }; + // Hash the bytes that were scanned (from start to current position) + let scanned = $lexer.source.str_from_pos_to_current(hash_start); + $lexer.identifier_hasher.write_bytes(scanned.as_bytes()); + result + }}; + // Actual implementation - with both `start` and `continue_if` ( lexer: $lexer:ident, diff --git a/crates/oxc_parser/src/lexer/string.rs b/crates/oxc_parser/src/lexer/string.rs index ded5fcbe31855..a3d5985d790ac 100644 --- a/crates/oxc_parser/src/lexer/string.rs +++ b/crates/oxc_parser/src/lexer/string.rs @@ -277,6 +277,8 @@ impl<'a> Lexer<'a> { /// Get the current identifier with precomputed hash. #[inline] pub(crate) fn get_ident(&self, token: Token) -> Ident<'a> { - Ident::new(self.get_string(token)) + let s = self.get_string(token); + let hash = self.identifier_hasher.finish(); + Ident::new_with_hash(s, hash) } } diff --git a/crates/oxc_parser/src/lexer/unicode.rs b/crates/oxc_parser/src/lexer/unicode.rs index 6e2dbf7e4fd82..36d6a161b2157 100644 --- a/crates/oxc_parser/src/lexer/unicode.rs +++ b/crates/oxc_parser/src/lexer/unicode.rs @@ -12,6 +12,8 @@ use oxc_syntax::{ line_terminator::{CR, LF, LS, PS, is_irregular_line_terminator}, }; +use oxc_span::IncrementalIdentHasher; + use super::{Kind, Lexer, Span}; /// A Unicode escape sequence. @@ -36,7 +38,10 @@ impl<'a> Lexer<'a> { c if is_identifier_start_unicode(c) => { let start_pos = self.source.position(); self.consume_char(); - self.identifier_tail_after_unicode(start_pos); + let id = self.identifier_tail_after_unicode(start_pos); + // Hash the full identifier for get_ident() + self.identifier_hasher = IncrementalIdentHasher::new(); + self.identifier_hasher.write_bytes(id.as_bytes()); Kind::Ident } c if is_irregular_whitespace(c) => self.handle_irregular_whitespace(c), diff --git a/crates/oxc_span/src/lib.rs b/crates/oxc_span/src/lib.rs index d6ce862f2deb9..73f6d76995c88 100644 --- a/crates/oxc_span/src/lib.rs +++ b/crates/oxc_span/src/lib.rs @@ -11,7 +11,8 @@ mod span; pub use cmp::ContentEq; pub use oxc_str::{ ArenaIdentHashMap, Atom, CompactStr, Ident, IdentHashMap, IdentHashSet, IdentHasher, - MAX_INLINE_LEN as ATOM_MAX_INLINE_LEN, format_atom, format_compact_str, format_ident, + IncrementalIdentHasher, MAX_INLINE_LEN as ATOM_MAX_INLINE_LEN, format_atom, format_compact_str, + format_ident, }; pub use source_type::{ FileExtension, Language, LanguageVariant, ModuleKind, SourceType, UnknownExtension,