Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
46 changes: 43 additions & 3 deletions crates/oxc_parser/src/lexer/identifier.rs
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,8 @@ use oxc_syntax::identifier::{

use crate::diagnostics;

use oxc_span::IncrementalIdentHasher;

use super::{
Kind, Lexer, SourcePosition, cold_branch,
search::{SafeByteMatchTable, byte_search, safe_byte_match_table},
Expand Down Expand Up @@ -48,16 +50,26 @@ impl<'a> Lexer<'a> {
/// * `self.source` must not be exhausted (at least 1 char remaining).
/// * Next char must be ASCII.
pub(super) unsafe fn identifier_name_handler(&mut self) -> &'a str {
// Reset hasher and hash the first byte.
// SAFETY: Caller guarantees not at EOF.
let first_byte = unsafe { self.source.position().read() };
self.identifier_hasher = IncrementalIdentHasher::new();
self.identifier_hasher.write_byte(first_byte);

// Advance past 1st byte.
// SAFETY: Caller guarantees not at EOF, and next byte is ASCII.
let after_first = unsafe { self.source.position().add(1) };

// Consume bytes which are part of identifier
// Consume bytes which are part of identifier, hashing as we go
let next_byte = byte_search! {
lexer: self,
table: NOT_ASCII_ID_CONTINUE_TABLE,
start: after_first,
hash_identifier: true,
handle_eof: {
// Hash remaining bytes before returning
let remaining = self.source.str_from_pos_to_current(after_first);
self.identifier_hasher.write_bytes(remaining.as_bytes());
// Return identifier minus its first char.
// SAFETY: `lexer.source` is positioned at EOF, so there is no valid value
// of `after_first` which could be after current position.
Expand All @@ -74,15 +86,15 @@ impl<'a> Lexer<'a> {
// SAFETY: `after_first` is position after consuming 1 byte, so subtracting 1
// makes `start_pos` `source`'s position as it was at start of this function
let start_pos = unsafe { after_first.sub(1) };
&self.identifier_tail_unicode(start_pos)[1..]
self.identifier_tail_unicode_with_hash(start_pos)
});
}
if next_byte == b'\\' {
return cold_branch(|| {
// SAFETY: `after_first` is position after consuming 1 byte, so subtracting 1
// makes `start_pos` `source`'s position as it was at start of this function
let start_pos = unsafe { after_first.sub(1) };
&self.identifier_backslash(start_pos, false)[1..]
self.identifier_backslash_with_hash(start_pos)
});
}

Expand All @@ -93,6 +105,29 @@ impl<'a> Lexer<'a> {
unsafe { self.source.str_from_pos_to_current_unchecked(after_first) }
}

/// Handle rest of identifier after first byte of a multi-byte Unicode char found.
/// Continues hashing from current position. Returns identifier minus its first char.
fn identifier_tail_unicode_with_hash(&mut self, start_pos: SourcePosition<'a>) -> &'a str {
// Save position - bytes before this were already hashed by byte_search
let hash_start = self.source.position();
let id = self.identifier_tail_unicode(start_pos);
// Hash only the new bytes (from unicode char onwards)
let new_bytes = self.source.str_from_pos_to_current(hash_start);
self.identifier_hasher.write_bytes(new_bytes.as_bytes());
&id[1..]
}

/// Handle rest of identifier after a `\` escape is found.
/// Must recompute hash because escape sequences decode to different bytes.
/// Returns identifier minus its first char.
fn identifier_backslash_with_hash(&mut self, start_pos: SourcePosition<'a>) -> &'a str {
let id = self.identifier_backslash(start_pos, false);
// Must recompute: source has `\u0041` but string has `A`
self.identifier_hasher = IncrementalIdentHasher::new();
self.identifier_hasher.write_bytes(id.as_bytes());
&id[1..]
}

/// Handle rest of identifier after first byte of a multi-byte Unicode char found.
/// Any number of characters can have already been consumed from `self.source` prior to it.
/// `self.source` should be positioned at start of Unicode character.
Expand Down Expand Up @@ -140,6 +175,11 @@ impl<'a> Lexer<'a> {

// Process escape and get rest of identifier
let id = self.identifier_on_backslash(str, true);

// Hash the unescaped identifier
self.identifier_hasher = IncrementalIdentHasher::new();
self.identifier_hasher.write_bytes(id.as_bytes());

Kind::match_keyword(id)
}

Expand Down
11 changes: 10 additions & 1 deletion crates/oxc_parser/src/lexer/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ use rustc_hash::FxHashMap;
use oxc_allocator::Allocator;
use oxc_ast::ast::RegExpFlags;
use oxc_diagnostics::OxcDiagnostic;
use oxc_span::{SourceType, Span};
use oxc_span::{IncrementalIdentHasher, SourceType, Span};

use crate::{UniquePromise, diagnostics};

Expand Down Expand Up @@ -47,6 +47,7 @@ pub struct LexerCheckpoint<'a> {
errors_snapshot: ErrorSnapshot,
has_pure_comment: bool,
has_no_side_effects_comment: bool,
identifier_hasher: IncrementalIdentHasher,
}

#[derive(Debug, Clone)]
Expand Down Expand Up @@ -95,6 +96,10 @@ pub struct Lexer<'a> {

/// `memchr` Finder for end of multi-line comments. Created lazily when first used.
multi_line_comment_end_finder: Option<memchr::memmem::Finder<'static>>,

/// Incremental hasher for current identifier.
/// Used to compute hash during lexing for efficient `Ident` creation.
pub(crate) identifier_hasher: IncrementalIdentHasher,
}

impl<'a> Lexer<'a> {
Expand Down Expand Up @@ -124,6 +129,7 @@ impl<'a> Lexer<'a> {
escaped_strings: FxHashMap::default(),
escaped_templates: FxHashMap::default(),
multi_line_comment_end_finder: None,
identifier_hasher: IncrementalIdentHasher::new(),
}
}

Expand Down Expand Up @@ -165,6 +171,7 @@ impl<'a> Lexer<'a> {
errors_snapshot,
has_pure_comment: self.trivia_builder.has_pure_comment,
has_no_side_effects_comment: self.trivia_builder.has_no_side_effects_comment,
identifier_hasher: self.identifier_hasher,
}
}

Expand All @@ -182,6 +189,7 @@ impl<'a> Lexer<'a> {
errors_snapshot,
has_pure_comment: self.trivia_builder.has_pure_comment,
has_no_side_effects_comment: self.trivia_builder.has_no_side_effects_comment,
identifier_hasher: self.identifier_hasher,
}
}

Expand All @@ -196,6 +204,7 @@ impl<'a> Lexer<'a> {
self.token = checkpoint.token;
self.trivia_builder.has_pure_comment = checkpoint.has_pure_comment;
self.trivia_builder.has_no_side_effects_comment = checkpoint.has_no_side_effects_comment;
self.identifier_hasher = checkpoint.identifier_hasher;
}

pub fn peek_token(&mut self) -> Token {
Expand Down
23 changes: 23 additions & 0 deletions crates/oxc_parser/src/lexer/search.rs
Original file line number Diff line number Diff line change
Expand Up @@ -414,6 +414,29 @@ macro_rules! byte_search {
}
};

// With provided `start` position and identifier hashing.
// Delegates to main implementation, then hashes the scanned bytes.
(
lexer: $lexer:ident,
table: $table:ident,
start: $start:ident,
hash_identifier: true,
handle_eof: $eof_handler:expr,
) => {{
let hash_start = $start;
let result = byte_search! {
lexer: $lexer,
table: $table,
start: $start,
continue_if: (byte, pos) false,
handle_eof: $eof_handler,
};
// Hash the bytes that were scanned (from start to current position)
let scanned = $lexer.source.str_from_pos_to_current(hash_start);
$lexer.identifier_hasher.write_bytes(scanned.as_bytes());
result
}};

// Actual implementation - with both `start` and `continue_if`
(
lexer: $lexer:ident,
Expand Down
4 changes: 3 additions & 1 deletion crates/oxc_parser/src/lexer/string.rs
Original file line number Diff line number Diff line change
Expand Up @@ -277,6 +277,8 @@ impl<'a> Lexer<'a> {
/// Get the current identifier with precomputed hash.
#[inline]
pub(crate) fn get_ident(&self, token: Token) -> Ident<'a> {
Ident::new(self.get_string(token))
let s = self.get_string(token);
let hash = self.identifier_hasher.finish();
Ident::new_with_hash(s, hash)
}
}
7 changes: 6 additions & 1 deletion crates/oxc_parser/src/lexer/unicode.rs
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,8 @@ use oxc_syntax::{
line_terminator::{CR, LF, LS, PS, is_irregular_line_terminator},
};

use oxc_span::IncrementalIdentHasher;

use super::{Kind, Lexer, Span};

/// A Unicode escape sequence.
Expand All @@ -36,7 +38,10 @@ impl<'a> Lexer<'a> {
c if is_identifier_start_unicode(c) => {
let start_pos = self.source.position();
self.consume_char();
self.identifier_tail_after_unicode(start_pos);
let id = self.identifier_tail_after_unicode(start_pos);
// Hash the full identifier for get_ident()
self.identifier_hasher = IncrementalIdentHasher::new();
self.identifier_hasher.write_bytes(id.as_bytes());
Kind::Ident
}
c if is_irregular_whitespace(c) => self.handle_irregular_whitespace(c),
Expand Down
3 changes: 2 additions & 1 deletion crates/oxc_span/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,8 @@ mod span;
pub use cmp::ContentEq;
pub use oxc_str::{
ArenaIdentHashMap, Atom, CompactStr, Ident, IdentHashMap, IdentHashSet, IdentHasher,
MAX_INLINE_LEN as ATOM_MAX_INLINE_LEN, format_atom, format_compact_str, format_ident,
IncrementalIdentHasher, MAX_INLINE_LEN as ATOM_MAX_INLINE_LEN, format_atom, format_compact_str,
format_ident,
};
pub use source_type::{
FileExtension, Language, LanguageVariant, ModuleKind, SourceType, UnknownExtension,
Expand Down
Loading