diff --git a/crates/oxc_traverse/src/context/mod.rs b/crates/oxc_traverse/src/context/mod.rs index 808736c003394..99166464c1010 100644 --- a/crates/oxc_traverse/src/context/mod.rs +++ b/crates/oxc_traverse/src/context/mod.rs @@ -21,6 +21,7 @@ mod bound_identifier; mod maybe_bound_identifier; mod reusable; mod scoping; +mod uid; use ancestry::PopToken; pub use ancestry::TraverseAncestry; pub use bound_identifier::BoundIdentifier; diff --git a/crates/oxc_traverse/src/context/scoping.rs b/crates/oxc_traverse/src/context/scoping.rs index 0ee3b4c6abd53..f3d414544e166 100644 --- a/crates/oxc_traverse/src/context/scoping.rs +++ b/crates/oxc_traverse/src/context/scoping.rs @@ -1,9 +1,6 @@ use std::str; -use itoa::Buffer as ItoaBuffer; -use rustc_hash::FxHashSet; - -use oxc_allocator::{Allocator, String as ArenaString, Vec as ArenaVec}; +use oxc_allocator::{Allocator, Vec as ArenaVec}; use oxc_ast::ast::*; use oxc_ast_visit::Visit; use oxc_semantic::{NodeId, Reference, Scoping}; @@ -16,6 +13,8 @@ use oxc_syntax::{ use crate::{BoundIdentifier, scopes_collector::ChildScopeCollector}; +use super::uid::UidGenerator; + /// Traverse scope context. /// /// Contains the scope tree and symbols table, and provides methods to access them. @@ -24,7 +23,7 @@ use crate::{BoundIdentifier, scopes_collector::ChildScopeCollector}; /// `walk_*` functions update this field when entering/exiting a scope. pub struct TraverseScoping<'a> { scoping: Scoping, - uid_names: Option>, + uid_generator: Option>, current_scope_id: ScopeId, current_hoist_scope_id: ScopeId, current_block_scope_id: ScopeId, @@ -281,89 +280,17 @@ impl<'a> TraverseScoping<'a> { /// /// Finds a unique variable name which does clash with any other variables used in the program. /// - /// Based on Babel's `scope.generateUid` logic. - /// - /// - /// # Differences from Babel - /// - /// This implementation aims to replicate Babel's behavior, but differs from Babel - /// in the following ways: - /// - /// 1. 
Does not check that name is a valid JS identifier name. - /// In most cases, we'll be creating a UID based on an existing variable name, in which case - /// this check is redundant. /// Caller must ensure `name` is a valid JS identifier, after a `_` is prepended on start. /// The fact that a `_` will be prepended on start means providing an empty string or a string /// starting with a digit (0-9) is fine. /// - /// 2. Does not convert to camel case. - /// This seems unimportant. - /// - /// 3. Does not check var name against list of globals or "contextVariables" - /// (which Babel does in `hasBinding`). - /// No globals or "contextVariables" start with `_` anyway, so no need for this check. - /// - /// 4. Does not check this name is unique if used as a named statement label, only that it's unique - /// as an identifier. - /// If we need to generate unique labels for named statements, we should create a separate method - /// `generate_uid_label`. - /// - /// 5. Does not check against list of other UIDs that have been created. - /// `TraverseScoping::generate_uid` adds this name to symbols table, so when creating next UID, - /// this one will be found and avoided, like any other existing binding. So it's not needed. - /// - /// # Potential improvements - /// - /// TODO(improve-on-babel): - /// - /// This function is fairly expensive, because it aims to replicate Babel's output. - /// - /// `get_uid_names` iterates through every single binding and unresolved reference in the entire AST, - /// and builds a hashset of symbols which could clash with UIDs. - /// Once that's built, it's cached, but `find_uid_name` still has to do at least one hashset lookup, - /// and a hashset insert. If the first name tried is already in use, it will do another hashset lookup, - /// potentially multiple times until a name which isn't taken is found. - /// - /// We could improve this in one of 3 ways: - /// - /// 1. 
Build the hashset in `SemanticBuilder` instead of iterating through all symbols again here. - /// - /// 2. Use a much simpler method: - /// - /// * During initial semantic pass, check for any existing identifiers starting with `_`. - /// * Calculate what is the highest postfix number on `_...` identifiers (e.g. `_foo1`, `_bar8`). - /// * Store that highest number in a counter which is global across the whole program. - /// * When creating a UID, increment the counter, and make the UID `_`. - /// - /// i.e. if source contains identifiers `_foo1` and `_bar15`, create UIDs named `_qux16`, - /// `_temp17` etc. They'll all be unique within the program. - /// - /// Minimal cost in semantic, and generating UIDs extremely cheap. - /// - /// This is a slightly different method from Babel, and unfortunately produces UID names - /// which differ from Babel for some of its test cases. - /// - /// 3. If output is being minified anyway, use a method which produces less debuggable output, - /// but is even simpler: - /// - /// * During initial semantic pass, check for any existing identifiers starting with `_`. - /// * Find the highest number of leading `_`s for any existing symbol. - /// * Generate UIDs with a counter starting at 0, prefixed with number of `_`s one greater than - /// what was found in AST. - /// i.e. if source contains identifiers `_foo` and `__bar`, create UIDs names `___0`, `___1`, - /// `___2` etc. They'll all be unique within the program. - #[expect(clippy::missing_panics_doc)] + /// See comments on `UidGenerator` for further details. 
pub fn generate_uid_name(&mut self, name: &str, allocator: &'a Allocator) -> Atom<'a> { - // If `uid_names` is not already populated, initialize it - if self.uid_names.is_none() { - self.uid_names = Some(self.get_uid_names(allocator)); - } - let uid_names = self.uid_names.as_mut().unwrap(); - - let base = get_uid_name_base(name); - let uid = get_unique_name(base, uid_names, allocator).into_bump_str(); - uid_names.insert(uid); - Atom::from(uid) + // If `uid_generator` is not already populated, initialize it + let uid_generator = + self.uid_generator.get_or_insert_with(|| UidGenerator::new(&self.scoping, allocator)); + // Generate unique name + uid_generator.create(name) } /// Create a reference bound to a `SymbolId` @@ -432,12 +359,12 @@ impl<'a> TraverseScoping<'a> { } // Methods used internally within crate -impl<'a> TraverseScoping<'a> { +impl TraverseScoping<'_> { /// Create new `TraverseScoping` pub(super) fn new(scoping: Scoping) -> Self { Self { scoping, - uid_names: None, + uid_generator: None, // Dummy values. Both immediately overwritten in `walk_program`. current_scope_id: ScopeId::new(0), current_hoist_scope_id: ScopeId::new(0), @@ -468,285 +395,7 @@ impl<'a> TraverseScoping<'a> { self.current_block_scope_id = scope_id; } - /// Get `uid_names`. - /// - /// Iterate through all symbols and unresolved references in AST and identify any var names - /// which could clash with UIDs (start with `_`). Build a hash set containing them. - /// - /// Once this set is created, generating a UID is a relatively quick operation, rather than - /// iterating over all symbols and unresolved references every time generate a UID. 
- fn get_uid_names(&self, allocator: &'a Allocator) -> FxHashSet<&'a str> { - self.scoping - .root_unresolved_references() - .keys() - .copied() - .chain(self.scoping.symbol_names()) - .filter(|name| name.as_bytes().first() == Some(&b'_')) - .map(|str| allocator.alloc_str(str)) - .collect() - } - pub fn delete_typescript_bindings(&mut self) { self.scoping.delete_typescript_bindings(); } } - -/// Create base for UID name based on provided `name`. -/// Trim `_`s from start and digits from end. -/// i.e. `__foo123` -> `foo` -fn get_uid_name_base(name: &str) -> &str { - // Equivalent to `name.trim_start_matches('_').trim_end_matches(|c: char| c.is_ascii_digit())` - // but more efficient as operates on bytes not chars - let mut bytes = name.as_bytes(); - while bytes.first() == Some(&b'_') { - bytes = &bytes[1..]; - } - while matches!(bytes.last(), Some(b) if b.is_ascii_digit()) { - bytes = &bytes[0..bytes.len() - 1]; - } - // SAFETY: We started with a valid UTF8 `&str` and have only trimmed off ASCII characters, - // so remainder must still be valid UTF8 - unsafe { str::from_utf8_unchecked(bytes) } -} - -// TODO: We could make this function more performant, especially when it checks a lot of names -// before it reaches one that is unused. -// This function repeatedly creates strings which have only differ from each other by digits added on end, -// and then hashes each of those strings to test them against the hash set `uid_names`. -// Hashing strings is fairly expensive. As here only the end of the string changes on each iteration, -// we could calculate an "unfinished" hash not including the last block, and then just add the final -// block to "finish" the hash on each iteration. With `FxHash` this would be straight line code and only -// a few operations. -fn get_unique_name<'a>( - base: &str, - uid_names: &FxHashSet<&'a str>, - allocator: &'a Allocator, -) -> ArenaString<'a> { - // Create `ArenaString` prepending name with `_`, and with 1 byte excess capacity. 
- // The extra byte is to avoid reallocation if need to add a digit on the end later, - // which will not be too uncommon. - // Having to add 2 digits will be uncommon, so we don't allocate 2 extra bytes for 2 digits. - let mut name = ArenaString::with_capacity_in(base.len() + 2, allocator); - name.push('_'); - name.push_str(base); - - // It's fairly common that UIDs may need a numerical postfix, so we try to keep string - // operations to a minimum for postfixes up to 99 - reusing a single `ArenaString`, - // rather than generating a new string on each attempt. - // For speed we manipulate the string as bytes. - // Postfixes greater than 99 should be very uncommon, so don't bother optimizing. - // - // SAFETY: Only modifications to string are replacing last byte/last 2 bytes with ASCII digits. - // These bytes are already ASCII chars, so cannot produce an invalid UTF-8 string. - // Writes are always in bounds (`bytes` is redefined after string grows due to `push`). - unsafe { - let name_is_unique = |bytes: &[u8]| { - let name = str::from_utf8_unchecked(bytes); - !uid_names.contains(name) - }; - - // Try the name without a numerical postfix (i.e. plain `_temp`) - let bytes = name.as_bytes_mut(); - if name_is_unique(bytes) { - return name; - } - - // Try single-digit postfixes (i.e. `_temp2`, `_temp3` ... `_temp9`) - name.push('2'); - let bytes = name.as_bytes_mut(); - if name_is_unique(bytes) { - return name; - } - - let last_index = bytes.len() - 1; - for c in b'3'..=b'9' { - *bytes.get_unchecked_mut(last_index) = c; - if name_is_unique(bytes) { - return name; - } - } - - // Try double-digit postfixes (i.e. `_temp10` ... 
`_temp99`) - *bytes.get_unchecked_mut(last_index) = b'1'; - name.push('0'); - let bytes = name.as_bytes_mut(); - let last_index = last_index + 1; - - let mut c1 = b'1'; - loop { - if name_is_unique(bytes) { - return name; - } - for c2 in b'1'..=b'9' { - *bytes.get_unchecked_mut(last_index) = c2; - if name_is_unique(bytes) { - return name; - } - } - if c1 == b'9' { - break; - } - c1 += 1; - - let last_two: &mut [u8; 2] = - bytes.get_unchecked_mut(last_index - 1..=last_index).try_into().unwrap(); - *last_two = [c1, b'0']; - } - } - - // Try longer postfixes (`_temp100` upwards) - - // Reserve space for 1 more byte for the additional 3rd digit. - // Do this here so that `name.push_str(digits)` will never need to grow the string until it reaches - // `n == 1000`, which makes the branch on "is there sufficient capacity to push?" in the loop below - // completely predictable for `n < 1000`. - name.reserve(1); - - // At this point, `name` has had 2 digits added on end. `base_len` is length without those 2 digits. - let base_len = name.len() - 2; - - let mut buffer = ItoaBuffer::new(); - for n in 100..=u32::MAX { - let digits = buffer.format(n); - /* - // SAFETY: `base_len` is always shorter than current `name.len()`, on a UTF-8 char boundary, - // and `name` contains at least `base_len` initialized bytes - unsafe { name.set_len(base_len) }; - */ - // workaround for `set_len` does not exist in `ArenaString` - name.truncate(base_len); - name.push_str(digits); - if !uid_names.contains(name.as_str()) { - return name; - } - } - - // Limit for size of source text is `u32::MAX` bytes, so there cannot be `u32::MAX` - // identifier names in the AST. So loop above cannot fail to find an unused name. 
- unreachable!(); -} - -#[cfg(test)] -#[test] -fn test_get_unique_name() { - let cases: &[(&[&str], &str, &str)] = &[ - (&[], "foo", "_foo"), - (&["_foo"], "foo", "_foo2"), - (&["_foo0", "_foo1"], "foo", "_foo"), - (&["_foo2", "_foo3", "_foo4"], "foo", "_foo"), - (&["_foo", "_foo2"], "foo", "_foo3"), - (&["_foo", "_foo2", "_foo4"], "foo", "_foo3"), - (&["_foo", "_foo2", "_foo3", "_foo4", "_foo5", "_foo6", "_foo7", "_foo8"], "foo", "_foo9"), - ( - &["_foo", "_foo2", "_foo3", "_foo4", "_foo5", "_foo6", "_foo7", "_foo8", "_foo9"], - "foo", - "_foo10", - ), - ( - &[ - "_foo", "_foo2", "_foo3", "_foo4", "_foo5", "_foo6", "_foo7", "_foo8", "_foo9", - "_foo10", - ], - "foo", - "_foo11", - ), - ( - &[ - "_foo", "_foo2", "_foo3", "_foo4", "_foo5", "_foo6", "_foo7", "_foo8", "_foo9", - "_foo10", "_foo11", - ], - "foo", - "_foo12", - ), - ( - &[ - "_foo", "_foo2", "_foo3", "_foo4", "_foo5", "_foo6", "_foo7", "_foo8", "_foo9", - "_foo10", "_foo11", "_foo12", "_foo13", "_foo14", "_foo15", "_foo16", "_foo17", - "_foo18", - ], - "foo", - "_foo19", - ), - ( - &[ - "_foo", "_foo2", "_foo3", "_foo4", "_foo5", "_foo6", "_foo7", "_foo8", "_foo9", - "_foo10", "_foo11", "_foo12", "_foo13", "_foo14", "_foo15", "_foo16", "_foo17", - "_foo18", "_foo19", - ], - "foo", - "_foo20", - ), - ( - &[ - "_foo", "_foo2", "_foo3", "_foo4", "_foo5", "_foo6", "_foo7", "_foo8", "_foo9", - "_foo10", "_foo11", "_foo12", "_foo13", "_foo14", "_foo15", "_foo16", "_foo17", - "_foo18", "_foo19", "_foo20", - ], - "foo", - "_foo21", - ), - ( - &[ - "_foo", "_foo2", "_foo3", "_foo4", "_foo5", "_foo6", "_foo7", "_foo8", "_foo9", - "_foo10", "_foo11", "_foo12", "_foo13", "_foo14", "_foo15", "_foo16", "_foo17", - "_foo18", "_foo19", "_foo20", "_foo21", "_foo22", "_foo23", "_foo24", "_foo25", - "_foo26", "_foo27", "_foo28", "_foo29", "_foo30", "_foo31", "_foo32", "_foo33", - "_foo34", "_foo35", "_foo36", "_foo37", "_foo38", "_foo39", "_foo40", "_foo41", - "_foo42", "_foo43", "_foo44", "_foo45", "_foo46", "_foo47", 
"_foo48", "_foo49", - "_foo50", "_foo51", "_foo52", "_foo53", "_foo54", "_foo55", "_foo56", "_foo57", - "_foo58", "_foo59", "_foo60", "_foo61", "_foo62", "_foo63", "_foo64", "_foo65", - "_foo66", "_foo67", "_foo68", "_foo69", "_foo70", "_foo71", "_foo72", "_foo73", - "_foo74", "_foo75", "_foo76", "_foo77", "_foo78", "_foo79", "_foo80", "_foo81", - "_foo82", "_foo83", "_foo84", "_foo85", "_foo86", "_foo87", "_foo88", "_foo89", - "_foo90", "_foo91", "_foo92", "_foo93", "_foo94", "_foo95", "_foo96", "_foo97", - "_foo98", - ], - "foo", - "_foo99", - ), - ( - &[ - "_foo", "_foo2", "_foo3", "_foo4", "_foo5", "_foo6", "_foo7", "_foo8", "_foo9", - "_foo10", "_foo11", "_foo12", "_foo13", "_foo14", "_foo15", "_foo16", "_foo17", - "_foo18", "_foo19", "_foo20", "_foo21", "_foo22", "_foo23", "_foo24", "_foo25", - "_foo26", "_foo27", "_foo28", "_foo29", "_foo30", "_foo31", "_foo32", "_foo33", - "_foo34", "_foo35", "_foo36", "_foo37", "_foo38", "_foo39", "_foo40", "_foo41", - "_foo42", "_foo43", "_foo44", "_foo45", "_foo46", "_foo47", "_foo48", "_foo49", - "_foo50", "_foo51", "_foo52", "_foo53", "_foo54", "_foo55", "_foo56", "_foo57", - "_foo58", "_foo59", "_foo60", "_foo61", "_foo62", "_foo63", "_foo64", "_foo65", - "_foo66", "_foo67", "_foo68", "_foo69", "_foo70", "_foo71", "_foo72", "_foo73", - "_foo74", "_foo75", "_foo76", "_foo77", "_foo78", "_foo79", "_foo80", "_foo81", - "_foo82", "_foo83", "_foo84", "_foo85", "_foo86", "_foo87", "_foo88", "_foo89", - "_foo90", "_foo91", "_foo92", "_foo93", "_foo94", "_foo95", "_foo96", "_foo97", - "_foo98", "_foo99", - ], - "foo", - "_foo100", - ), - ( - &[ - "_foo", "_foo2", "_foo3", "_foo4", "_foo5", "_foo6", "_foo7", "_foo8", "_foo9", - "_foo10", "_foo11", "_foo12", "_foo13", "_foo14", "_foo15", "_foo16", "_foo17", - "_foo18", "_foo19", "_foo20", "_foo21", "_foo22", "_foo23", "_foo24", "_foo25", - "_foo26", "_foo27", "_foo28", "_foo29", "_foo30", "_foo31", "_foo32", "_foo33", - "_foo34", "_foo35", "_foo36", "_foo37", "_foo38", 
"_foo39", "_foo40", "_foo41", - "_foo42", "_foo43", "_foo44", "_foo45", "_foo46", "_foo47", "_foo48", "_foo49", - "_foo50", "_foo51", "_foo52", "_foo53", "_foo54", "_foo55", "_foo56", "_foo57", - "_foo58", "_foo59", "_foo60", "_foo61", "_foo62", "_foo63", "_foo64", "_foo65", - "_foo66", "_foo67", "_foo68", "_foo69", "_foo70", "_foo71", "_foo72", "_foo73", - "_foo74", "_foo75", "_foo76", "_foo77", "_foo78", "_foo79", "_foo80", "_foo81", - "_foo82", "_foo83", "_foo84", "_foo85", "_foo86", "_foo87", "_foo88", "_foo89", - "_foo90", "_foo91", "_foo92", "_foo93", "_foo94", "_foo95", "_foo96", "_foo97", - "_foo98", "_foo99", "_foo100", - ], - "foo", - "_foo101", - ), - ]; - - let allocator = Allocator::default(); - for (used, name, expected) in cases { - let used = used.iter().copied().collect::>(); - assert_eq!(get_unique_name(name, &used, &allocator), *expected); - } -} diff --git a/crates/oxc_traverse/src/context/uid.rs b/crates/oxc_traverse/src/context/uid.rs new file mode 100644 index 0000000000000..58fb968ede694 --- /dev/null +++ b/crates/oxc_traverse/src/context/uid.rs @@ -0,0 +1,376 @@ +use std::{iter, str}; + +use itoa::Buffer as ItoaBuffer; +use rustc_hash::FxHashMap; + +use oxc_allocator::{Allocator, String as ArenaString}; +use oxc_semantic::Scoping; +use oxc_span::Atom; + +/// Unique identifier generator. +/// +/// When initialized with [`UidGenerator::new`], creates a catalog of all symbols and unresolved references +/// in the AST which begin with `_`. +/// +/// [`UidGenerator::create`] uses that catalog to generate a unique identifier which does not clash with +/// any existing name. +/// +/// Such UIDs are based on the base name provided. They start with `_` and end with digits if required to +/// maintain uniqueness. e.g. given base name of `foo`, UIDs will be `_foo`, `_foo2`, `_foo3` etc. +/// +/// Roughly based on Babel's `scope.generateUid` logic, but with some differences (see below). 
+/// +/// +/// # Algorithm +/// +/// UIDs are generated in series for each "base" name. +/// "Base" name is the provided name with `_`s trimmed from the start, and digits trimmed from the end. +/// +/// During cataloging of existing symbols, for each base name it's recorded: +/// +/// 1. Largest number of leading `_`s. +/// 2. Largest numeric postfix for that base name. +/// +/// UIDs are generated for that base name with that number of leading underscores, and with ascending +/// numeric postfix. +/// +/// | Existing symbols | Generated UIDs | +/// |------------------|---------------------------------| +/// | (none) | `_foo`, `_foo2`, `_foo3` | +/// | `_foo` | `_foo2`, `_foo3`, `_foo4` | +/// | `_foo3` | `_foo4`, `_foo5`, `_foo6` | +/// | `__foo` | `__foo2`, `__foo3`, `__foo4` | +/// | `___foo5` | `___foo6`, `___foo7`, `___foo8` | +/// | `_foo8`, `__foo` | `__foo2`, `__foo3`, `__foo4` | +/// +/// This algorithm requires at most 1 hashmap lookup and 1 hashmap insert per UID generated. +/// +/// # Differences from Babel +/// +/// This implementation aims to replicate Babel's behavior, but differs from Babel +/// in the following ways: +/// +/// 1. Does not check that name provided as "base" for the UID is a valid JS identifier name. +/// In most cases, we're creating a UID based on an existing variable name, in which case +/// this check is redundant. +/// Caller must ensure `name` is a valid JS identifier, after a `_` is prepended on start. +/// The fact that a `_` will be prepended on start means providing an empty string or a string +/// starting with a digit (0-9) is fine. +/// +/// 2. Does not convert to camel case. +/// This seems unimportant. +/// +/// 3. Does not check var name against list of globals or "contextVariables" +/// (which Babel does in `hasBinding`). +/// No globals or "contextVariables" start with `_` anyway, so no need for this check. +/// +/// 4. 
Does not check this name is unique if used as a named statement label, +/// only that it's unique as an identifier. +/// +/// 5. Uses a slightly different algorithm for generating names (see above). +/// The resulting UIDs are similar enough to Babel's algorithm to fail only 1 of Babel's tests. +/// +/// # Potential improvements +/// +/// TODO(improve-on-babel): +/// +/// UID generation is fairly expensive, because of the amount of string hashing required. +/// +/// [`UidGenerator::new`] iterates through every binding and unresolved reference in the entire AST, +/// and builds a hashmap of symbols which could clash with UIDs. +/// Once that's built, [`UidGenerator::create`] has to do a hashmap lookup when generating each UID. +/// Hashing strings is a fairly expensive operation. +/// +/// We could improve this in one of 3 ways: +/// +/// ## 1. Build the hashmap in `SemanticBuilder` +/// +/// Instead of iterating through all symbols again here. +/// +/// ## 2. Use a simpler algorithm +/// +/// * During initial semantic pass, check for any existing identifiers starting with `_`. +/// * Calculate what is the highest postfix number on `_...` identifiers (e.g. `_foo1`, `_bar8`). +/// * Store that highest number in a counter which is global across the whole program. +/// * When creating a UID, increment the counter, and make the UID `_<name><counter>`. +/// +/// i.e. if source contains identifiers `_foo1` and `_bar15`, create UIDs named `_qux16`, +/// `_temp17` etc. They'll all be unique within the program. +/// +/// Minimal cost in semantic, and generating UIDs is extremely cheap. +/// +/// The resulting UIDs would still be fairly readable. +/// +/// This is a different method from Babel, and unfortunately produces UID names +/// which differ from Babel for some of its test cases. +/// +/// ## 3. 
Even simpler algorithm, but produces hard-to-read code +/// +/// If output is being minified anyway, use a method which produces less debuggable output, +/// but is even simpler: +/// +/// * During initial semantic pass, check for any existing identifiers starting with `_`. +/// * Find the highest number of leading `_`s for any existing symbol. +/// * Generate UIDs with a counter starting at 0, prefixed with number of `_`s one greater than +/// what was found in AST. +/// +/// i.e. if source contains identifiers `_foo` and `__bar`, create UIDs names `___0`, `___1`, +/// `___2` etc. They'll all be unique within the program. +pub struct UidGenerator<'a> { + names: FxHashMap<&'a str, UidName>, + allocator: &'a Allocator, +} + +/// Details of next UID for a base name. +// +// `#[repr(align(8))]` on 64-bit platforms so can fit in a single register. +#[cfg_attr(target_pointer_width = "64", repr(align(8)))] +#[derive(Clone, Copy)] +struct UidName { + /// Digits appended to end of name. + /// When generating a UID, increment this field and use that as the postfix. + /// This field is never 0, so postfix will be at least 2. + postfix: u32, + /// Number of underscores to prefix name with. + underscore_count: u32, +} + +impl<'a> UidGenerator<'a> { + /// Create [`UidGenerator`]. + pub(super) fn new(scoping: &Scoping, allocator: &'a Allocator) -> Self { + let mut generator = Self { names: FxHashMap::default(), allocator }; + + for name in scoping.symbol_names() { + generator.add(name); + } + for &name in scoping.root_unresolved_references().keys() { + generator.add(name); + } + + generator + } + + /// Add a record to [`UidGenerator`]. + fn add(&mut self, name: &str) { + // If `name` does not start with `_`, exit + if name.as_bytes().first() != Some(&b'_') { + return; + } + + // Trim off underscores from start of `name` + let original_len = name.len(); + // SAFETY: We just check first byte of `name` is `_` + let name = unsafe { name.get_unchecked(1..) 
}; + let mut name = name.trim_start_matches('_'); + #[expect(clippy::cast_possible_truncation)] + let underscore_count = (original_len - name.len()) as u32; + let mut uid_name = UidName { underscore_count, postfix: 1 }; + + // Find digits on end of `name` + let last_non_digit_index = name.as_bytes().iter().rposition(|&b| !b.is_ascii_digit()); + let parts = match last_non_digit_index { + Some(last_non_digit_index) => { + if last_non_digit_index == name.len() - 1 { + // No digits on end + None + } else { + // Name ends with digits + let digit_index = last_non_digit_index + 1; + debug_assert!(name.as_bytes().get(digit_index).is_some_and(u8::is_ascii_digit)); + // SAFETY: There's an ASCII digit at `digit_index`, so slicing `name` at that index + // is guaranteed to yield 2 valid UTF-8 strings. `digit_index` cannot be out of bounds. + unsafe { + let without_digits = name.get_unchecked(..digit_index); + let digits = name.get_unchecked(digit_index..); + Some((without_digits, digits)) + } + } + } + None => { + if name.is_empty() { + // Name consists purely of `_`s e.g. `_` or `___` + None + } else { + // Name consists of `_`s followed by digits e.g. `_123` + Some(("", name)) + } + } + }; + + if let Some((without_digits, digits)) = parts { + const U32_MAX_LEN: usize = "4294967295".len(); // 4294967295 = u32::MAX + // SAFETY: `digits` cannot be empty + let first_digit = unsafe { *digits.as_bytes().get_unchecked(0) }; + if first_digit == b'0' || digits.len() > U32_MAX_LEN { + // We don't create UIDs with postfix starting with 0, or greater than `u32::MAX`, + // so can ignore this - can't clash + return; + } + if let Ok(n) = digits.parse::() { + if n == 1 { + // We don't create UIDs with postfix of 1, so can ignore this - can't clash + return; + } + name = without_digits; + uid_name.postfix = n; + } else { + // Digits represent a number greater than `u32::MAX`. + // We don't create UIDs with postfix over `u32::MAX` so can ignore this - can't clash. 
+ return; + } + } + + // Unfortunately can't use `Entry` API here because `name` doesn't have required lifetime `'a`, + // because it comes from `Semantic`'s arena, not the AST arena + if let Some(existing_uid_name) = self.names.get_mut(name) { + if uid_name.underscore_count > existing_uid_name.underscore_count + || (uid_name.underscore_count == existing_uid_name.underscore_count + && uid_name.postfix > existing_uid_name.postfix) + { + *existing_uid_name = uid_name; + } + } else { + let name = self.allocator.alloc_str(name); + self.names.insert(name, uid_name); + } + } + + /// Create a unique identifier. + /// + /// The UID returned will be added to the list of used identifiers, so this method will never + /// return the same UID twice. + /// + /// Caller must ensure `name` is a valid JS identifier, after a `_` is prepended on start. + /// The fact that a `_` will be prepended on start means providing an empty string or a string + /// starting with a digit (0-9) is fine. + /// + /// Please see docs for [`UidGenerator`] for further info. + pub(super) fn create(&mut self, name: &str) -> Atom<'a> { + // Get the base name, with `_`s trimmed from start, and digits trimmed from end. + // i.e. `__foo123` -> `foo`. + // Equivalent to `name.trim_start_matches('_').trim_end_matches(|c: char| c.is_ascii_digit())` + // but more efficient as operates on bytes not chars + let mut bytes = name.as_bytes(); + while bytes.first() == Some(&b'_') { + bytes = &bytes[1..]; + } + while matches!(bytes.last(), Some(b) if b.is_ascii_digit()) { + bytes = &bytes[0..bytes.len() - 1]; + } + // SAFETY: We started with a valid UTF8 `&str` and have only trimmed off ASCII characters, + // so remainder must still be valid UTF8 + let base = unsafe { str::from_utf8_unchecked(bytes) }; + + // Generate UID. + // Unfortunately can't use `Entry` API here as `name` doesn't have required lifetime `'a`. 
+ if let Some(uid_name) = self.names.get_mut(base) { + // AST contains identifier(s) with this base already. + // Get next postfix. + if uid_name.postfix < u32::MAX { + // Increment `postfix` + uid_name.postfix += 1; + } else { + // Identifier `_4294967295` was already used. + // Can't increment `postfix` as it would wrap around, so increment `underscore_count` instead. + // It shouldn't be possible for `underscore_count` to be `u32::MAX` too, because that + // would require an identifier comprising `u32::MAX` x underscores in source text. + // That's theoretically possible, but source text is limited to `u32::MAX` bytes, + // so it'd be the entirety of the source text. Therefore `postfix` would be 1. + uid_name.underscore_count += 1; + uid_name.postfix = 2; + } + + // Format UID `_`. + // If `underscore_count > 1`, add further underscores to the start. + let mut buffer = ItoaBuffer::new(); + let digits = buffer.format(uid_name.postfix); + + let uid = if uid_name.underscore_count == 1 { + ArenaString::from_strs_array_in(["_", base, digits], self.allocator) + } else { + let mut uid = ArenaString::with_capacity_in( + uid_name.underscore_count as usize + base.len() + digits.len(), + self.allocator, + ); + uid.extend(iter::repeat_n("_", uid_name.underscore_count as usize)); + uid.push_str(base); + uid.push_str(digits); + uid + }; + + Atom::from(uid) + } else { + let uid = Atom::from(ArenaString::from_strs_array_in(["_", base], self.allocator)); + // SAFETY: String starts with `_`, so trimming off that byte leaves a valid UTF-8 string + let base = unsafe { uid.as_str().get_unchecked(1..) }; + self.names.insert(base, UidName { underscore_count: 1, postfix: 1 }); + uid + } + } +} + +#[cfg(test)] +#[test] +fn uids() { + // (&[ initial, ... ], &[ (name, expected_uid), ... 
]) + #[expect(clippy::type_complexity)] + let cases: &[(&[&str], &[(&str, &str)])] = &[ + (&[], &[("foo", "_foo"), ("foo", "_foo2"), ("foo", "_foo3")]), + ( + &["foo", "foo0", "foo1", "foo2", "foo10", "_bar"], + &[("foo", "_foo"), ("foo", "_foo2"), ("foo", "_foo3")], + ), + ( + &["_foo0", "_foo1", "__foo0", "____foo1", "_foo01", "_foo012345", "_foo000000"], + &[("foo", "_foo"), ("foo", "_foo2"), ("foo", "_foo3")], + ), + (&[], &[("_foo", "_foo"), ("__foo", "_foo2"), ("_____foo", "_foo3")]), + (&[], &[("_foo123", "_foo"), ("__foo456", "_foo2"), ("_____foo789", "_foo3")]), + (&["_foo"], &[("foo", "_foo2"), ("foo", "_foo3"), ("foo", "_foo4")]), + (&["_foo3"], &[("foo", "_foo4"), ("foo", "_foo5"), ("foo", "_foo6")]), + (&["__foo"], &[("foo", "__foo2"), ("foo", "__foo3"), ("foo", "__foo4")]), + (&["__foo8"], &[("foo", "__foo9"), ("foo", "__foo10"), ("foo", "__foo11")]), + (&["_foo999", "____foo"], &[("foo", "____foo2"), ("foo", "____foo3"), ("foo", "____foo4")]), + ( + &["_foo4294967293"], + &[ + ("foo", "_foo4294967294"), + ("foo", "_foo4294967295"), + ("foo", "__foo2"), + ("foo", "__foo3"), + ], + ), + ( + &["___foo4294967293"], + &[ + ("foo", "___foo4294967294"), + ("foo", "___foo4294967295"), + ("foo", "____foo2"), + ("foo", "____foo3"), + ], + ), + (&[], &[("_", "_"), ("_", "_2"), ("_", "_3")]), + ( + &["_0", "_1", "__0", "____1", "_01", "_012345", "_000000"], + &[("_", "_"), ("_", "_2"), ("_", "_3")], + ), + (&[], &[("___", "_"), ("_____", "_2"), ("_____", "_3")]), + (&["_"], &[("_", "_2"), ("_", "_3"), ("_", "_4")]), + (&["_4"], &[("_", "_5"), ("_", "_6"), ("_", "_7")]), + (&["___"], &[("_", "___2"), ("_", "___3"), ("_", "___4")]), + (&["___99"], &[("_", "___100"), ("_", "___101"), ("_", "___102")]), + (&["_"], &[("_123", "_2"), ("__456", "_3"), ("___789", "_4")]), + ]; + + let allocator = Allocator::default(); + for &(used_names, created) in cases { + let mut generator = UidGenerator { names: FxHashMap::default(), allocator: &allocator }; + for &used_name in 
used_names { + generator.add(used_name); + } + + for &(name, uid) in created { + assert_eq!(generator.create(name), uid); + } + } +} diff --git a/tasks/coverage/snapshots/semantic_typescript.snap b/tasks/coverage/snapshots/semantic_typescript.snap index e8354b819e549..f0177b5c72ea4 100644 --- a/tasks/coverage/snapshots/semantic_typescript.snap +++ b/tasks/coverage/snapshots/semantic_typescript.snap @@ -10133,8 +10133,8 @@ rebuilt : ["Array", "Function", "MyClassDecorator", "MyMethodDecorator", semantic Error: tasks/coverage/typescript/tests/cases/compiler/emitHelpersWithLocalCollisions.ts Bindings mismatch: -after transform: ScopeId(0): ["A", "__decorate", "_decorate", "_objectSpread", "dec", "o", "y"] -rebuilt : ScopeId(0): ["A", "_decorate", "_objectSpread", "o", "y"] +after transform: ScopeId(0): ["A", "__decorate", "__decorate2", "_objectSpread", "dec", "o", "y"] +rebuilt : ScopeId(0): ["A", "__decorate2", "_objectSpread", "o", "y"] Symbol span mismatch for "A": after transform: SymbolId(2): Span { start: 57, end: 58 } rebuilt : SymbolId(2): Span { start: 0, end: 0 } @@ -16039,8 +16039,8 @@ rebuilt : ["React", "o"] semantic Error: tasks/coverage/typescript/tests/cases/compiler/importHelpersWithLocalCollisions.ts Bindings mismatch: -after transform: ScopeId(0): ["A", "__decorate", "_decorate", "_objectSpread", "dec", "o", "y"] -rebuilt : ScopeId(0): ["A", "_decorate", "_objectSpread", "o", "y"] +after transform: ScopeId(0): ["A", "__decorate", "__decorate2", "_objectSpread", "dec", "o", "y"] +rebuilt : ScopeId(0): ["A", "__decorate2", "_objectSpread", "o", "y"] Symbol span mismatch for "A": after transform: SymbolId(2): Span { start: 57, end: 58 } rebuilt : SymbolId(2): Span { start: 0, end: 0 } diff --git a/tasks/transform_conformance/overrides/babel-plugin-transform-typescript/test/fixtures/namespace/same-name/output.mjs b/tasks/transform_conformance/overrides/babel-plugin-transform-typescript/test/fixtures/namespace/same-name/output.mjs new file mode 100644 
index 0000000000000..2e7e38b38723f --- /dev/null +++ b/tasks/transform_conformance/overrides/babel-plugin-transform-typescript/test/fixtures/namespace/same-name/output.mjs @@ -0,0 +1,22 @@ +let N; +(function (_N8) { + let _N7; + (function (_N9) { + var x; + })(_N7 || (_N7 = {})); + let N; + (function (_N10) { + function _N3() {} + _N10._N3 = _N3; + })(N || (N = _N8.N || (_N8.N = {}))); + (function (_N11) { + class _N5 {} + _N11._N5 = _N5; + })(N || (N = _N8.N || (_N8.N = {}))); + (function (_N12) { + let _N = /*#__PURE__*/function (_N) { + return _N; + }({}); + _N12._N = _N; + })(N || (N = _N8.N || (_N8.N = {}))); +})(N || (N = {})); diff --git a/tasks/transform_conformance/tests/babel-plugin-transform-exponentiation-operator/test/fixtures/assign-to-identifier/output.js b/tasks/transform_conformance/tests/babel-plugin-transform-exponentiation-operator/test/fixtures/assign-to-identifier/output.js index c3cfebc12a7dd..a8d8a7c6ec69b 100644 --- a/tasks/transform_conformance/tests/babel-plugin-transform-exponentiation-operator/test/fixtures/assign-to-identifier/output.js +++ b/tasks/transform_conformance/tests/babel-plugin-transform-exponentiation-operator/test/fixtures/assign-to-identifier/output.js @@ -1,8 +1,8 @@ -var _y, _z, _q, _unbound; +var _y, _z, _q, ___unbound2; let x, fn, ___bound; x = Math.pow(x, 2); _y = y, y = Math.pow(_y, 3); _z = z, z = Math.pow(_z, fn()); _q = q, q = Math.pow(_q, unboundFn()); ___bound = Math.pow(___bound, 4); -_unbound = ___unbound, ___unbound = Math.pow(_unbound, 5) +___unbound2 = ___unbound, ___unbound = Math.pow(___unbound2, 5) diff --git a/tasks/transform_conformance/tests/babel-plugin-transform-exponentiation-operator/test/fixtures/assign-to-member-expression/output.js b/tasks/transform_conformance/tests/babel-plugin-transform-exponentiation-operator/test/fixtures/assign-to-member-expression/output.js index 23f8fc62336b7..303abe4c51517 100644 --- 
a/tasks/transform_conformance/tests/babel-plugin-transform-exponentiation-operator/test/fixtures/assign-to-member-expression/output.js +++ b/tasks/transform_conformance/tests/babel-plugin-transform-exponentiation-operator/test/fixtures/assign-to-member-expression/output.js @@ -1,4 +1,4 @@ -var _obj$foo$bar, _boundPropName, _unboundPropName, _obj$foo2$bar, _boundPropName2, _obj$foo3$bar, _unboundPropName2, _boundPropObj$foo$bar, _unboundPropObj$foo$b, _unboundObj, _unboundObj2, _unboundObj$foo$bar, _unboundObj3, _boundPropName3, _unboundObj4, _unboundPropName3, _unboundObj$foo2$bar, _boundPropName4, _unboundObj$foo3$bar, _unboundPropName4, _unboundObj5, _boundPropObj2$foo$ba, _unboundObj6, _unboundPropObj2$foo$, _fn, _fn$foo$bar, _fn$prop, _fn2, _fn$prop2, _ref, _this, _this$foo$bar, _this2, _this3, _fn4$foo$bar$qux, _unbound, _bound, _unbound2; +var _obj$foo$bar, _boundPropName, _unboundPropName, _obj$foo2$bar, _boundPropName2, _obj$foo3$bar, _unboundPropName2, _boundPropObj$foo$bar, _unboundPropObj$foo$b, _unboundObj, _unboundObj2, _unboundObj$foo$bar, _unboundObj3, _boundPropName3, _unboundObj4, _unboundPropName3, _unboundObj$foo2$bar, _boundPropName4, _unboundObj$foo3$bar, _unboundPropName4, _unboundObj5, _boundPropObj2$foo$ba, _unboundObj6, _unboundPropObj2$foo$, _fn, _fn$foo$bar, _fn$prop, _fn2, _fn$prop2, _ref, _this, _this$foo$bar, _this2, _this3, _fn4$foo$bar$qux, ___unbound2, ___bound2, ___unbound3; // Bound root of member expression let obj; @@ -109,6 +109,6 @@ function outer() { // Underscore var names let ___bound; ___bound["prop"] = Math.pow(___bound["prop"], 32); -(_unbound = ___unbound), (_unbound["prop"] = Math.pow(_unbound["prop"], 33)); -(_bound = ___bound), (obj[_bound] = Math.pow(obj[_bound], 34)); -(_unbound2 = ___unbound), (obj[_unbound2] = Math.pow(obj[_unbound2], 35)); +(___unbound2 = ___unbound), (___unbound2["prop"] = Math.pow(___unbound2["prop"], 33)); +(___bound2 = ___bound), (obj[___bound2] = Math.pow(obj[___bound2], 34)); +(___unbound3 
= ___unbound), (obj[___unbound3] = Math.pow(obj[___unbound3], 35));