Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions crates/oxc_traverse/src/context/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ mod bound_identifier;
mod maybe_bound_identifier;
mod reusable;
mod scoping;
mod uid;
use ancestry::PopToken;
pub use ancestry::TraverseAncestry;
pub use bound_identifier::BoundIdentifier;
Expand Down
375 changes: 12 additions & 363 deletions crates/oxc_traverse/src/context/scoping.rs

Large diffs are not rendered by default.

376 changes: 376 additions & 0 deletions crates/oxc_traverse/src/context/uid.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,376 @@
use std::{iter, str};

use itoa::Buffer as ItoaBuffer;
use rustc_hash::FxHashMap;

use oxc_allocator::{Allocator, String as ArenaString};
use oxc_semantic::Scoping;
use oxc_span::Atom;

/// Unique identifier generator.
///
/// When initialized with [`UidGenerator::new`], creates a catalog of all symbols and unresolved references
/// in the AST which begin with `_`.
///
/// [`UidGenerator::create`] uses that catalog to generate a unique identifier which does not clash with
/// any existing name.
///
/// Such UIDs are based on the base name provided. They start with `_` and end with digits if required to
/// maintain uniqueness. e.g. given base name of `foo`, UIDs will be `_foo`, `_foo2`, `_foo3` etc.
///
/// Roughly based on Babel's `scope.generateUid` logic, but with some differences (see below).
/// <https://github.com/babel/babel/blob/3b1a3c0be9df65140260a316c1a21adcf948645d/packages/babel-traverse/src/scope/index.ts#L501-L523>
///
/// # Algorithm
///
/// UIDs are generated in series for each "base" name.
/// "Base" name is the provided name with `_`s trimmed from the start, and digits trimmed from the end.
///
/// During cataloging of existing symbols, for each base name it's recorded:
///
/// 1. Largest number of leading `_`s.
/// 2. Largest numeric postfix for that base name.
///
/// UIDs are generated for that base name with that number of leading underscores, and with ascending
/// numeric postfix.
///
/// | Existing symbols | Generated UIDs |
/// |------------------|---------------------------------|
/// | (none) | `_foo`, `_foo2`, `_foo3` |
/// | `_foo` | `_foo2`, `_foo3`, `_foo4` |
/// | `_foo3` | `_foo4`, `_foo5`, `_foo6` |
/// | `__foo` | `__foo2`, `__foo3`, `__foo4` |
/// | `___foo5` | `___foo6`, `___foo7`, `___foo8` |
/// | `_foo8`, `__foo` | `__foo2`, `__foo3`, `__foo4` |
///
/// This algorithm requires at most 1 hashmap lookup and 1 hashmap insert per UID generated.
///
/// # Differences from Babel
///
/// This implementation aims to replicate Babel's behavior, but differs from Babel
/// in the following ways:
///
/// 1. Does not check that name provided as "base" for the UID is a valid JS identifier name.
/// In most cases, we're creating a UID based on an existing variable name, in which case
/// this check is redundant.
/// Caller must ensure `name` is a valid JS identifier, after a `_` is prepended on start.
/// The fact that a `_` will be prepended on start means providing an empty string or a string
/// starting with a digit (0-9) is fine.
///
/// 2. Does not convert to camel case.
/// This seems unimportant.
///
/// 3. Does not check var name against list of globals or "contextVariables"
/// (which Babel does in `hasBinding`).
/// No globals or "contextVariables" start with `_` anyway, so no need for this check.
///
/// 4. Does not check this name is unique if used as a named statement label,
/// only that it's unique as an identifier.
///
/// 5. Uses a slightly different algorithm for generating names (see above).
/// The resulting UIDs are similar enough to Babel's algorithm to fail only 1 of Babel's tests.
///
/// # Potential improvements
///
/// TODO(improve-on-babel):
///
/// UID generation is fairly expensive, because of the amount of string hashing required.
///
/// [`UidGenerator::new`] iterates through every binding and unresolved reference in the entire AST,
/// and builds a hashmap of symbols which could clash with UIDs.
/// Once that's built, [`UidGenerator::create`] has to do at a hashmap lookup when generating each UID.
/// Hashing strings is a fairly expensive operation.
///
/// We could improve this in one of 3 ways:
///
/// ## 1. Build the hashmap in `SemanticBuilder`
///
/// Instead of iterating through all symbols again here.
///
/// ## 2. Use a simpler algorithm
///
/// * During initial semantic pass, check for any existing identifiers starting with `_`.
/// * Calculate what is the highest postfix number on `_...` identifiers (e.g. `_foo1`, `_bar8`).
/// * Store that highest number in a counter which is global across the whole program.
/// * When creating a UID, increment the counter, and make the UID `_<name><counter>`.
///
/// i.e. if source contains identifiers `_foo1` and `_bar15`, create UIDs named `_qux16`,
/// `_temp17` etc. They'll all be unique within the program.
///
/// Minimal cost in semantic, and generating UIDs extremely cheap.
///
/// The resulting UIDs would still be fairly readable.
///
/// This is a different method from Babel, and unfortunately produces UID names
/// which differ from Babel for some of its test cases.
///
/// ## 3. Even simpler algorithm, but produces hard-to-read code
///
/// If output is being minified anyway, use a method which produces less debuggable output,
/// but is even simpler:
///
/// * During initial semantic pass, check for any existing identifiers starting with `_`.
/// * Find the highest number of leading `_`s for any existing symbol.
/// * Generate UIDs with a counter starting at 0, prefixed with number of `_`s one greater than
/// what was found in AST.
///
/// i.e. if source contains identifiers `_foo` and `__bar`, create UIDs names `___0`, `___1`,
/// `___2` etc. They'll all be unique within the program.
pub struct UidGenerator<'a> {
names: FxHashMap<&'a str, UidName>,
allocator: &'a Allocator,
}

/// Details of next UID for a base name.
//
// `#[repr(align(8))]` on 64-bit platforms so can fit in a single register.
#[cfg_attr(target_pointer_width = "64", repr(align(8)))]
#[derive(Clone, Copy)]
struct UidName {
/// Digits appended to end of name.
/// When generating a UID, increment this field and use that as the postfix.
/// This field is never 0, so postfix will be at least 2.
postfix: u32,
/// Number of underscores to prefix name with.
underscore_count: u32,
}

impl<'a> UidGenerator<'a> {
/// Create [`UidGenerator`].
pub(super) fn new(scoping: &Scoping, allocator: &'a Allocator) -> Self {
let mut generator = Self { names: FxHashMap::default(), allocator };

for name in scoping.symbol_names() {
generator.add(name);
}
for &name in scoping.root_unresolved_references().keys() {
generator.add(name);
}

generator
}

/// Add a record to [`UidGenerator`].
fn add(&mut self, name: &str) {
// If `name` does not start with `_`, exit
if name.as_bytes().first() != Some(&b'_') {
return;
}

// Trim off underscores from start of `name`
let original_len = name.len();
// SAFETY: We just check first byte of `name` is `_`
let name = unsafe { name.get_unchecked(1..) };
let mut name = name.trim_start_matches('_');
#[expect(clippy::cast_possible_truncation)]
let underscore_count = (original_len - name.len()) as u32;
let mut uid_name = UidName { underscore_count, postfix: 1 };

// Find digits on end of `name`
let last_non_digit_index = name.as_bytes().iter().rposition(|&b| !b.is_ascii_digit());
let parts = match last_non_digit_index {
Some(last_non_digit_index) => {
if last_non_digit_index == name.len() - 1 {
// No digits on end
None
} else {
// Name ends with digits
let digit_index = last_non_digit_index + 1;
debug_assert!(name.as_bytes().get(digit_index).is_some_and(u8::is_ascii_digit));
// SAFETY: There's an ASCII digit at `digit_index`, so slicing `name` at that index
// is guaranteed to yield 2 valid UTF-8 strings. `digit_index` cannot be out of bounds.
unsafe {
let without_digits = name.get_unchecked(..digit_index);
let digits = name.get_unchecked(digit_index..);
Some((without_digits, digits))
}
}
}
None => {
if name.is_empty() {
// Name consists purely of `_`s e.g. `_` or `___`
None
} else {
// Name consists of `_`s followed by digits e.g. `_123`
Some(("", name))
}
}
};

if let Some((without_digits, digits)) = parts {
const U32_MAX_LEN: usize = "4294967295".len(); // 4294967295 = u32::MAX
// SAFETY: `digits` cannot be empty
let first_digit = unsafe { *digits.as_bytes().get_unchecked(0) };
if first_digit == b'0' || digits.len() > U32_MAX_LEN {
// We don't create UIDs with postfix starting with 0, or greater than `u32::MAX`,
// so can ignore this - can't clash
return;
}
if let Ok(n) = digits.parse::<u32>() {
if n == 1 {
// We don't create UIDs with postfix of 1, so can ignore this - can't clash
return;
}
name = without_digits;
uid_name.postfix = n;
} else {
// Digits represent a number greater than `u32::MAX`.
// We don't create UIDs with postfix over `u32::MAX` so can ignore this - can't clash.
return;
}
}

// Unfortunately can't use `Entry` API here because `name` doesn't have required lifetime `'a`,
// because it comes from `Semantic`'s arena, not the AST arena
if let Some(existing_uid_name) = self.names.get_mut(name) {
if uid_name.underscore_count > existing_uid_name.underscore_count
|| (uid_name.underscore_count == existing_uid_name.underscore_count
&& uid_name.postfix > existing_uid_name.postfix)
{
*existing_uid_name = uid_name;
}
} else {
let name = self.allocator.alloc_str(name);
self.names.insert(name, uid_name);
}
}

/// Create a unique identifier.
///
/// The UID returned will be added to the list of used identifiers, so this method will never
/// return the same UID twice.
///
/// Caller must ensure `name` is a valid JS identifier, after a `_` is prepended on start.
/// The fact that a `_` will be prepended on start means providing an empty string or a string
/// starting with a digit (0-9) is fine.
///
/// Please see docs for [`UidGenerator`] for further info.
pub(super) fn create(&mut self, name: &str) -> Atom<'a> {
// Get the base name, with `_`s trimmed from start, and digits trimmed from end.
// i.e. `__foo123` -> `foo`.
// Equivalent to `name.trim_start_matches('_').trim_end_matches(|c: char| c.is_ascii_digit())`
// but more efficient as operates on bytes not chars
let mut bytes = name.as_bytes();
while bytes.first() == Some(&b'_') {
bytes = &bytes[1..];
}
while matches!(bytes.last(), Some(b) if b.is_ascii_digit()) {
bytes = &bytes[0..bytes.len() - 1];
}
// SAFETY: We started with a valid UTF8 `&str` and have only trimmed off ASCII characters,
// so remainder must still be valid UTF8
let base = unsafe { str::from_utf8_unchecked(bytes) };

// Generate UID.
// Unfortunately can't use `Entry` API here as `name` doesn't have required lifetime `'a`.
if let Some(uid_name) = self.names.get_mut(base) {
// AST contains identifier(s) with this base already.
// Get next postfix.
if uid_name.postfix < u32::MAX {
// Increment `postfix`
uid_name.postfix += 1;
} else {
// Identifier `_<base>4294967295` was already used.
// Can't increment `postfix` as it would wrap around, so increment `underscore_count` instead.
// It shouldn't be possible for `underscore_count` to be `u32::MAX` too, because that
// would require an identifier comprising `u32::MAX` x underscores in source text.
// That's theoretically possible, but source text is limited to `u32::MAX` bytes,
// so it'd be the entirety of the source text. Therefore `postfix` would be 1.
uid_name.underscore_count += 1;
uid_name.postfix = 2;
}

// Format UID `_<base><postfix>`.
// If `underscore_count > 1`, add further underscores to the start.
let mut buffer = ItoaBuffer::new();
let digits = buffer.format(uid_name.postfix);

let uid = if uid_name.underscore_count == 1 {
ArenaString::from_strs_array_in(["_", base, digits], self.allocator)
} else {
let mut uid = ArenaString::with_capacity_in(
uid_name.underscore_count as usize + base.len() + digits.len(),
self.allocator,
);
uid.extend(iter::repeat_n("_", uid_name.underscore_count as usize));
uid.push_str(base);
uid.push_str(digits);
uid
};

Atom::from(uid)
} else {
let uid = Atom::from(ArenaString::from_strs_array_in(["_", base], self.allocator));
// SAFETY: String starts with `_`, so trimming off that byte leaves a valid UTF-8 string
let base = unsafe { uid.as_str().get_unchecked(1..) };
self.names.insert(base, UidName { underscore_count: 1, postfix: 1 });
uid
}
}
}

#[cfg(test)]
#[test]
fn uids() {
// (&[ initial, ... ], &[ (name, expected_uid), ... ])
#[expect(clippy::type_complexity)]
let cases: &[(&[&str], &[(&str, &str)])] = &[
(&[], &[("foo", "_foo"), ("foo", "_foo2"), ("foo", "_foo3")]),
(
&["foo", "foo0", "foo1", "foo2", "foo10", "_bar"],
&[("foo", "_foo"), ("foo", "_foo2"), ("foo", "_foo3")],
),
(
&["_foo0", "_foo1", "__foo0", "____foo1", "_foo01", "_foo012345", "_foo000000"],
&[("foo", "_foo"), ("foo", "_foo2"), ("foo", "_foo3")],
),
(&[], &[("_foo", "_foo"), ("__foo", "_foo2"), ("_____foo", "_foo3")]),
(&[], &[("_foo123", "_foo"), ("__foo456", "_foo2"), ("_____foo789", "_foo3")]),
(&["_foo"], &[("foo", "_foo2"), ("foo", "_foo3"), ("foo", "_foo4")]),
(&["_foo3"], &[("foo", "_foo4"), ("foo", "_foo5"), ("foo", "_foo6")]),
(&["__foo"], &[("foo", "__foo2"), ("foo", "__foo3"), ("foo", "__foo4")]),
(&["__foo8"], &[("foo", "__foo9"), ("foo", "__foo10"), ("foo", "__foo11")]),
(&["_foo999", "____foo"], &[("foo", "____foo2"), ("foo", "____foo3"), ("foo", "____foo4")]),
(
&["_foo4294967293"],
&[
("foo", "_foo4294967294"),
("foo", "_foo4294967295"),
("foo", "__foo2"),
("foo", "__foo3"),
],
),
(
&["___foo4294967293"],
&[
("foo", "___foo4294967294"),
("foo", "___foo4294967295"),
("foo", "____foo2"),
("foo", "____foo3"),
],
),
(&[], &[("_", "_"), ("_", "_2"), ("_", "_3")]),
(
&["_0", "_1", "__0", "____1", "_01", "_012345", "_000000"],
&[("_", "_"), ("_", "_2"), ("_", "_3")],
),
(&[], &[("___", "_"), ("_____", "_2"), ("_____", "_3")]),
(&["_"], &[("_", "_2"), ("_", "_3"), ("_", "_4")]),
(&["_4"], &[("_", "_5"), ("_", "_6"), ("_", "_7")]),
(&["___"], &[("_", "___2"), ("_", "___3"), ("_", "___4")]),
(&["___99"], &[("_", "___100"), ("_", "___101"), ("_", "___102")]),
(&["_"], &[("_123", "_2"), ("__456", "_3"), ("___789", "_4")]),
];

let allocator = Allocator::default();
for &(used_names, created) in cases {
let mut generator = UidGenerator { names: FxHashMap::default(), allocator: &allocator };
for &used_name in used_names {
generator.add(used_name);
}

for &(name, uid) in created {
assert_eq!(generator.create(name), uid);
}
}
}
Loading
Loading