62 changes: 49 additions & 13 deletions turbopack/crates/turbo-rcstr/src/dynamic.rs
@@ -3,11 +3,11 @@ use std::{num::NonZeroU8, ptr::NonNull};
use triomphe::Arc;

use crate::{
INLINE_TAG, INLINE_TAG_INIT, LEN_OFFSET, RcStr, TAG_MASK,
INLINE_TAG, INLINE_TAG_INIT, LEN_OFFSET, RcStr, STATIC_TAG, TAG_MASK,
tagged_value::{MAX_INLINE_LEN, TaggedValue},
};

pub(crate) struct PrehashedString {
pub struct PrehashedString {
pub value: String,
/// This is not the actual `fxhash`; rather, it is the value that is passed to
/// `write_u64` of [rustc_hash::FxHasher].
@@ -61,6 +61,22 @@ pub(crate) fn new_atom<T: AsRef<str> + Into<String>>(text: T) -> RcStr {
}
}

#[inline(always)]
pub(crate) fn new_static_atom(string: &'static PrehashedString) -> RcStr {
let mut entry = string as *const PrehashedString;
debug_assert!(0 == entry as u8 & TAG_MASK);
// Tag it as a static pointer
entry = ((entry as usize) | STATIC_TAG as usize) as *mut PrehashedString;
let ptr: NonNull<PrehashedString> = unsafe {
// Safety: Box::into_raw returns a non-null pointer
Review comment (Contributor):
The safety comment for NonNull::new_unchecked incorrectly states "Box::into_raw returns a non-null pointer" but this function doesn't use Box::into_raw.


Analysis

The safety comment on line 96 is misleading and incorrect. The function new_static_atom takes a reference to a static PrehashedString and converts it to a tagged pointer, but it never uses Box::into_raw. The comment appears to be copied from another function.

The actual safety requirement for NonNull::new_unchecked(entry as *mut _) is that entry is non-null. In this case, safety is ensured because:

  1. We start with string as *const PrehashedString, where string is a &'static PrehashedString (references are never null)
  2. We then tag it with | STATIC_TAG as usize, which only affects the lower bits and preserves the non-null property
  3. The conversions as *mut PrehashedString and as *mut _ preserve non-null-ness

The incorrect comment could mislead future maintainers about the actual safety invariants.


Recommendation

Replace the incorrect safety comment with an accurate one:

// Safety: entry is non-null because it's derived from a valid reference and tagging preserves non-null-ness
NonNull::new_unchecked(entry as *mut _)

NonNull::new_unchecked(entry as *mut _)
};

RcStr {
unsafe_data: TaggedValue::new_ptr(ptr),
}
}
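Note (illustrative, not part of the diff): the tagging above works because PrehashedString's alignment guarantees the low bits of any valid pointer to it are zero, so the two tag bits can live in the address itself. A minimal sketch of the round trip, reusing the crate's constants:

fn tag_static(r: &'static PrehashedString) -> usize {
    let addr = r as *const PrehashedString as usize;
    // An alignment of at least 4 leaves the bottom two bits free for TAG_MASK.
    debug_assert!(addr & TAG_MASK as usize == 0);
    addr | STATIC_TAG as usize
}

fn untag(tagged: usize) -> *const PrehashedString {
    (tagged & !(TAG_MASK as usize)) as *const PrehashedString
}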

/// Attempts to construct an RcStr but only if it can be constructed inline.
/// This is primarily useful in constant contexts.
#[doc(hidden)]
@@ -90,7 +106,7 @@ const SEED2: u64 = 0x13198a2e03707344;
const PREVENT_TRIVIAL_ZERO_COLLAPSE: u64 = 0xa4093822299f31d0;

#[inline]
fn multiply_mix(x: u64, y: u64) -> u64 {
const fn multiply_mix(x: u64, y: u64) -> u64 {
#[cfg(target_pointer_width = "64")]
{
// We compute the full u64 x u64 -> u128 product, this is a single mul
@@ -131,6 +147,26 @@ fn multiply_mix(x: u64, y: u64) -> u64 {
}
}
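Note: the fold hides the rest of the body; per the rustc-hash source this file cites, the 64-bit branch boils down to the following sketch.

const fn multiply_mix_64(x: u64, y: u64) -> u64 {
    // Full 64x64 -> 128-bit product, then XOR the two halves together.
    let full = (x as u128) * (y as u128);
    let lo = full as u64;
    let hi = (full >> 64) as u64;
    lo ^ hi
}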

// Const-compatible helper to read a little-endian u64 from a byte slice at a given offset
const fn read_u64_le(bytes: &[u8], offset: usize) -> u64 {
(bytes[offset] as u64)
| ((bytes[offset + 1] as u64) << 8)
| ((bytes[offset + 2] as u64) << 16)
| ((bytes[offset + 3] as u64) << 24)
| ((bytes[offset + 4] as u64) << 32)
| ((bytes[offset + 5] as u64) << 40)
| ((bytes[offset + 6] as u64) << 48)
| ((bytes[offset + 7] as u64) << 56)
}

// Const-compatible helper to read a little-endian u32 from a byte slice at a given offset
const fn read_u32_le(bytes: &[u8], offset: usize) -> u32 {
(bytes[offset] as u32)
| ((bytes[offset + 1] as u32) << 8)
| ((bytes[offset + 2] as u32) << 16)
| ((bytes[offset + 3] as u32) << 24)
}
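These helpers are needed because the slice-to-array try_into() calls used previously are trait method calls, which cannot run in a const fn on stable Rust. A compile-time sanity check (illustrative, assuming the helpers above):

const _: () = {
    let bytes = *b"\x01\x02\x03\x04\x05\x06\x07\x08";
    assert!(read_u64_le(&bytes, 0) == u64::from_le_bytes(bytes));
    assert!(read_u32_le(&bytes, 4) == u32::from_le_bytes([5, 6, 7, 8]));
};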

/// Copied from `hash_bytes` of `rustc-hash`.
///
/// See: https://github.com/rust-lang/rustc-hash/blob/dc5c33f1283de2da64d8d7a06401d91aded03ad4/src/lib.rs#L252-L297
@@ -149,19 +185,20 @@ fn multiply_mix(x: u64, y: u64) -> u64 {
/// We don't bother avalanching here as we'll feed this hash into a
/// multiplication after which we take the high bits, which avalanches for us.
#[inline]
fn hash_bytes(bytes: &[u8]) -> u64 {
#[doc(hidden)]
pub const fn hash_bytes(bytes: &[u8]) -> u64 {
let len = bytes.len();
let mut s0 = SEED1;
let mut s1 = SEED2;

if len <= 16 {
// XOR the input into s0, s1.
if len >= 8 {
s0 ^= u64::from_le_bytes(bytes[0..8].try_into().unwrap());
s1 ^= u64::from_le_bytes(bytes[len - 8..].try_into().unwrap());
s0 ^= read_u64_le(bytes, 0);
s1 ^= read_u64_le(bytes, len - 8);
} else if len >= 4 {
s0 ^= u32::from_le_bytes(bytes[0..4].try_into().unwrap()) as u64;
s1 ^= u32::from_le_bytes(bytes[len - 4..].try_into().unwrap()) as u64;
s0 ^= read_u32_le(bytes, 0) as u64;
s1 ^= read_u32_le(bytes, len - 4) as u64;
} else if len > 0 {
let lo = bytes[0];
let mid = bytes[len / 2];
@@ -173,8 +210,8 @@ fn hash_bytes(bytes: &[u8]) -> u64 {
// Handle bulk (can partially overlap with suffix).
let mut off = 0;
while off < len - 16 {
let x = u64::from_le_bytes(bytes[off..off + 8].try_into().unwrap());
let y = u64::from_le_bytes(bytes[off + 8..off + 16].try_into().unwrap());
let x = read_u64_le(bytes, off);
let y = read_u64_le(bytes, off + 8);

// Replace s1 with a mix of s0, x, and y, and s0 with s1.
// This ensures the compiler can unroll this loop into two
@@ -188,9 +225,8 @@ fn hash_bytes(bytes: &[u8]) -> u64 {
off += 16;
}

let suffix = &bytes[len - 16..];
s0 ^= u64::from_le_bytes(suffix[0..8].try_into().unwrap());
s1 ^= u64::from_le_bytes(suffix[8..16].try_into().unwrap());
s0 ^= read_u64_le(bytes, len - 16);
s1 ^= read_u64_le(bytes, len - 8);
}

multiply_mix(s0, s1) ^ (len as u64)
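Note: making hash_bytes a pub const fn is what lets the rcstr! macro below precompute hashes at compile time, e.g. (illustrative):

const HELLO_HASH: u64 = hash_bytes(b"hello");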
146 changes: 109 additions & 37 deletions turbopack/crates/turbo-rcstr/src/lib.rs
@@ -71,35 +71,60 @@ pub struct RcStr {
unsafe_data: TaggedValue,
}

const _: () = {
// Enforce that RcStr gets the niche optimization: Option<RcStr> is the same size as RcStr.
assert!(std::mem::size_of::<RcStr>() == std::mem::size_of::<Option<RcStr>>());
};
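Note (illustrative): the niche comes from TaggedValue having a non-zero representation, presumably via its NonZeroU8 tag byte (see INLINE_TAG_INIT below); this is the same guarantee std documents for the non-zero integer types:

const _: () = {
    use std::{mem::size_of, num::NonZeroU8};
    assert!(size_of::<Option<NonZeroU8>>() == size_of::<NonZeroU8>());
};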

unsafe impl Send for RcStr {}
unsafe impl Sync for RcStr {}

// Marks a payload that is stored in an Arc
const DYNAMIC_TAG: u8 = 0b_00;
const DYNAMIC_LOCATION: u8 = 0b_0;
// Marks a payload that is never freed and therefore has a static lifetime (effectively leaked)
const STATIC_TAG: u8 = 0b_10;
// The payload is stored inline
const INLINE_TAG: u8 = 0b_01; // len in upper nybble
const INLINE_LOCATION: u8 = 0b_1;
const INLINE_TAG_INIT: NonZeroU8 = NonZeroU8::new(INLINE_TAG).unwrap();
const TAG_MASK: u8 = 0b_11;
const LOCATION_MASK: u8 = 0b_1;
// For inline tags the length is stored in the upper 4 bits of the tag byte
const LEN_OFFSET: usize = 4;
const LEN_MASK: u8 = 0xf0;

impl RcStr {
#[inline(always)]
fn tag(&self) -> u8 {
self.unsafe_data.tag() & TAG_MASK
self.unsafe_data.tag_byte() & TAG_MASK
}
#[inline(always)]
fn location(&self) -> u8 {
self.unsafe_data.tag_byte() & LOCATION_MASK
}

#[inline(never)]
pub fn as_str(&self) -> &str {
match self.tag() {
DYNAMIC_TAG => unsafe { dynamic::deref_from(self.unsafe_data).value.as_str() },
INLINE_TAG => {
let len = (self.unsafe_data.tag() & LEN_MASK) >> LEN_OFFSET;
let src = self.unsafe_data.data();
unsafe { std::str::from_utf8_unchecked(&src[..(len as usize)]) }
}
match self.location() {
DYNAMIC_LOCATION => self.dynamic_as_str(),
INLINE_LOCATION => self.inline_as_str(),
_ => unsafe { debug_unreachable!() },
}
}

fn inline_as_str(&self) -> &str {
let len = (self.unsafe_data.tag_byte() & LEN_MASK) >> LEN_OFFSET;
let src = self.unsafe_data.data();
unsafe { std::str::from_utf8_unchecked(&src[..(len as usize)]) }
}
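How the length and tag share one byte, as a compile-time sketch using the constants above (illustrative, not part of the diff):

const _: () = {
    // len 5 -> tag byte 0b0101_0001: length in the upper nybble, INLINE_TAG below.
    let len: u8 = 5;
    let tag_byte = (len << LEN_OFFSET) | INLINE_TAG;
    assert!(tag_byte & TAG_MASK == INLINE_TAG);
    assert!((tag_byte & LEN_MASK) >> LEN_OFFSET == len);
};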

// Extract the str reference from a string stored out of line. STATIC_TAG payloads
// share DYNAMIC_LOCATION with DYNAMIC_TAG ones, so this handles both.
fn dynamic_as_str(&self) -> &str {
debug_assert!(self.location() == DYNAMIC_LOCATION);
unsafe { dynamic::deref_from(self.unsafe_data).value.as_str() }
}

/// Returns an owned mutable [`String`].
///
/// This implementation is more efficient than [`ToString::to_string`]:
@@ -114,30 +139,18 @@ impl RcStr {
let arc = unsafe { dynamic::restore_arc(ManuallyDrop::new(self).unsafe_data) };
match Arc::try_unwrap(arc) {
Ok(v) => v.value,
Err(arc) => arc.value.to_string(),
Err(arc) => arc.value.clone(),
}
}
INLINE_TAG => self.as_str().to_string(),
INLINE_TAG => self.inline_as_str().to_string(),
STATIC_TAG => self.dynamic_as_str().to_string(),
_ => unsafe { debug_unreachable!() },
}
}

pub fn map(self, f: impl FnOnce(String) -> String) -> Self {
RcStr::from(Cow::Owned(f(self.into_owned())))
}

#[inline]
pub(crate) fn from_alias(alias: TaggedValue) -> Self {
if alias.tag() & TAG_MASK == DYNAMIC_TAG {
unsafe {
let arc = dynamic::restore_arc(alias);
forget(arc.clone());
forget(arc);
}
}

Self { unsafe_data: alias }
}
}

impl DeterministicHash for RcStr {
@@ -264,7 +277,16 @@ impl From<RcStr> for PathBuf {
impl Clone for RcStr {
#[inline(always)]
fn clone(&self) -> Self {
Self::from_alias(self.unsafe_data)
let alias = self.unsafe_data;
if alias.tag_byte() & TAG_MASK == DYNAMIC_TAG {
unsafe {
let arc = dynamic::restore_arc(alias);
forget(arc.clone());
forget(arc);
}
}

RcStr { unsafe_data: alias }
}
}
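The clone path reconstructs the Arc from the tagged value, clones it to bump the strong count, then forgets both handles so neither decrement runs. The same idiom with std's Arc, which ships it as Arc::increment_strong_count (a sketch, not the crate's code; turbo-rcstr uses triomphe::Arc):

use std::{mem::forget, sync::Arc};

// Bump an Arc's strong count given only a raw pointer, without taking ownership.
// Safety: ptr must originate from Arc::into_raw and still be live.
unsafe fn bump_strong_count<T>(ptr: *const T) {
    let arc = Arc::from_raw(ptr); // temporarily resume ownership
    forget(arc.clone());          // +1 strong count, intentionally never dropped
    forget(arc);                  // hand the original count back untouched
}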

@@ -276,13 +298,18 @@ impl Default for RcStr {

impl PartialEq for RcStr {
fn eq(&self, other: &Self) -> bool {
match (self.tag(), other.tag()) {
(DYNAMIC_TAG, DYNAMIC_TAG) => {
// For inline RcStrs this is sufficient, and for out-of-line values it handles the
// simple identity case
if self.unsafe_data == other.unsafe_data {
return true;
}
// They can still be equal if they are both stored on the heap
match (self.location(), other.location()) {
(DYNAMIC_LOCATION, DYNAMIC_LOCATION) => {
let l = unsafe { deref_from(self.unsafe_data) };
let r = unsafe { deref_from(other.unsafe_data) };
l.hash == r.hash && l.value == r.value
}
(INLINE_TAG, INLINE_TAG) => self.unsafe_data == other.unsafe_data,
_ => false,
}
}
@@ -304,13 +331,13 @@ impl Ord for RcStr {

impl Hash for RcStr {
fn hash<H: Hasher>(&self, state: &mut H) {
match self.tag() {
DYNAMIC_TAG => {
match self.location() {
DYNAMIC_LOCATION => {
let l = unsafe { deref_from(self.unsafe_data) };
state.write_u64(l.hash);
state.write_u8(0xff);
}
INLINE_TAG => {
INLINE_LOCATION => {
self.as_str().hash(state);
}
_ => unsafe { debug_unreachable!() },
@@ -333,33 +360,69 @@ impl<'de> Deserialize<'de> for RcStr {

impl Drop for RcStr {
fn drop(&mut self) {
if self.tag() == DYNAMIC_TAG {
unsafe { drop(dynamic::restore_arc(self.unsafe_data)) }
let alias = self.unsafe_data;
if alias.tag_byte() & TAG_MASK == DYNAMIC_TAG {
unsafe { drop(dynamic::restore_arc(alias)) }
}
}
}

// Exports for our macro
Review comment (vercel bot, Jul 25, 2025):

The Drop implementation for RcStr doesn't handle STATIC_TAG properly, which could lead to incorrect behavior when dropping static strings.


Analysis

The Drop implementation only checks for DYNAMIC_TAG and calls restore_arc for those cases, but it doesn't handle the new STATIC_TAG (0b_10) that was introduced in this PR. Static strings are tagged with STATIC_TAG and point to statically allocated PrehashedString structs that should never be dropped via restore_arc since they are not Arc-managed.

Currently, when a static string is dropped, the Drop implementation does nothing (since it only matches DYNAMIC_TAG), which is actually correct behavior for static strings. However, this creates a subtle dependency on the exact tag values that is not explicitly documented or enforced.

The issue is that the code doesn't make it clear that static strings should not be processed in the Drop implementation, and future changes to the tag system could break this assumption.


Recommendation

Update the Drop implementation to explicitly handle all tag types:

impl Drop for RcStr {
    fn drop(&mut self) {
        match self.tag() {
            DYNAMIC_TAG => unsafe { drop(dynamic::restore_arc(self.unsafe_data)) },
            STATIC_TAG => {
                // Static strings are never dropped - they have static lifetime
                // No action needed
            },
            INLINE_TAG => {
                // Inline strings are stored directly in the tagged value
                // No action needed
            },
            _ => unsafe { debug_unreachable!() },
        }
    }
}

This makes the behavior explicit and prevents future bugs if tag values change.

#[doc(hidden)]
pub const fn inline_atom(s: &str) -> Option<RcStr> {
dynamic::inline_atom(s)
}

#[doc(hidden)]
pub fn from_static(s: &'static PrehashedString) -> RcStr {
dynamic::new_static_atom(s)
}
#[doc(hidden)]
pub use dynamic::{PrehashedString, hash_bytes};

#[doc(hidden)]
impl RcStr {
// Allow the rcstr! macro to skip a tag branch and just copy the struct
#[doc(hidden)]
pub fn unsafe_copy_for_macro(&self) -> RcStr {
debug_assert!(self.tag() == STATIC_TAG);
Self {
unsafe_data: self.unsafe_data,
}
}
}

/// Creates an RcStr from a string literal.
/// Allocates the RcStr inline when possible; otherwise uses a `LazyLock` to manage the allocation.
#[macro_export]
macro_rules! rcstr {
($s:expr) => {{
const INLINE: core::option::Option<$crate::RcStr> = $crate::inline_atom($s);
// this condition should be able to be compile time evaluated and inlined.
// This condition can be evaluated at compile time and inlined.
if INLINE.is_some() {
INLINE.unwrap()
} else {
#[inline(never)]
fn get_rcstr() -> $crate::RcStr {
static CACHE: std::sync::LazyLock<$crate::RcStr> =
std::sync::LazyLock::new(|| $crate::RcStr::from($s));

(*CACHE).clone()
// Allocate static storage for the PrehashedString
static INNER: ::std::sync::LazyLock<$crate::PrehashedString> =
::std::sync::LazyLock::new(|| $crate::PrehashedString {
// `String::from_raw_parts` would be a pretty legitimate option here since
// we could const-allocate this. However, we would need to:
// * cast the static str to a *mut u8
// * convince ourselves that the admonitions about supplying allocator-allocated
//   pointers are only about drop.
// Alternatively we could transmute a Vec allocated via from_raw_parts_in
// using an allocator that never frees.
// For now we just copy and allocate.
value: $s.into(),
// compute the hash at compile time
hash: const { $crate::hash_bytes($s.as_bytes()) },
});
static CACHE: ::std::sync::LazyLock<$crate::RcStr> =
::std::sync::LazyLock::new(|| $crate::from_static(&*INNER));

(*CACHE).unsafe_copy_for_macro()
}
get_rcstr()
}
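Usage, for reference (illustrative; the inline cutoff is MAX_INLINE_LEN bytes):

let short = rcstr!("tiny"); // fits inline: built entirely at compile time
let long = rcstr!("a string literal too long to store inline"); // allocated once behind the LazyLock
assert_eq!(long, rcstr!("a string literal too long to store inline")); // later copies are cheap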
@@ -460,6 +523,15 @@ mod tests {
assert_eq!(rcstr!("abcdefghi"), RcStr::from("abcdefghi"));
}

#[test]
fn test_static_atom() {
const LONG: &str = "a very long string that lives forever";
let leaked = rcstr!(LONG);
let not_leaked = RcStr::from(LONG);
assert_ne!(leaked.tag(), not_leaked.tag());
assert_eq!(leaked, not_leaked);
}

#[test]
fn test_inline_atom() {
// This is a silly test, just asserts that we can evaluate this in a constant context.