Skip to content
91 changes: 76 additions & 15 deletions turbopack/crates/turbo-rcstr/src/dynamic.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,12 +3,37 @@ use std::{num::NonZeroU8, ptr::NonNull};
use triomphe::Arc;

use crate::{
INLINE_TAG, INLINE_TAG_INIT, LEN_OFFSET, RcStr, TAG_MASK,
INLINE_TAG, INLINE_TAG_INIT, LEN_OFFSET, RcStr, STATIC_TAG, TAG_MASK,
tagged_value::{MAX_INLINE_LEN, TaggedValue},
};

pub(crate) struct PrehashedString {
pub value: String,
/// Backing storage for a [`PrehashedString`]: either a heap-allocated
/// `String` or a borrowed `'static` literal (used by the `rcstr!` macro).
pub enum Payload {
    String(String),
    Ref(&'static str),
}

impl Payload {
    /// Borrow the underlying text regardless of which variant holds it.
    pub(crate) fn as_str(&self) -> &str {
        match self {
            Self::String(owned) => owned.as_str(),
            Self::Ref(borrowed) => borrowed,
        }
    }

    /// Consume the payload, yielding an owned `String`.
    /// The `String` variant is returned as-is (no copy); the `Ref`
    /// variant allocates.
    pub(crate) fn into_string(self) -> String {
        match self {
            Self::String(owned) => owned,
            Self::Ref(borrowed) => String::from(borrowed),
        }
    }
}

impl PartialEq for Payload {
    /// Payloads compare by textual content, not by variant.
    fn eq(&self, other: &Self) -> bool {
        str::eq(self.as_str(), other.as_str())
    }
}

pub struct PrehashedString {
pub value: Payload,
/// This is not the actual `fxhash`, but rather it's a value that passed to
/// `write_u64` of [rustc_hash::FxHasher].
pub hash: u64,
Expand Down Expand Up @@ -46,7 +71,7 @@ pub(crate) fn new_atom<T: AsRef<str> + Into<String>>(text: T) -> RcStr {
let hash = hash_bytes(text.as_ref().as_bytes());

let entry: Arc<PrehashedString> = Arc::new(PrehashedString {
value: text.into(),
value: Payload::String(text.into()),
hash,
});
let entry = Arc::into_raw(entry);
Expand All @@ -61,6 +86,22 @@ pub(crate) fn new_atom<T: AsRef<str> + Into<String>>(text: T) -> RcStr {
}
}

/// Wraps a `'static` [`PrehashedString`] in an [`RcStr`] by tagging its
/// address with `STATIC_TAG`. No allocation or reference counting happens
/// here; the pointee lives for the whole program, so it must never be
/// treated as an `Arc` payload (e.g. by `restore_arc`) or freed on drop.
#[inline(always)]
pub(crate) fn new_static_atom(string: &'static PrehashedString) -> RcStr {
    let mut entry = string as *const PrehashedString;
    // The low tag bits of the address must be free for us to use.
    debug_assert!(0 == entry as u8 & TAG_MASK);
    // Tag it as a static pointer
    entry = ((entry as usize) | STATIC_TAG as usize) as *mut PrehashedString;
    let ptr: NonNull<PrehashedString> = unsafe {
        // Safety: `entry` is derived from a valid `&'static` reference, which
        // is never null, and OR-ing the low tag bits in cannot produce null.
        NonNull::new_unchecked(entry as *mut _)
    };

    RcStr {
        unsafe_data: TaggedValue::new_ptr(ptr),
    }
}

/// Attempts to construct an RcStr but only if it can be constructed inline.
/// This is primarily useful in constant contexts.
#[doc(hidden)]
Expand Down Expand Up @@ -90,7 +131,7 @@ const SEED2: u64 = 0x13198a2e03707344;
const PREVENT_TRIVIAL_ZERO_COLLAPSE: u64 = 0xa4093822299f31d0;

#[inline]
fn multiply_mix(x: u64, y: u64) -> u64 {
const fn multiply_mix(x: u64, y: u64) -> u64 {
#[cfg(target_pointer_width = "64")]
{
// We compute the full u64 x u64 -> u128 product, this is a single mul
Expand Down Expand Up @@ -131,6 +172,26 @@ fn multiply_mix(x: u64, y: u64) -> u64 {
}
}

// Const compatible helper function to read a u64 from a byte array at a given offset
// Const-compatible helper: assemble a little-endian u64 from the 8 bytes
// starting at `offset`. (`u64::from_le_bytes` on a sliced array is not
// usable in this const context, hence the manual assembly.)
const fn read_u64_le(bytes: &[u8], offset: usize) -> u64 {
    let mut acc: u64 = 0;
    let mut i = 0;
    while i < 8 {
        // Byte `i` contributes bits `8*i .. 8*i+8` (little-endian order).
        acc |= (bytes[offset + i] as u64) << (8 * i);
        i += 1;
    }
    acc
}

// Const compatible helper function to read a u32 from a byte array at a given offset
// Const-compatible helper: assemble a little-endian u32 from the 4 bytes
// starting at `offset`. Mirrors `read_u64_le` above.
const fn read_u32_le(bytes: &[u8], offset: usize) -> u32 {
    let mut acc: u32 = 0;
    let mut i = 0;
    while i < 4 {
        acc |= (bytes[offset + i] as u32) << (8 * i);
        i += 1;
    }
    acc
}

/// Copied from `hash_bytes` of `rustc-hash`.
///
/// See: https://github.com/rust-lang/rustc-hash/blob/dc5c33f1283de2da64d8d7a06401d91aded03ad4/src/lib.rs#L252-L297
Expand All @@ -149,19 +210,20 @@ fn multiply_mix(x: u64, y: u64) -> u64 {
/// We don't bother avalanching here as we'll feed this hash into a
/// multiplication after which we take the high bits, which avalanches for us.
#[inline]
fn hash_bytes(bytes: &[u8]) -> u64 {
#[doc(hidden)]
pub const fn hash_bytes(bytes: &[u8]) -> u64 {
let len = bytes.len();
let mut s0 = SEED1;
let mut s1 = SEED2;

if len <= 16 {
// XOR the input into s0, s1.
if len >= 8 {
s0 ^= u64::from_le_bytes(bytes[0..8].try_into().unwrap());
s1 ^= u64::from_le_bytes(bytes[len - 8..].try_into().unwrap());
s0 ^= read_u64_le(bytes, 0);
s1 ^= read_u64_le(bytes, len - 8);
} else if len >= 4 {
s0 ^= u32::from_le_bytes(bytes[0..4].try_into().unwrap()) as u64;
s1 ^= u32::from_le_bytes(bytes[len - 4..].try_into().unwrap()) as u64;
s0 ^= read_u32_le(bytes, 0) as u64;
s1 ^= read_u32_le(bytes, len - 4) as u64;
} else if len > 0 {
let lo = bytes[0];
let mid = bytes[len / 2];
Expand All @@ -173,8 +235,8 @@ fn hash_bytes(bytes: &[u8]) -> u64 {
// Handle bulk (can partially overlap with suffix).
let mut off = 0;
while off < len - 16 {
let x = u64::from_le_bytes(bytes[off..off + 8].try_into().unwrap());
let y = u64::from_le_bytes(bytes[off + 8..off + 16].try_into().unwrap());
let x = read_u64_le(bytes, off);
let y = read_u64_le(bytes, off + 8);

// Replace s1 with a mix of s0, x, and y, and s0 with s1.
// This ensures the compiler can unroll this loop into two
Expand All @@ -188,9 +250,8 @@ fn hash_bytes(bytes: &[u8]) -> u64 {
off += 16;
}

let suffix = &bytes[len - 16..];
s0 ^= u64::from_le_bytes(suffix[0..8].try_into().unwrap());
s1 ^= u64::from_le_bytes(suffix[8..16].try_into().unwrap());
s0 ^= read_u64_le(bytes, len - 16);
s1 ^= read_u64_le(bytes, len - 8);
}

multiply_mix(s0, s1) ^ (len as u64)
Expand Down
132 changes: 94 additions & 38 deletions turbopack/crates/turbo-rcstr/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ use triomphe::Arc;
use turbo_tasks_hash::{DeterministicHash, DeterministicHasher};

use crate::{
dynamic::{deref_from, new_atom},
dynamic::{deref_from, hash_bytes, new_atom},
tagged_value::TaggedValue,
};

Expand Down Expand Up @@ -71,35 +71,60 @@ pub struct RcStr {
unsafe_data: TaggedValue,
}

// Compile-time guarantee that `Option<RcStr>` costs no extra space, i.e.
// the compiler found a niche in RcStr to represent `None`.
const _: () = {
    // Enforce that RcStr triggers the non-zero size optimization.
    assert!(std::mem::size_of::<RcStr>() == std::mem::size_of::<Option<RcStr>>());
};

unsafe impl Send for RcStr {}
unsafe impl Sync for RcStr {}

// Marks a payload that is stored in an Arc
const DYNAMIC_TAG: u8 = 0b_00;
const DYNAMIC_LOCATION: u8 = 0b_0;
// Marks a payload that has been leaked since it has a static lifetime
const STATIC_TAG: u8 = 0b_10;
// The payload is stored inline
const INLINE_TAG: u8 = 0b_01; // len in upper nybble
const INLINE_LOCATION: u8 = 0b_1;
const INLINE_TAG_INIT: NonZeroU8 = NonZeroU8::new(INLINE_TAG).unwrap();
const TAG_MASK: u8 = 0b_11;
const LOCATION_MASK: u8 = 0b_1;
// For inline tags the length is stored in the upper 4 bits of the tag byte
const LEN_OFFSET: usize = 4;
const LEN_MASK: u8 = 0xf0;

impl RcStr {
#[inline(always)]
fn tag(&self) -> u8 {
self.unsafe_data.tag() & TAG_MASK
self.unsafe_data.tag_byte() & TAG_MASK
}
// Returns only the low "location" bit of the tag: it distinguishes inline
// storage (INLINE_LOCATION) from pointer-based storage (DYNAMIC_LOCATION),
// deliberately ignoring the static-vs-dynamic distinction since STATIC_TAG
// (0b10) shares a zero low bit with DYNAMIC_TAG (0b00).
#[inline(always)]
fn location(&self) -> u8 {
    self.unsafe_data.tag_byte() & LOCATION_MASK
}

#[inline(never)]
pub fn as_str(&self) -> &str {
match self.tag() {
DYNAMIC_TAG => unsafe { dynamic::deref_from(self.unsafe_data).value.as_str() },
INLINE_TAG => {
let len = (self.unsafe_data.tag() & LEN_MASK) >> LEN_OFFSET;
let src = self.unsafe_data.data();
unsafe { std::str::from_utf8_unchecked(&src[..(len as usize)]) }
}
match self.location() {
DYNAMIC_LOCATION => self.dynamic_as_str(),
INLINE_LOCATION => self.inline_as_str(),
_ => unsafe { debug_unreachable!() },
}
}

// Extract the str from an inline-stored value: the length lives in the
// upper nybble of the tag byte, the bytes in the tagged value's payload.
fn inline_as_str(&self) -> &str {
    let len = (self.unsafe_data.tag_byte() & LEN_MASK) >> LEN_OFFSET;
    let src = self.unsafe_data.data();
    // NOTE(review): assumes the inline bytes were copied from a valid
    // `&str` (hence UTF-8) by the construction routines — confirm there.
    unsafe { std::str::from_utf8_unchecked(&src[..(len as usize)]) }
}

// Extract the str reference from a string stored in a dynamic location.
// Also reached for STATIC_TAG values (see `into_owned`): static and dynamic
// storage share the same low location bit, which is what the assert checks.
// NOTE(review): this assumes `deref_from` tolerates/strips the static tag
// bit before dereferencing — confirm in `dynamic`.
fn dynamic_as_str(&self) -> &str {
    debug_assert!(self.location() == DYNAMIC_LOCATION);
    unsafe { dynamic::deref_from(self.unsafe_data).value.as_str() }
}

/// Returns an owned mutable [`String`].
///
/// This implementation is more efficient than [`ToString::to_string`]:
Expand All @@ -113,31 +138,19 @@ impl RcStr {
// convert `self` into `arc`
let arc = unsafe { dynamic::restore_arc(ManuallyDrop::new(self).unsafe_data) };
match Arc::try_unwrap(arc) {
Ok(v) => v.value,
Err(arc) => arc.value.to_string(),
Ok(v) => v.value.into_string(),
Err(arc) => arc.value.as_str().to_string(),
}
}
INLINE_TAG => self.as_str().to_string(),
INLINE_TAG => self.inline_as_str().to_string(),
STATIC_TAG => self.dynamic_as_str().to_string(),
_ => unsafe { debug_unreachable!() },
}
}

pub fn map(self, f: impl FnOnce(String) -> String) -> Self {
RcStr::from(Cow::Owned(f(self.into_owned())))
}

#[inline]
pub(crate) fn from_alias(alias: TaggedValue) -> Self {
if alias.tag() & TAG_MASK == DYNAMIC_TAG {
unsafe {
let arc = dynamic::restore_arc(alias);
forget(arc.clone());
forget(arc);
}
}

Self { unsafe_data: alias }
}
}

impl DeterministicHash for RcStr {
Expand Down Expand Up @@ -264,7 +277,16 @@ impl From<RcStr> for PathBuf {
impl Clone for RcStr {
#[inline(always)]
fn clone(&self) -> Self {
Self::from_alias(self.unsafe_data)
let alias = self.unsafe_data;
if alias.tag_byte() & TAG_MASK == DYNAMIC_TAG {
unsafe {
let arc = dynamic::restore_arc(alias);
forget(arc.clone());
forget(arc);
}
}

RcStr { unsafe_data: alias }
}
}

Expand All @@ -276,13 +298,20 @@ impl Default for RcStr {

impl PartialEq for RcStr {
fn eq(&self, other: &Self) -> bool {
match (self.tag(), other.tag()) {
(DYNAMIC_TAG, DYNAMIC_TAG) => {
// For inline RcStrs this is sufficient and for out of line values it handles a simple
// identity cases
if self.unsafe_data == other.unsafe_data {
return true;
}
// They can still be equal if they are both stored on the heap
match (self.location(), other.location()) {
(DYNAMIC_LOCATION, DYNAMIC_LOCATION) => {
let l = unsafe { deref_from(self.unsafe_data) };
let r = unsafe { deref_from(other.unsafe_data) };
l.hash == r.hash && l.value == r.value
}
(INLINE_TAG, INLINE_TAG) => self.unsafe_data == other.unsafe_data,
// NOTE: it is never possible for an inline storage string to compare equal to a dynamic
// allocated string, the construction routines separate the strings based on length.
_ => false,
}
}
Expand All @@ -304,13 +333,13 @@ impl Ord for RcStr {

impl Hash for RcStr {
fn hash<H: Hasher>(&self, state: &mut H) {
match self.tag() {
DYNAMIC_TAG => {
match self.location() {
DYNAMIC_LOCATION => {
let l = unsafe { deref_from(self.unsafe_data) };
state.write_u64(l.hash);
state.write_u8(0xff);
}
INLINE_TAG => {
INLINE_LOCATION => {
self.as_str().hash(state);
}
_ => unsafe { debug_unreachable!() },
Expand Down Expand Up @@ -339,27 +368,45 @@ impl Drop for RcStr {
}
}

// Exports for our macro
Copy link
Contributor

@vercel vercel bot Jul 25, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The Drop implementation for RcStr doesn't handle STATIC_TAG properly, which could lead to incorrect behavior when dropping static strings.

View Details

Analysis

The Drop implementation only checks for DYNAMIC_TAG and calls restore_arc for those cases, but it doesn't handle the new STATIC_TAG (0b_10) that was introduced in this PR. Static strings are tagged with STATIC_TAG and point to statically allocated PrehashedString structs that should never be dropped via restore_arc since they are not Arc-managed.

Currently, when a static string is dropped, the Drop implementation does nothing (since it only matches DYNAMIC_TAG), which is actually correct behavior for static strings. However, this creates a subtle dependency on the exact tag values that is not explicitly documented or enforced.

The issue is that the code doesn't make it clear that static strings should not be processed in the Drop implementation, and future changes to the tag system could break this assumption.


Recommendation

Update the Drop implementation to explicitly handle all tag types:

impl Drop for RcStr {
    fn drop(&mut self) {
        match self.tag() {
            DYNAMIC_TAG => unsafe { drop(dynamic::restore_arc(self.unsafe_data)) },
            STATIC_TAG => {
                // Static strings are never dropped - they have static lifetime
                // No action needed
            },
            INLINE_TAG => {
                // Inline strings are stored directly in the tagged value
                // No action needed
            },
            _ => unsafe { debug_unreachable!() },
        }
    }
}

This makes the behavior explicit and prevents future bugs if tag values change.

// Public (but hidden) const wrapper over `dynamic::inline_atom`, exported
// for the `rcstr!` macro; `None` signals the string could not be stored
// inline and the macro must fall back to static storage.
#[doc(hidden)]
pub const fn inline_atom(s: &str) -> Option<RcStr> {
    dynamic::inline_atom(s)
}

// Public (but hidden) wrapper exposing the crate-private
// `dynamic::new_static_atom` to the `rcstr!` macro.
#[doc(hidden)]
#[inline(always)]
pub fn from_static(s: &'static PrehashedString) -> RcStr {
    dynamic::new_static_atom(s)
}
#[doc(hidden)]
pub use dynamic::PrehashedString;

// Const constructor used by the `rcstr!` macro so a literal's hash is
// computed at compile time (via the const `hash_bytes`) and stored next to
// the borrowed text in a `static`.
#[doc(hidden)]
pub const fn make_const_prehashed_string(text: &'static str) -> PrehashedString {
    PrehashedString {
        value: dynamic::Payload::Ref(text),
        hash: hash_bytes(text.as_bytes()),
    }
}

/// Create an rcstr from a string literal.
/// allocates the RcStr inline when possible otherwise uses a `LazyLock` to manage the allocation.
#[macro_export]
macro_rules! rcstr {
($s:expr) => {{
const INLINE: core::option::Option<$crate::RcStr> = $crate::inline_atom($s);
// this condition should be able to be compile time evaluated and inlined.
// This condition can be compile time evaluated and inlined.
if INLINE.is_some() {
INLINE.unwrap()
} else {
#[inline(never)]
fn get_rcstr() -> $crate::RcStr {
static CACHE: std::sync::LazyLock<$crate::RcStr> =
std::sync::LazyLock::new(|| $crate::RcStr::from($s));

(*CACHE).clone()
// Allocate static storage for the PrehashedString
static RCSTR_STORAGE: $crate::PrehashedString =
$crate::make_const_prehashed_string($s);
// This basically just tags a bit onto the raw pointer and wraps it in an RcStr
// should be fast enough to do every time.
$crate::from_static(&RCSTR_STORAGE)
}
get_rcstr()
}
Expand Down Expand Up @@ -460,6 +507,15 @@ mod tests {
assert_eq!(rcstr!("abcdefghi"), RcStr::from("abcdefghi"));
}

#[test]
fn test_static_atom() {
    // Long enough that `rcstr!` cannot use inline storage.
    const LONG: &str = "a very long string that lives forever";
    let leaked = rcstr!(LONG);
    let not_leaked = RcStr::from(LONG);
    // Different representations (different tags) must still compare equal
    // by content.
    assert_ne!(leaked.tag(), not_leaked.tag());
    assert_eq!(leaked, not_leaked);
}

#[test]
fn test_inline_atom() {
// This is a silly test, just asserts that we can evaluate this in a constant context.
Expand Down
Loading
Loading