diff --git a/turbopack/crates/turbo-rcstr/src/dynamic.rs b/turbopack/crates/turbo-rcstr/src/dynamic.rs index 7e1c34a2e6bb21..31b96beee01da8 100644 --- a/turbopack/crates/turbo-rcstr/src/dynamic.rs +++ b/turbopack/crates/turbo-rcstr/src/dynamic.rs @@ -3,12 +3,37 @@ use std::{num::NonZeroU8, ptr::NonNull}; use triomphe::Arc; use crate::{ - INLINE_TAG, INLINE_TAG_INIT, LEN_OFFSET, RcStr, TAG_MASK, + INLINE_TAG, INLINE_TAG_INIT, LEN_OFFSET, RcStr, STATIC_TAG, TAG_MASK, tagged_value::{MAX_INLINE_LEN, TaggedValue}, }; -pub(crate) struct PrehashedString { - pub value: String, +pub enum Payload { + String(String), + Ref(&'static str), +} + +impl Payload { + pub(crate) fn as_str(&self) -> &str { + match self { + Payload::String(s) => s, + Payload::Ref(s) => s, + } + } + pub(crate) fn into_string(self) -> String { + match self { + Payload::String(s) => s, + Payload::Ref(r) => r.to_string(), + } + } +} +impl PartialEq for Payload { + fn eq(&self, other: &Self) -> bool { + self.as_str() == other.as_str() + } +} + +pub struct PrehashedString { + pub value: Payload, /// This is not the actual `fxhash`, but rather it's a value that passed to /// `write_u64` of [rustc_hash::FxHasher]. pub hash: u64, @@ -46,7 +71,7 @@ pub(crate) fn new_atom + Into>(text: T) -> RcStr { let hash = hash_bytes(text.as_ref().as_bytes()); let entry: Arc = Arc::new(PrehashedString { - value: text.into(), + value: Payload::String(text.into()), hash, }); let entry = Arc::into_raw(entry); @@ -61,6 +86,22 @@ pub(crate) fn new_atom + Into>(text: T) -> RcStr { } } +#[inline(always)] +pub(crate) fn new_static_atom(string: &'static PrehashedString) -> RcStr { + let mut entry = string as *const PrehashedString; + debug_assert!(0 == entry as u8 & TAG_MASK); + // Tag it as a static pointer + entry = ((entry as usize) | STATIC_TAG as usize) as *mut PrehashedString; + let ptr: NonNull = unsafe { + // Safety: Box::into_raw returns a non-null pointer + NonNull::new_unchecked(entry as *mut _) + }; + + RcStr { + unsafe_data: TaggedValue::new_ptr(ptr), + } +} + /// Attempts to construct an RcStr but only if it can be constructed inline. /// This is primarily useful in constant contexts. #[doc(hidden)] @@ -90,7 +131,7 @@ const SEED2: u64 = 0x13198a2e03707344; const PREVENT_TRIVIAL_ZERO_COLLAPSE: u64 = 0xa4093822299f31d0; #[inline] -fn multiply_mix(x: u64, y: u64) -> u64 { +const fn multiply_mix(x: u64, y: u64) -> u64 { #[cfg(target_pointer_width = "64")] { // We compute the full u64 x u64 -> u128 product, this is a single mul @@ -131,6 +172,26 @@ fn multiply_mix(x: u64, y: u64) -> u64 { } } +// Const compatible helper function to read a u64 from a byte array at a given offset +const fn read_u64_le(bytes: &[u8], offset: usize) -> u64 { + (bytes[offset] as u64) + | ((bytes[offset + 1] as u64) << 8) + | ((bytes[offset + 2] as u64) << 16) + | ((bytes[offset + 3] as u64) << 24) + | ((bytes[offset + 4] as u64) << 32) + | ((bytes[offset + 5] as u64) << 40) + | ((bytes[offset + 6] as u64) << 48) + | ((bytes[offset + 7] as u64) << 56) +} + +// Const compatible helper function to read a u32 from a byte array at a given offset +const fn read_u32_le(bytes: &[u8], offset: usize) -> u32 { + (bytes[offset] as u32) + | ((bytes[offset + 1] as u32) << 8) + | ((bytes[offset + 2] as u32) << 16) + | ((bytes[offset + 3] as u32) << 24) +} + /// Copied from `hash_bytes` of `rustc-hash`. /// /// See: https://github.com/rust-lang/rustc-hash/blob/dc5c33f1283de2da64d8d7a06401d91aded03ad4/src/lib.rs#L252-L297 @@ -149,7 +210,8 @@ fn multiply_mix(x: u64, y: u64) -> u64 { /// We don't bother avalanching here as we'll feed this hash into a /// multiplication after which we take the high bits, which avalanches for us. #[inline] -fn hash_bytes(bytes: &[u8]) -> u64 { +#[doc(hidden)] +pub const fn hash_bytes(bytes: &[u8]) -> u64 { let len = bytes.len(); let mut s0 = SEED1; let mut s1 = SEED2; @@ -157,11 +219,11 @@ fn hash_bytes(bytes: &[u8]) -> u64 { if len <= 16 { // XOR the input into s0, s1. if len >= 8 { - s0 ^= u64::from_le_bytes(bytes[0..8].try_into().unwrap()); - s1 ^= u64::from_le_bytes(bytes[len - 8..].try_into().unwrap()); + s0 ^= read_u64_le(bytes, 0); + s1 ^= read_u64_le(bytes, len - 8); } else if len >= 4 { - s0 ^= u32::from_le_bytes(bytes[0..4].try_into().unwrap()) as u64; - s1 ^= u32::from_le_bytes(bytes[len - 4..].try_into().unwrap()) as u64; + s0 ^= read_u32_le(bytes, 0) as u64; + s1 ^= read_u32_le(bytes, len - 4) as u64; } else if len > 0 { let lo = bytes[0]; let mid = bytes[len / 2]; @@ -173,8 +235,8 @@ fn hash_bytes(bytes: &[u8]) -> u64 { // Handle bulk (can partially overlap with suffix). let mut off = 0; while off < len - 16 { - let x = u64::from_le_bytes(bytes[off..off + 8].try_into().unwrap()); - let y = u64::from_le_bytes(bytes[off + 8..off + 16].try_into().unwrap()); + let x = read_u64_le(bytes, off); + let y = read_u64_le(bytes, off + 8); // Replace s1 with a mix of s0, x, and y, and s0 with s1. // This ensures the compiler can unroll this loop into two @@ -188,9 +250,8 @@ fn hash_bytes(bytes: &[u8]) -> u64 { off += 16; } - let suffix = &bytes[len - 16..]; - s0 ^= u64::from_le_bytes(suffix[0..8].try_into().unwrap()); - s1 ^= u64::from_le_bytes(suffix[8..16].try_into().unwrap()); + s0 ^= read_u64_le(bytes, len - 16); + s1 ^= read_u64_le(bytes, len - 8); } multiply_mix(s0, s1) ^ (len as u64) diff --git a/turbopack/crates/turbo-rcstr/src/lib.rs b/turbopack/crates/turbo-rcstr/src/lib.rs index 9839d224c32d38..b43c61fbf2593a 100644 --- a/turbopack/crates/turbo-rcstr/src/lib.rs +++ b/turbopack/crates/turbo-rcstr/src/lib.rs @@ -17,7 +17,7 @@ use triomphe::Arc; use turbo_tasks_hash::{DeterministicHash, DeterministicHasher}; use crate::{ - dynamic::{deref_from, new_atom}, + dynamic::{deref_from, hash_bytes, new_atom}, tagged_value::TaggedValue, }; @@ -71,35 +71,61 @@ pub struct RcStr { unsafe_data: TaggedValue, } +const _: () = { + // Enforce that RcStr triggers the non-zero size optimization. + assert!(std::mem::size_of::() == std::mem::size_of::>()); +}; + unsafe impl Send for RcStr {} unsafe impl Sync for RcStr {} +// Marks a payload that is stored in an Arc const DYNAMIC_TAG: u8 = 0b_00; +const PREHASHED_STRING_LOCATION: u8 = 0b_0; +// Marks a payload that has been leaked since it has a static lifetime +const STATIC_TAG: u8 = 0b_10; +// The payload is stored inline const INLINE_TAG: u8 = 0b_01; // len in upper nybble +const INLINE_LOCATION: u8 = 0b_1; const INLINE_TAG_INIT: NonZeroU8 = NonZeroU8::new(INLINE_TAG).unwrap(); const TAG_MASK: u8 = 0b_11; +const LOCATION_MASK: u8 = 0b_1; +// For inline tags the length is stored in the upper 4 bits of the tag byte const LEN_OFFSET: usize = 4; const LEN_MASK: u8 = 0xf0; impl RcStr { #[inline(always)] fn tag(&self) -> u8 { - self.unsafe_data.tag() & TAG_MASK + self.unsafe_data.tag_byte() & TAG_MASK + } + #[inline(always)] + fn location(&self) -> u8 { + self.unsafe_data.tag_byte() & LOCATION_MASK } #[inline(never)] pub fn as_str(&self) -> &str { - match self.tag() { - DYNAMIC_TAG => unsafe { dynamic::deref_from(self.unsafe_data).value.as_str() }, - INLINE_TAG => { - let len = (self.unsafe_data.tag() & LEN_MASK) >> LEN_OFFSET; - let src = self.unsafe_data.data(); - unsafe { std::str::from_utf8_unchecked(&src[..(len as usize)]) } - } + match self.location() { + PREHASHED_STRING_LOCATION => self.prehashed_string_as_str(), + INLINE_LOCATION => self.inline_as_str(), _ => unsafe { debug_unreachable!() }, } } + fn inline_as_str(&self) -> &str { + debug_assert!(self.location() == INLINE_LOCATION); + let len = (self.unsafe_data.tag_byte() & LEN_MASK) >> LEN_OFFSET; + let src = self.unsafe_data.data(); + unsafe { std::str::from_utf8_unchecked(&src[..(len as usize)]) } + } + + // Extract the str reference from a string stored in a PrehashedString + fn prehashed_string_as_str(&self) -> &str { + debug_assert!(self.location() == PREHASHED_STRING_LOCATION); + unsafe { dynamic::deref_from(self.unsafe_data).value.as_str() } + } + /// Returns an owned mutable [`String`]. /// /// This implementation is more efficient than [`ToString::to_string`]: @@ -113,11 +139,12 @@ impl RcStr { // convert `self` into `arc` let arc = unsafe { dynamic::restore_arc(ManuallyDrop::new(self).unsafe_data) }; match Arc::try_unwrap(arc) { - Ok(v) => v.value, - Err(arc) => arc.value.to_string(), + Ok(v) => v.value.into_string(), + Err(arc) => arc.value.as_str().to_string(), } } - INLINE_TAG => self.as_str().to_string(), + INLINE_TAG => self.inline_as_str().to_string(), + STATIC_TAG => self.prehashed_string_as_str().to_string(), _ => unsafe { debug_unreachable!() }, } } @@ -125,19 +152,6 @@ impl RcStr { pub fn map(self, f: impl FnOnce(String) -> String) -> Self { RcStr::from(Cow::Owned(f(self.into_owned()))) } - - #[inline] - pub(crate) fn from_alias(alias: TaggedValue) -> Self { - if alias.tag() & TAG_MASK == DYNAMIC_TAG { - unsafe { - let arc = dynamic::restore_arc(alias); - forget(arc.clone()); - forget(arc); - } - } - - Self { unsafe_data: alias } - } } impl DeterministicHash for RcStr { @@ -264,7 +278,18 @@ impl From for PathBuf { impl Clone for RcStr { #[inline(always)] fn clone(&self) -> Self { - Self::from_alias(self.unsafe_data) + let alias = self.unsafe_data; + // We only need to increment the ref count for DYNAMIC_TAG values + // For STATIC_TAG and INLINE_TAG we can just copy the value. + if alias.tag_byte() & TAG_MASK == DYNAMIC_TAG { + unsafe { + let arc = dynamic::restore_arc(alias); + forget(arc.clone()); + forget(arc); + } + } + + RcStr { unsafe_data: alias } } } @@ -276,13 +301,20 @@ impl Default for RcStr { impl PartialEq for RcStr { fn eq(&self, other: &Self) -> bool { - match (self.tag(), other.tag()) { - (DYNAMIC_TAG, DYNAMIC_TAG) => { + // For inline RcStrs this is sufficient and for out of line values it handles a simple + // identity cases + if self.unsafe_data == other.unsafe_data { + return true; + } + // They can still be equal if they are both stored on the heap + match (self.location(), other.location()) { + (PREHASHED_STRING_LOCATION, PREHASHED_STRING_LOCATION) => { let l = unsafe { deref_from(self.unsafe_data) }; let r = unsafe { deref_from(other.unsafe_data) }; l.hash == r.hash && l.value == r.value } - (INLINE_TAG, INLINE_TAG) => self.unsafe_data == other.unsafe_data, + // NOTE: it is never possible for an inline storage string to compare equal to a dynamic + // allocated string, the construction routines separate the strings based on length. _ => false, } } @@ -304,13 +336,13 @@ impl Ord for RcStr { impl Hash for RcStr { fn hash(&self, state: &mut H) { - match self.tag() { - DYNAMIC_TAG => { + match self.location() { + PREHASHED_STRING_LOCATION => { let l = unsafe { deref_from(self.unsafe_data) }; state.write_u64(l.hash); state.write_u8(0xff); } - INLINE_TAG => { + INLINE_LOCATION => { self.as_str().hash(state); } _ => unsafe { debug_unreachable!() }, @@ -333,33 +365,58 @@ impl<'de> Deserialize<'de> for RcStr { impl Drop for RcStr { fn drop(&mut self) { - if self.tag() == DYNAMIC_TAG { - unsafe { drop(dynamic::restore_arc(self.unsafe_data)) } + match self.tag() { + DYNAMIC_TAG => unsafe { drop(dynamic::restore_arc(self.unsafe_data)) }, + STATIC_TAG => { + // do nothing, these are never deallocated + } + INLINE_TAG => { + // do nothing, these payloads need no drop logic + } + _ => unsafe { debug_unreachable!() }, } } } +// Exports for our macro #[doc(hidden)] pub const fn inline_atom(s: &str) -> Option { dynamic::inline_atom(s) } +#[doc(hidden)] +#[inline(always)] +pub fn from_static(s: &'static PrehashedString) -> RcStr { + dynamic::new_static_atom(s) +} +#[doc(hidden)] +pub use dynamic::PrehashedString; + +#[doc(hidden)] +pub const fn make_const_prehashed_string(text: &'static str) -> PrehashedString { + PrehashedString { + value: dynamic::Payload::Ref(text), + hash: hash_bytes(text.as_bytes()), + } +} + /// Create an rcstr from a string literal. /// allocates the RcStr inline when possible otherwise uses a `LazyLock` to manage the allocation. #[macro_export] macro_rules! rcstr { ($s:expr) => {{ const INLINE: core::option::Option<$crate::RcStr> = $crate::inline_atom($s); - // this condition should be able to be compile time evaluated and inlined. + // This condition can be compile time evaluated and inlined. if INLINE.is_some() { INLINE.unwrap() } else { - #[inline(never)] fn get_rcstr() -> $crate::RcStr { - static CACHE: std::sync::LazyLock<$crate::RcStr> = - std::sync::LazyLock::new(|| $crate::RcStr::from($s)); - - (*CACHE).clone() + // Allocate static storage for the PrehashedString + static RCSTR_STORAGE: $crate::PrehashedString = + $crate::make_const_prehashed_string($s); + // This basically just tags a bit onto the raw pointer and wraps it in an RcStr + // should be fast enough to do every time. + $crate::from_static(&RCSTR_STORAGE) } get_rcstr() } @@ -460,6 +517,15 @@ mod tests { assert_eq!(rcstr!("abcdefghi"), RcStr::from("abcdefghi")); } + #[test] + fn test_static_atom() { + const LONG: &str = "a very long string that lives forever"; + let leaked = rcstr!(LONG); + let not_leaked = RcStr::from(LONG); + assert_ne!(leaked.tag(), not_leaked.tag()); + assert_eq!(leaked, not_leaked); + } + #[test] fn test_inline_atom() { // This is a silly test, just asserts that we can evaluate this in a constant context. diff --git a/turbopack/crates/turbo-rcstr/src/tagged_value.rs b/turbopack/crates/turbo-rcstr/src/tagged_value.rs index aa75def5544693..d1329535aea69f 100644 --- a/turbopack/crates/turbo-rcstr/src/tagged_value.rs +++ b/turbopack/crates/turbo-rcstr/src/tagged_value.rs @@ -3,6 +3,13 @@ use std::{num::NonZeroU8, os::raw::c_void, ptr::NonNull, slice}; use self::raw_types::*; +#[cfg(not(any( + target_pointer_width = "32", + target_pointer_width = "16", + feature = "atom_size_64", + feature = "atom_size_128" +)))] +use crate::TAG_MASK; #[cfg(feature = "atom_size_128")] mod raw_types { @@ -89,7 +96,9 @@ impl TaggedValue { feature = "atom_size_128" ))] { - self.value.get() as usize as _ + use crate::TAG_MASK; + + (self.value.get() as usize & !(TAG_MASK as usize)) as _ } #[cfg(not(any( target_pointer_width = "32", @@ -97,8 +106,8 @@ impl TaggedValue { feature = "atom_size_64", feature = "atom_size_128" )))] - unsafe { - std::mem::transmute(Some(self.value)) + { + (self.value.as_ptr() as usize & !(TAG_MASK as usize)) as _ } } @@ -108,7 +117,7 @@ impl TaggedValue { } #[inline(always)] - pub fn tag(&self) -> u8 { + pub fn tag_byte(&self) -> u8 { (self.get_value() & 0xff) as u8 }