diff --git a/integration-tests/build.rs b/integration-tests/build.rs index f155a35..6293e4c 100644 --- a/integration-tests/build.rs +++ b/integration-tests/build.rs @@ -17,6 +17,9 @@ fn main() { "html", "head", "id", + "❤", + "❤💯", + "❤💯❤💯", ]) .write_to_file(&Path::new(&env::var("OUT_DIR").unwrap()).join("test_atom.rs")) .unwrap() diff --git a/integration-tests/src/lib.rs b/integration-tests/src/lib.rs index 81f8406..a788d93 100644 --- a/integration-tests/src/lib.rs +++ b/integration-tests/src/lib.rs @@ -47,10 +47,10 @@ fn test_types() { assert!(Atom::from("").is_static()); assert!(Atom::from("defaults").is_static()); assert!(Atom::from("font-weight").is_static()); - assert!(Atom::from("id").is_static()); - assert!(Atom::from("body").is_static()); - assert!(Atom::from("a").is_static()); - assert!(Atom::from("address").is_static()); + assert!(Atom::from("id").is_inline()); + assert!(Atom::from("body").is_inline()); + assert!(Atom::from("a").is_inline()); + assert!(Atom::from("address").is_inline()); assert!(Atom::from("c").is_inline()); assert!(Atom::from("zz").is_inline()); assert!(Atom::from("zzz").is_inline()); @@ -173,11 +173,11 @@ fn repr() { // Static atoms check_static("defaults", test_atom!("defaults")); check_static("font-weight", test_atom!("font-weight")); - check_static("a", test_atom!("a")); - check_static("address", test_atom!("address")); - check_static("area", test_atom!("area")); // Inline atoms + check("a", 0x0000_0000_0000_6111); + check("address", 0x7373_6572_6464_6171); + check("area", 0x0000_0061_6572_6141); check("e", 0x0000_0000_0000_6511); check("xyzzy", 0x0000_797A_7A79_7851); check("xyzzy01", 0x3130_797A_7A79_7871); @@ -201,7 +201,10 @@ fn atom_macro() { assert_eq!(test_atom!("a"), Atom::from("a")); assert_eq!(test_atom!("body"), Atom::from("body")); assert_eq!(test_atom!("address"), Atom::from("address")); + assert_eq!(test_atom!("❤"), Atom::from("❤")); + assert_eq!(test_atom!("❤💯"), Atom::from("❤💯")); assert_eq!(test_atom!("font-weight"), Atom::from("font-weight")); + assert_eq!(test_atom!("❤💯❤💯"), Atom::from("❤💯❤💯")); } #[test] @@ -300,7 +303,7 @@ fn test_from_string() { #[test] fn test_try_static() { assert!(Atom::try_static("defaults").is_some()); - assert!(Atom::try_static("head").is_some()); + assert!(Atom::try_static("head").is_none()); assert!(Atom::try_static("not in the static table").is_none()); } diff --git a/src/atom.rs b/src/atom.rs index d1bd7b8..7a3dea9 100644 --- a/src/atom.rs +++ b/src/atom.rs @@ -99,6 +99,25 @@ impl Atom { } } + /// For the atom!() macros + #[inline(always)] + #[doc(hidden)] + pub const fn pack_inline(mut n: u64, len: u8) -> Self { + if cfg!(target_endian = "big") { + // Reverse order of top 7 bytes. + // Bottom 8 bits of `n` are zero, and we need that to remain so. + // String data is stored in top 7 bytes, tag and length in bottom byte. + n = n.to_le() << 8; + } + + let data: u64 = (INLINE_TAG as u64) | ((len as u64) << LEN_OFFSET) | n; + Self { + // INLINE_TAG ensures this is never zero + unsafe_data: unsafe { NonZeroU64::new_unchecked(data) }, + phantom: PhantomData, + } + } + fn tag(&self) -> u8 { (self.unsafe_data.get() & TAG_MASK) as u8 } @@ -186,20 +205,22 @@ impl Hash for Atom { impl<'a, Static: StaticAtomSet> From> for Atom { fn from(string_to_add: Cow<'a, str>) -> Self { - Self::try_static_internal(&*string_to_add).unwrap_or_else(|hash| { - let len = string_to_add.len(); - if len <= MAX_INLINE_LEN { - let mut data: u64 = (INLINE_TAG as u64) | ((len as u64) << LEN_OFFSET); - { - let dest = inline_atom_slice_mut(&mut data); - dest[..len].copy_from_slice(string_to_add.as_bytes()) - } - Atom { - // INLINE_TAG ensures this is never zero - unsafe_data: unsafe { NonZeroU64::new_unchecked(data) }, - phantom: PhantomData, - } - } else { + let len = string_to_add.len(); + if len == 0 { + Self::pack_static(Static::empty_string_index()) + } else if len <= MAX_INLINE_LEN { + let mut data: u64 = (INLINE_TAG as u64) | ((len as u64) << LEN_OFFSET); + { + let dest = inline_atom_slice_mut(&mut data); + dest[..len].copy_from_slice(string_to_add.as_bytes()); + } + Atom { + // INLINE_TAG ensures this is never zero + unsafe_data: unsafe { NonZeroU64::new_unchecked(data) }, + phantom: PhantomData, + } + } else { + Self::try_static_internal(&*string_to_add).unwrap_or_else(|hash| { let ptr: std::ptr::NonNull = DYNAMIC_SET.insert(string_to_add, hash.g); let data = ptr.as_ptr() as u64; debug_assert!(0 == data & TAG_MASK); @@ -208,8 +229,8 @@ impl<'a, Static: StaticAtomSet> From> for Atom { unsafe_data: unsafe { NonZeroU64::new_unchecked(data) }, phantom: PhantomData, } - } - }) + }) + } } } diff --git a/string-cache-codegen/lib.rs b/string-cache-codegen/lib.rs index 0fe4819..3228946 100644 --- a/string-cache-codegen/lib.rs +++ b/string-cache-codegen/lib.rs @@ -187,11 +187,19 @@ impl AtomType { // which would cause divisions by zero in rust-phf. self.atoms.insert(String::new()); - let atoms: Vec<&str> = self.atoms.iter().map(|s| &**s).collect(); - let hash_state = phf_generator::generate_hash(&atoms); + // Strings over 7 bytes + empty string added to static set. + // Otherwise stored inline. + let (static_strs, inline_strs): (Vec<_>, Vec<_>) = self + .atoms + .iter() + .map(String::as_str) + .partition(|s| s.len() > 7 || s.is_empty()); + + // Static strings + let hash_state = phf_generator::generate_hash(&static_strs); let phf_generator::HashState { key, disps, map } = hash_state; let (disps0, disps1): (Vec<_>, Vec<_>) = disps.into_iter().unzip(); - let atoms: Vec<&str> = map.iter().map(|&idx| atoms[idx]).collect(); + let atoms: Vec<&str> = map.iter().map(|&idx| static_strs[idx]).collect(); let empty_string_index = atoms.iter().position(|s| s.is_empty()).unwrap() as u32; let indices = 0..atoms.len() as u32; @@ -228,16 +236,33 @@ impl AtomType { let macro_name = new_term(&*self.macro_name); let module = module.parse::().unwrap(); let atom_prefix = format!("ATOM_{}_", type_name.to_string().to_uppercase()); - let const_names: Vec<_> = atoms + let new_const_name = |atom: &str| { + let mut name = atom_prefix.clone(); + for c in atom.chars() { + name.push_str(&format!("_{:02X}", c as u32)) + } + new_term(&name) + }; + let const_names: Vec<_> = atoms.iter().copied().map(new_const_name).collect(); + + // Inline strings + let (inline_const_names, inline_values_and_lengths): (Vec<_>, Vec<_>) = inline_strs .iter() - .map(|atom| { - let mut name = atom_prefix.clone(); - for c in atom.chars() { - name.push_str(&format!("_{:02X}", c as u32)) + .map(|s| { + let const_name = new_const_name(s); + + let mut value = 0u64; + for (index, c) in s.bytes().enumerate() { + value = value | ((c as u64) << (index * 8 + 8)); } - new_term(&name) + + let len = s.len() as u8; + + (const_name, (value, len)) }) - .collect(); + .unzip(); + let (inline_values, inline_lengths): (Vec<_>, Vec<_>) = + inline_values_and_lengths.into_iter().unzip(); quote! { #atom_doc @@ -265,6 +290,9 @@ impl AtomType { #( pub const #const_names: #type_name = #type_name::pack_static(#indices); )* + #( + pub const #inline_const_names: #type_name = #type_name::pack_inline(#inline_values, #inline_lengths); + )* #macro_doc #[macro_export] @@ -272,6 +300,9 @@ impl AtomType { #( (#atoms) => { #module::#const_names }; )* + #( + (#inline_strs) => { #module::#inline_const_names }; + )* } } }