Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Move properties data over to an (extensible) enum #2140

Merged
merged 4 commits into from
Jun 29, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 8 additions & 11 deletions components/properties/src/maps.rs
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,6 @@ use crate::*;
use core::marker::PhantomData;
use icu_codepointtrie::{CodePointTrie, TrieValue};
use icu_provider::prelude::*;
use zerofrom::ZeroFrom;

/// A wrapper around code point set data, returned by property getters for
/// unicode sets.
Expand Down Expand Up @@ -83,7 +82,7 @@ impl<T: TrieValue> CodePointMapData<T> {
/// assert_eq!(gc.get('🎃'), GeneralCategory::OtherSymbol); // U+1F383 JACK-O-LANTERN
/// ```
pub fn get(&self, ch: char) -> T {
self.data.get().code_point_trie.get(ch as u32)
self.data.get().get_u32(ch as u32)
}

/// Get the value this map has associated with code point `ch`
Expand All @@ -103,7 +102,7 @@ impl<T: TrieValue> CodePointMapData<T> {
/// assert_eq!(gc.get_u32(0x1F383), GeneralCategory::OtherSymbol); // U+1F383 JACK-O-LANTERN
/// ```
pub fn get_u32(&self, ch: u32) -> T {
self.data.get().code_point_trie.get(ch)
self.data.get().get_u32(ch)
}

/// Get a [`CodePointSetData`] for all elements corresponding to a particular value
Expand All @@ -126,7 +125,7 @@ impl<T: TrieValue> CodePointMapData<T> {
/// assert!(!other_letter_set.contains('🎃')); // U+1F383 JACK-O-LANTERN
/// ```
pub fn get_set_for_value(&self, value: T) -> CodePointSetData {
let set = self.data.get().code_point_trie.get_set_for_value(value);
let set = self.data.get().get_set_for_value(value);
CodePointSetData::from_unicode_set(set)
}

Expand All @@ -142,9 +141,7 @@ impl<T: TrieValue> CodePointMapData<T> {

/// Construct a new one from an owned [`CodePointTrie`]
pub fn from_code_point_trie(trie: CodePointTrie<'static, T>) -> Self {
let set = UnicodePropertyMapV1 {
code_point_trie: trie,
};
let set = UnicodePropertyMapV1::from_code_point_trie(trie);
CodePointMapData::from_data(DataPayload::<ErasedMaplikeMarker<T>>::from_owned(set))
}
/// Convert this type to a [`CodePointTrie`], borrowing if possible,
Expand All @@ -158,7 +155,7 @@ impl<T: TrieValue> CodePointMapData<T> {
/// in the data, however exceptions can be made if the performance hit is considered to
/// be okay.
pub fn to_code_point_trie(&self) -> CodePointTrie<'_, T> {
ZeroFrom::zero_from(&self.data.get().code_point_trie)
self.data.get().to_code_point_trie()
}
}

Expand Down Expand Up @@ -189,7 +186,7 @@ impl<'a, T: TrieValue> CodePointMapDataBorrowed<'a, T> {
/// assert_eq!(gc.get('🎃'), GeneralCategory::OtherSymbol); // U+1F383 JACK-O-LANTERN
/// ```
pub fn get(&self, ch: char) -> T {
self.map.code_point_trie.get(ch as u32)
self.map.get_u32(ch as u32)
}

/// Get the value this map has associated with code point `ch`
Expand All @@ -211,7 +208,7 @@ impl<'a, T: TrieValue> CodePointMapDataBorrowed<'a, T> {
/// assert_eq!(gc.get_u32(0x1F383), GeneralCategory::OtherSymbol); // U+1F383 JACK-O-LANTERN
/// ```
pub fn get_u32(&self, ch: u32) -> T {
self.map.code_point_trie.get(ch)
self.map.get_u32(ch)
}

/// Get a [`CodePointSetData`] for all elements corresponding to a particular value
Expand All @@ -235,7 +232,7 @@ impl<'a, T: TrieValue> CodePointMapDataBorrowed<'a, T> {
/// assert!(!other_letter_set.contains('🎃')); // U+1F383 JACK-O-LANTERN
/// ```
pub fn get_set_for_value(&self, value: T) -> CodePointSetData {
let set = self.map.code_point_trie.get_set_for_value(value);
let set = self.map.get_set_for_value(value);
CodePointSetData::from_unicode_set(set)
}
}
Expand Down
82 changes: 76 additions & 6 deletions components/properties/src/provider.rs
Original file line number Diff line number Diff line change
Expand Up @@ -13,33 +13,46 @@ use crate::script::ScriptWithExtensions;
use icu_codepointtrie::{CodePointTrie, TrieValue};
use icu_provider::prelude::*;
use icu_uniset::UnicodeSet;
use zerofrom::ZeroFrom;

/// A set of characters with a particular property.
///
/// This data enum is extensible, more backends may be added in the future.
/// Old data can be used with newer code but not vice versa.
#[derive(Debug, Eq, PartialEq, Clone, yoke::Yokeable, zerofrom::ZeroFrom)]
#[cfg_attr(
feature = "datagen",
derive(serde::Serialize, databake::Bake),
databake(path = icu_properties::provider),
)]
#[cfg_attr(feature = "serde", derive(serde::Deserialize))]
pub struct UnicodePropertyV1<'data> {
#[non_exhaustive]
pub enum UnicodePropertyV1<'data> {
/// The set of characters, represented as an inversion list
#[cfg_attr(feature = "serde", serde(borrow))]
pub inv_list: UnicodeSet<'data>,
InversionList(#[cfg_attr(feature = "serde", serde(borrow))] UnicodeSet<'data>),
// new variants should go BELOW existing ones
// Serde serializes based on variant name and index in the enum
// https://docs.rs/serde/latest/serde/trait.Serializer.html#tymethod.serialize_unit_variant
}

/// A map efficiently storing data about individual characters.
///
/// This data enum is extensible, more backends may be added in the future.
/// Old data can be used with newer code but not vice versa.
#[derive(Clone, Debug, Eq, PartialEq, yoke::Yokeable, zerofrom::ZeroFrom)]
#[cfg_attr(
feature = "datagen",
derive(serde::Serialize, databake::Bake),
databake(path = icu_properties::provider),
)]
#[cfg_attr(feature = "serde", derive(serde::Deserialize))]
pub struct UnicodePropertyMapV1<'data, T: TrieValue> {
#[non_exhaustive]
pub enum UnicodePropertyMapV1<'data, T: TrieValue> {
/// A codepoint trie storing the data
#[cfg_attr(feature = "serde", serde(borrow))]
pub code_point_trie: CodePointTrie<'data, T>,
CodePointTrie(#[cfg_attr(feature = "serde", serde(borrow))] CodePointTrie<'data, T>),
// new variants should go BELOW existing ones
// Serde serializes based on variant name and index in the enum
// https://docs.rs/serde/latest/serde/trait.Serializer.html#tymethod.serialize_unit_variant
}

/// A data structure efficiently storing `Script` and `Script_Extensions` property data.
Expand All @@ -57,6 +70,63 @@ pub struct ScriptWithExtensionsPropertyV1<'data> {
pub data: ScriptWithExtensions<'data>,
}

// See CodePointSetData for documentation of these functions
impl<'data> UnicodePropertyV1<'data> {
    /// Returns whether the stored set contains the character `ch`.
    #[inline]
    pub(crate) fn contains(&self, ch: char) -> bool {
        // Kept as a `match` so that a newly added variant produces a compile
        // error here instead of being silently ignored.
        match self {
            Self::InversionList(list) => list.contains(ch),
        }
    }

    /// Returns whether the stored set contains `ch`, given as a `u32` code point.
    #[inline]
    pub(crate) fn contains_u32(&self, ch: u32) -> bool {
        match self {
            Self::InversionList(list) => list.contains_u32(ch),
        }
    }

    /// Wraps an owned [`UnicodeSet`] in the `InversionList` variant.
    #[inline]
    pub(crate) fn from_unicode_set(l: UnicodeSet<'static>) -> Self {
        Self::InversionList(l)
    }

    /// Builds a [`UnicodeSet`] from the stored inversion list via [`ZeroFrom`].
    #[inline]
    pub(crate) fn to_unicode_set(&'_ self) -> UnicodeSet<'_> {
        match self {
            Self::InversionList(list) => ZeroFrom::zero_from(list),
        }
    }
}

// See CodePointMapData for documentation of these functions
impl<'data, T: TrieValue> UnicodePropertyMapV1<'data, T> {
    /// Looks up the value associated with the code point `ch`.
    #[inline]
    pub(crate) fn get_u32(&self, ch: u32) -> T {
        // Kept as a `match` so that a newly added variant produces a compile
        // error here instead of being silently ignored.
        match self {
            Self::CodePointTrie(trie) => trie.get(ch),
        }
    }

    /// Returns the [`UnicodeSet`] the underlying trie produces for `value`.
    #[inline]
    pub(crate) fn get_set_for_value(&self, value: T) -> UnicodeSet<'static> {
        match self {
            Self::CodePointTrie(trie) => trie.get_set_for_value(value),
        }
    }

    /// Wraps an owned [`CodePointTrie`] in the `CodePointTrie` variant.
    #[inline]
    pub(crate) fn from_code_point_trie(trie: CodePointTrie<'static, T>) -> Self {
        Self::CodePointTrie(trie)
    }

    /// Builds a [`CodePointTrie`] from the stored trie via [`ZeroFrom`].
    #[inline]
    pub(crate) fn to_code_point_trie(&self) -> CodePointTrie<'_, T> {
        match self {
            Self::CodePointTrie(trie) => ZeroFrom::zero_from(trie),
        }
    }
}

macro_rules! expand {
(
($(($bin_marker:ident, $bin_s:literal),)+),
Expand Down
13 changes: 6 additions & 7 deletions components/properties/src/sets.rs
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,6 @@ use crate::*;
use core::iter::FromIterator;
use icu_provider::prelude::*;
use icu_uniset::UnicodeSet;
use zerofrom::ZeroFrom;

/// A wrapper around code point set data, returned by property getters for
/// unicode sets.
Expand Down Expand Up @@ -57,7 +56,7 @@ impl CodePointSetData {
/// ```
#[inline]
pub fn contains(&self, ch: char) -> bool {
self.data.get().inv_list.contains(ch)
self.data.get().contains(ch)
}

/// Check if the set contains a character as a UTF32 code unit
Expand All @@ -78,7 +77,7 @@ impl CodePointSetData {
/// ```
#[inline]
pub fn contains_u32(&self, ch: u32) -> bool {
self.data.get().inv_list.contains_u32(ch)
self.data.get().contains_u32(ch)
}

/// Construct a borrowed version of this type that can be queried
Expand Down Expand Up @@ -119,7 +118,7 @@ impl CodePointSetData {

/// Construct a new one from an owned [`UnicodeSet`]
pub fn from_unicode_set(set: UnicodeSet<'static>) -> Self {
let set = UnicodePropertyV1 { inv_list: set };
let set = UnicodePropertyV1::from_unicode_set(set);
CodePointSetData::from_data(DataPayload::<ErasedSetlikeMarker>::from_owned(set))
}

Expand All @@ -134,7 +133,7 @@ impl CodePointSetData {
/// in the data, however exceptions can be made if the performance hit is considered to
/// be okay.
pub fn to_unicode_set(&self) -> UnicodeSet<'_> {
ZeroFrom::zero_from(&self.data.get().inv_list)
self.data.get().to_unicode_set()
}
}

Expand Down Expand Up @@ -164,7 +163,7 @@ impl<'a> CodePointSetDataBorrowed<'a> {
/// ```
#[inline]
pub fn contains(&self, ch: char) -> bool {
self.set.inv_list.contains(ch)
self.set.contains(ch)
}

/// Check if the set contains a character as a UTF32 code unit
Expand All @@ -183,7 +182,7 @@ impl<'a> CodePointSetDataBorrowed<'a> {
/// ```
#[inline]
pub fn contains_u32(&self, ch: u32) -> bool {
self.set.inv_list.contains_u32(ch)
self.set.contains_u32(ch)
}
}

Expand Down
8 changes: 6 additions & 2 deletions provider/datagen/src/transform/uprops/bin_uniset.rs
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,7 @@ macro_rules! expand {
Ok(DataResponse {
metadata: DataResponseMetadata::default(),
payload: Some(DataPayload::from_owned(
UnicodePropertyV1 { inv_list },
UnicodePropertyV1::InversionList(inv_list),
)),
})
}
Expand Down Expand Up @@ -142,6 +142,7 @@ expand!(

#[test]
fn test_basic() {
use icu_properties::provider::UnicodePropertyV1;
use icu_properties::provider::WhiteSpaceV1Marker;
use icu_uniset::UnicodeSet;

Expand All @@ -152,7 +153,10 @@ fn test_basic() {
.and_then(DataResponse::take_payload)
.expect("Loading was successful");

let whitespace: &UnicodeSet = &payload.get().inv_list;
let whitespace: &UnicodeSet = match payload.get() {
UnicodePropertyV1::InversionList(ref l) => l,
_ => unreachable!("Should have serialized to an inversion list"),
};

assert!(whitespace.contains(' '));
assert!(whitespace.contains('\n'));
Expand Down
15 changes: 10 additions & 5 deletions provider/datagen/src/transform/uprops/enum_codepointtrie.rs
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@ macro_rules! expand {
let code_point_trie = CodePointTrie::try_from(source_cpt_data).map_err(|e| {
DataError::custom("Could not parse CodePointTrie TOML").with_display_context(&e)
})?;
let data_struct = UnicodePropertyMapV1 { code_point_trie };
let data_struct = UnicodePropertyMapV1::CodePointTrie(code_point_trie);
Ok(DataResponse {
metadata: DataResponseMetadata::default(),
payload: Some(DataPayload::from_owned(data_struct)),
Expand Down Expand Up @@ -84,7 +84,7 @@ expand!(
mod tests {
use super::*;
use icu_codepointtrie::CodePointTrie;
use icu_properties::provider::{GeneralCategoryV1Marker, ScriptV1Marker};
use icu_properties::provider::{GeneralCategoryV1Marker, ScriptV1Marker, UnicodePropertyMapV1};
use icu_properties::{GeneralCategory, Script};

// A test of the UnicodeProperty General_Category is truly a test of the
Expand All @@ -100,7 +100,10 @@ mod tests {
.and_then(DataResponse::take_payload)
.expect("Loading was successful");

let trie: &CodePointTrie<GeneralCategory> = &payload.get().code_point_trie;
let trie: &CodePointTrie<GeneralCategory> = match payload.get() {
UnicodePropertyMapV1::CodePointTrie(ref t) => t,
_ => unreachable!("Should have serialized to a code point trie"),
};

assert_eq!(trie.get('꣓' as u32), GeneralCategory::DecimalNumber);
assert_eq!(trie.get('≈' as u32), GeneralCategory::MathSymbol);
Expand All @@ -115,8 +118,10 @@ mod tests {
.and_then(DataResponse::take_payload)
.expect("Loading was successful");

let trie: &CodePointTrie<Script> = &payload.get().code_point_trie;

let trie: &CodePointTrie<Script> = match payload.get() {
UnicodePropertyMapV1::CodePointTrie(ref t) => t,
_ => unreachable!("Should have serialized to a code point trie"),
};
assert_eq!(trie.get('꣓' as u32), Script::Saurashtra);
assert_eq!(trie.get('≈' as u32), Script::Common);
}
Expand Down
26 changes: 12 additions & 14 deletions provider/testdata/data/baked/props/ahex_v1.rs
Original file line number Diff line number Diff line change
Expand Up @@ -26,17 +26,15 @@ impl ResourceProvider<icu_properties::provider::AsciiHexDigitV1Marker>
}
type DataStruct =
&'static <icu_properties::provider::AsciiHexDigitV1Marker as DataMarker>::Yokeable;
static UND: DataStruct = &::icu_properties::provider::UnicodePropertyV1 {
inv_list: unsafe {
#[allow(unused_unsafe)]
::icu_uniset::UnicodeSet::from_parts_unchecked(
unsafe {
::zerovec::ZeroVec::from_bytes_unchecked(&[
48u8, 0u8, 0u8, 0u8, 58u8, 0u8, 0u8, 0u8, 65u8, 0u8, 0u8, 0u8, 71u8, 0u8, 0u8,
0u8, 97u8, 0u8, 0u8, 0u8, 103u8, 0u8, 0u8, 0u8,
])
},
22usize,
)
},
};
static UND: DataStruct = &::icu_properties::provider::UnicodePropertyV1::InversionList(unsafe {
#[allow(unused_unsafe)]
::icu_uniset::UnicodeSet::from_parts_unchecked(
unsafe {
::zerovec::ZeroVec::from_bytes_unchecked(&[
48u8, 0u8, 0u8, 0u8, 58u8, 0u8, 0u8, 0u8, 65u8, 0u8, 0u8, 0u8, 71u8, 0u8, 0u8, 0u8,
97u8, 0u8, 0u8, 0u8, 103u8, 0u8, 0u8, 0u8,
])
},
22usize,
)
});
Loading