Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Return sets for enumerated property value data using CPT data #1608

Merged
merged 16 commits into from
Mar 8, 2022
Merged
Show file tree
Hide file tree
Changes from 15 commits
Commits
Show all changes
16 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

9 changes: 4 additions & 5 deletions components/icu/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -382,7 +382,7 @@ pub mod properties {
//! ## Property data as `UnicodeSet`s
//!
//! ```
//! use icu::properties::{sets, GeneralCategory};
//! use icu::properties::{maps, sets, GeneralCategory};
//!
//! let provider = icu_testdata::get_provider();
//!
Expand All @@ -399,11 +399,10 @@ pub mod properties {
//!
//! // An individual enumerated property value as a `UnicodeSet`
//!
//! let payload =
//! sets::get_for_general_category(&provider, GeneralCategory::LineSeparator)
//! .expect("The data should be valid");
//! let payload = maps::get_general_category(&provider).expect("The data should be valid");
//! let data_struct = payload.get();
//! let line_sep = &data_struct.inv_list;
//! let gc = &data_struct.code_point_trie;
//! let line_sep = gc.get_set_for_value(GeneralCategory::LineSeparator);
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm not sure if this belongs in this patch or somewhere else, but the API we expose for this should take GeneralCategoryGroup, not GeneralCategory, to make it possible for people to ask for gc=Letter and so on.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This method, `get_set_for_value()`, is implemented on `CodePointTrie`, so it exists for all enumerated properties.

The special method for the `GeneralCategoryGroup` case lives in `icu_properties::sets` and is called `get_for_general_category_group()`.

//!
//! assert!(line_sep.contains_u32(0x2028));
//! assert!(!line_sep.contains_u32(0x2029));
Expand Down
10 changes: 5 additions & 5 deletions components/properties/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ APIs that return a [`CodePointTrie`] exist for certain enumerated properties. Se
### Property data as `UnicodeSet`s

```rust
use icu::properties::{sets, GeneralCategory};
use icu::properties::{maps, sets, GeneralCategory};

let provider = icu_testdata::get_provider();

Expand All @@ -33,11 +33,11 @@ assert!(!emoji.contains('木')); // U+6728

// An individual enumerated property value as a `UnicodeSet`

let payload =
sets::get_for_general_category(&provider, GeneralCategory::LineSeparator)
.expect("The data should be valid");
let payload = maps::get_general_category(&provider)
.expect("The data should be valid");
let data_struct = payload.get();
let line_sep = &data_struct.inv_list;
let gc = &data_struct.code_point_trie;
let line_sep = gc.get_set_for_value(GeneralCategory::LineSeparator);

assert!(line_sep.contains_u32(0x2028));
assert!(!line_sep.contains_u32(0x2029));
Expand Down
10 changes: 5 additions & 5 deletions components/properties/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@
//! ## Property data as `UnicodeSet`s
//!
//! ```
//! use icu::properties::{sets, GeneralCategory};
//! use icu::properties::{maps, sets, GeneralCategory};
//!
//! let provider = icu_testdata::get_provider();
//!
Expand All @@ -35,11 +35,11 @@
//!
//! // An individual enumerated property value as a `UnicodeSet`
//!
//! let payload =
//! sets::get_for_general_category(&provider, GeneralCategory::LineSeparator)
//! .expect("The data should be valid");
//! let payload = maps::get_general_category(&provider)
//! .expect("The data should be valid");
//! let data_struct = payload.get();
//! let line_sep = &data_struct.inv_list;
//! let gc = &data_struct.code_point_trie;
//! let line_sep = gc.get_set_for_value(GeneralCategory::LineSeparator);
//!
//! assert!(line_sep.contains_u32(0x2028));
//! assert!(!line_sep.contains_u32(0x2029));
Expand Down
211 changes: 1 addition & 210 deletions components/properties/src/provider.rs
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ pub mod key {
};
}

define_resource_keys!(ALL_SET_KEYS; 265;
define_resource_keys!(ALL_SET_KEYS; 65;
//
// Binary property UnicodeSets
//
Expand Down Expand Up @@ -104,215 +104,6 @@ pub mod key {
(XDIGIT_V1, "xdigit"),
(XID_CONTINUE_V1, "XIDC"),
(XID_START_V1, "XIDS"),

//
// Enumerated property prop=val UnicodeSets
//

// Note: The ResourceKey subcategory strings are determined from the
// short name of the enumerated property and the short name of the
// property value.

(GENERAL_CATEGORY_OTHER_V1, "gc=C"),
(GENERAL_CATEGORY_CONTROL_V1, "gc=Cc"),
(GENERAL_CATEGORY_FORMAT_V1, "gc=Cf"),
(GENERAL_CATEGORY_UNASSIGNED_V1, "gc=Cn"),
(GENERAL_CATEGORY_PRIVATE_USE_V1, "gc=Co"),
(GENERAL_CATEGORY_SURROGATE_V1, "gc=Cs"),
(GENERAL_CATEGORY_LETTER_V1, "gc=L"),
(GENERAL_CATEGORY_CASED_LETTER_V1, "gc=LC"),
(GENERAL_CATEGORY_LOWERCASE_LETTER_V1, "gc=Ll"),
(GENERAL_CATEGORY_MODIFIER_LETTER_V1, "gc=Lm"),
(GENERAL_CATEGORY_OTHER_LETTER_V1, "gc=Lo"),
(GENERAL_CATEGORY_TITLECASE_LETTER_V1, "gc=Lt"),
(GENERAL_CATEGORY_UPPERCASE_LETTER_V1, "gc=Lu"),
(GENERAL_CATEGORY_MARK_V1, "gc=M"),
(GENERAL_CATEGORY_SPACING_MARK_V1, "gc=Mc"),
(GENERAL_CATEGORY_ENCLOSING_MARK_V1, "gc=Me"),
(GENERAL_CATEGORY_NONSPACING_MARK_V1, "gc=Mn"),
(GENERAL_CATEGORY_NUMBER_V1, "gc=N"),
(GENERAL_CATEGORY_DIGIT_V1, "gc=Nd"),
(GENERAL_CATEGORY_LETTER_NUMBER_V1, "gc=Nl"),
(GENERAL_CATEGORY_OTHER_NUMBER_V1, "gc=No"),
(GENERAL_CATEGORY_PUNCTUATION_V1, "gc=P"),
(GENERAL_CATEGORY_CONNECTOR_PUNCTUATION_V1, "gc=Pc"),
(GENERAL_CATEGORY_DASH_PUNCTUATION_V1, "gc=Pd"),
(GENERAL_CATEGORY_CLOSE_PUNCTUATION_V1, "gc=Pe"),
(GENERAL_CATEGORY_FINAL_PUNCTUATION_V1, "gc=Pf"),
(GENERAL_CATEGORY_INITIAL_PUNCTUATION_V1, "gc=Pi"),
(GENERAL_CATEGORY_OTHER_PUNCTUATION_V1, "gc=Po"),
(GENERAL_CATEGORY_OPEN_PUNCTUATION_V1, "gc=Ps"),
(GENERAL_CATEGORY_SYMBOL_V1, "gc=S"),
(GENERAL_CATEGORY_CURRENCY_SYMBOL_V1, "gc=Sc"),
(GENERAL_CATEGORY_MODIFIER_SYMBOL_V1, "gc=Sk"),
(GENERAL_CATEGORY_MATH_SYMBOL_V1, "gc=Sm"),
(GENERAL_CATEGORY_OTHER_SYMBOL_V1, "gc=So"),
(GENERAL_CATEGORY_SEPARATOR_V1, "gc=Z"),
(GENERAL_CATEGORY_LINE_SEPARATOR_V1, "gc=Zl"),
(GENERAL_CATEGORY_PARAGRAPH_SEPARATOR_V1, "gc=Zp"),
(GENERAL_CATEGORY_SPACE_SEPARATOR_V1, "gc=Zs"),
(SCRIPT_ADLAM_V1, "sc=Adlm"),
(SCRIPT_AHOM_V1, "sc=Ahom"),
(SCRIPT_ANATOLIAN_HIEROGLYPHS_V1, "sc=Hluw"),
(SCRIPT_ARABIC_V1, "sc=Arab"),
(SCRIPT_ARMENIAN_V1, "sc=Armn"),
(SCRIPT_AVESTAN_V1, "sc=Avst"),
(SCRIPT_BALINESE_V1, "sc=Bali"),
(SCRIPT_BAMUM_V1, "sc=Bamu"),
(SCRIPT_BASSA_VAH_V1, "sc=Bass"),
(SCRIPT_BATAK_V1, "sc=Batk"),
(SCRIPT_BENGALI_V1, "sc=Beng"),
(SCRIPT_BHAIKSUKI_V1, "sc=Bhks"),
(SCRIPT_BOPOMOFO_V1, "sc=Bopo"),
(SCRIPT_BRAHMI_V1, "sc=Brah"),
(SCRIPT_BRAILLE_V1, "sc=Brai"),
(SCRIPT_BUGINESE_V1, "sc=Bugi"),
(SCRIPT_BUHID_V1, "sc=Buhd"),
(SCRIPT_CANADIAN_ABORIGINAL_V1, "sc=Cans"),
(SCRIPT_CARIAN_V1, "sc=Cari"),
(SCRIPT_CAUCASIAN_ALBANIAN_V1, "sc=Aghb"),
(SCRIPT_CHAKMA_V1, "sc=Cakm"),
(SCRIPT_CHAM_V1, "sc=Cham"),
(SCRIPT_CHEROKEE_V1, "sc=Cher"),
(SCRIPT_CHORASMIAN_V1, "sc=Chrs"),
(SCRIPT_COMMON_V1, "sc=Zyyy"),
(SCRIPT_COPTIC_V1, "sc=Copt"),
(SCRIPT_CUNEIFORM_V1, "sc=Xsux"),
(SCRIPT_CYPRIOT_V1, "sc=Cprt"),
(SCRIPT_CYPRO_MINOAN_V1, "sc=Cpmn"),
(SCRIPT_CYRILLIC_V1, "sc=Cyrl"),
(SCRIPT_DESERET_V1, "sc=Dsrt"),
(SCRIPT_DEVANAGARI_V1, "sc=Deva"),
(SCRIPT_DIVES_AKURU_V1, "sc=Diak"),
(SCRIPT_DOGRA_V1, "sc=Dogr"),
(SCRIPT_DUPLOYAN_V1, "sc=Dupl"),
(SCRIPT_EGYPTIAN_HIEROGLYPHS_V1, "sc=Egyp"),
(SCRIPT_ELBASAN_V1, "sc=Elba"),
(SCRIPT_ELYMAIC_V1, "sc=Elym"),
(SCRIPT_ETHIOPIC_V1, "sc=Ethi"),
(SCRIPT_GEORGIAN_V1, "sc=Geor"),
(SCRIPT_GLAGOLITIC_V1, "sc=Glag"),
(SCRIPT_GOTHIC_V1, "sc=Goth"),
(SCRIPT_GRANTHA_V1, "sc=Gran"),
(SCRIPT_GREEK_V1, "sc=Grek"),
(SCRIPT_GUJARATI_V1, "sc=Gujr"),
(SCRIPT_GUNJALA_GONDI_V1, "sc=Gong"),
(SCRIPT_GURMUKHI_V1, "sc=Guru"),
(SCRIPT_HAN_V1, "sc=Hani"),
(SCRIPT_HANGUL_V1, "sc=Hang"),
(SCRIPT_HANIFI_ROHINGYA_V1, "sc=Rohg"),
(SCRIPT_HANUNOO_V1, "sc=Hano"),
(SCRIPT_HATRAN_V1, "sc=Hatr"),
(SCRIPT_HEBREW_V1, "sc=Hebr"),
(SCRIPT_HIRAGANA_V1, "sc=Hira"),
(SCRIPT_IMPERIAL_ARAMAIC_V1, "sc=Armi"),
(SCRIPT_INHERITED_V1, "sc=Zinh"),
(SCRIPT_INSCRIPTIONAL_PAHLAVI_V1, "sc=Phli"),
(SCRIPT_INSCRIPTIONAL_PARTHIAN_V1, "sc=Prti"),
(SCRIPT_JAVANESE_V1, "sc=Java"),
(SCRIPT_KAITHI_V1, "sc=Kthi"),
(SCRIPT_KANNADA_V1, "sc=Knda"),
(SCRIPT_KATAKANA_V1, "sc=Kana"),
(SCRIPT_KAYAH_LI_V1, "sc=Kali"),
(SCRIPT_KHAROSHTHI_V1, "sc=Khar"),
(SCRIPT_KHITAN_SMALL_SCRIPT_V1, "sc=Kits"),
(SCRIPT_KHMER_V1, "sc=Khmr"),
(SCRIPT_KHOJKI_V1, "sc=Khoj"),
(SCRIPT_KHUDAWADI_V1, "sc=Sind"),
(SCRIPT_LAO_V1, "sc=Laoo"),
(SCRIPT_LATIN_V1, "sc=Latn"),
(SCRIPT_LEPCHA_V1, "sc=Lepc"),
(SCRIPT_LIMBU_V1, "sc=Limb"),
(SCRIPT_LINEAR_A_V1, "sc=Lina"),
(SCRIPT_LINEAR_B_V1, "sc=Linb"),
(SCRIPT_LISU_V1, "sc=Lisu"),
(SCRIPT_LYCIAN_V1, "sc=Lyci"),
(SCRIPT_LYDIAN_V1, "sc=Lydi"),
(SCRIPT_MAHAJANI_V1, "sc=Mahj"),
(SCRIPT_MAKASAR_V1, "sc=Maka"),
(SCRIPT_MALAYALAM_V1, "sc=Mlym"),
(SCRIPT_MANDAIC_V1, "sc=Mand"),
(SCRIPT_MANICHAEAN_V1, "sc=Mani"),
(SCRIPT_MARCHEN_V1, "sc=Marc"),
(SCRIPT_MASARAM_GONDI_V1, "sc=Gonm"),
(SCRIPT_MEDEFAIDRIN_V1, "sc=Medf"),
(SCRIPT_MEETEI_MAYEK_V1, "sc=Mtei"),
(SCRIPT_MENDE_KIKAKUI_V1, "sc=Mend"),
(SCRIPT_MEROITIC_CURSIVE_V1, "sc=Merc"),
(SCRIPT_MEROITIC_HIEROGLYPHS_V1, "sc=Mero"),
(SCRIPT_MIAO_V1, "sc=Plrd"),
(SCRIPT_MODI_V1, "sc=Modi"),
(SCRIPT_MONGOLIAN_V1, "sc=Mong"),
(SCRIPT_MRO_V1, "sc=Mroo"),
(SCRIPT_MULTANI_V1, "sc=Mult"),
(SCRIPT_MYANMAR_V1, "sc=Mymr"),
(SCRIPT_NABATAEAN_V1, "sc=Nbat"),
(SCRIPT_NANDINAGARI_V1, "sc=Nand"),
(SCRIPT_NEW_TAI_LUE_V1, "sc=Talu"),
(SCRIPT_NEWA_V1, "sc=Newa"),
(SCRIPT_NKO_V1, "sc=Nkoo"),
(SCRIPT_NUSHU_V1, "sc=Nshu"),
(SCRIPT_NYIAKENG_PUACHUE_HMONG_V1, "sc=Hmnp"),
(SCRIPT_OGHAM_V1, "sc=Ogam"),
(SCRIPT_OL_CHIKI_V1, "sc=Olck"),
(SCRIPT_OLD_HUNGARIAN_V1, "sc=Hung"),
(SCRIPT_OLD_ITALIC_V1, "sc=Ital"),
(SCRIPT_OLD_NORTH_ARABIAN_V1, "sc=Narb"),
(SCRIPT_OLD_PERMIC_V1, "sc=Perm"),
(SCRIPT_OLD_PERSIAN_V1, "sc=Xpeo"),
(SCRIPT_OLD_SOGDIAN_V1, "sc=Sogo"),
(SCRIPT_OLD_SOUTH_ARABIAN_V1, "sc=Sarb"),
(SCRIPT_OLD_TURKIC_V1, "sc=Orkh"),
(SCRIPT_OLD_UYGHUR_V1, "sc=Ougr"),
(SCRIPT_ORIYA_V1, "sc=Orya"),
(SCRIPT_OSAGE_V1, "sc=Osge"),
(SCRIPT_OSMANYA_V1, "sc=Osma"),
(SCRIPT_PAHAWH_HMONG_V1, "sc=Hmng"),
(SCRIPT_PALMYRENE_V1, "sc=Palm"),
(SCRIPT_PAU_CIN_HAU_V1, "sc=Pauc"),
(SCRIPT_PHAGS_PA_V1, "sc=Phag"),
(SCRIPT_PHOENICIAN_V1, "sc=Phnx"),
(SCRIPT_PSALTER_PAHLAVI_V1, "sc=Phlp"),
(SCRIPT_REJANG_V1, "sc=Rjng"),
(SCRIPT_RUNIC_V1, "sc=Runr"),
(SCRIPT_SAMARITAN_V1, "sc=Samr"),
(SCRIPT_SAURASHTRA_V1, "sc=Saur"),
(SCRIPT_SHARADA_V1, "sc=Shrd"),
(SCRIPT_SHAVIAN_V1, "sc=Shaw"),
(SCRIPT_SIDDHAM_V1, "sc=Sidd"),
(SCRIPT_SIGNWRITING_V1, "sc=Sgnw"),
(SCRIPT_SINHALA_V1, "sc=Sinh"),
(SCRIPT_SOGDIAN_V1, "sc=Sogd"),
(SCRIPT_SORA_SOMPENG_V1, "sc=Sora"),
(SCRIPT_SOYOMBO_V1, "sc=Soyo"),
(SCRIPT_SUNDANESE_V1, "sc=Sund"),
(SCRIPT_SYLOTI_NAGRI_V1, "sc=Sylo"),
(SCRIPT_SYRIAC_V1, "sc=Syrc"),
(SCRIPT_TAGALOG_V1, "sc=Tglg"),
(SCRIPT_TAGBANWA_V1, "sc=Tagb"),
(SCRIPT_TAI_LE_V1, "sc=Tale"),
(SCRIPT_TAI_THAM_V1, "sc=Lana"),
(SCRIPT_TAI_VIET_V1, "sc=Tavt"),
(SCRIPT_TAKRI_V1, "sc=Takr"),
(SCRIPT_TAMIL_V1, "sc=Taml"),
(SCRIPT_TANGSA_V1, "sc=Tnsa"),
(SCRIPT_TANGUT_V1, "sc=Tang"),
(SCRIPT_TELUGU_V1, "sc=Telu"),
(SCRIPT_THAANA_V1, "sc=Thaa"),
(SCRIPT_THAI_V1, "sc=Thai"),
(SCRIPT_TIBETAN_V1, "sc=Tibt"),
(SCRIPT_TIFINAGH_V1, "sc=Tfng"),
(SCRIPT_TIRHUTA_V1, "sc=Tirh"),
(SCRIPT_TOTO_V1, "sc=Toto"),
(SCRIPT_UGARITIC_V1, "sc=Ugar"),
(SCRIPT_UNKNOWN_V1, "sc=Zzzz"),
(SCRIPT_VAI_V1, "sc=Vaii"),
(SCRIPT_VITHKUQI_V1, "sc=Vith"),
(SCRIPT_WANCHO_V1, "sc=Wcho"),
(SCRIPT_WARANG_CITI_V1, "sc=Wara"),
(SCRIPT_YEZIDI_V1, "sc=Yezi"),
(SCRIPT_YI_V1, "sc=Yiii"),
(SCRIPT_ZANABAZAR_SQUARE_V1, "sc=Zanb"),
);

define_resource_keys!(ALL_MAP_KEYS; 8;
Expand Down
Loading