diff --git a/src/app.rs b/src/app.rs
index ca00583..386870b 100644
--- a/src/app.rs
+++ b/src/app.rs
@@ -57,6 +57,11 @@ script-extension produces one table of Unicode codepoint ranges for each
 possible Script_Extension value.
 ";
 
+const ABOUT_JOINING_TYPE: &'static str = "\
+joining-type produces one table of Unicode codepoint ranges for each
+possible Joining_Type value.
+";
+
 const ABOUT_AGE: &'static str = "\
 age produces a table for each discrete Unicode age. Each table includes the
 codepoints that were added for that age. Tables can be emitted as a sorted
@@ -345,6 +350,25 @@ pub fn app() -> App<'static, 'static> {
             "List the properties that can be generated with this \
              command.",
         ));
+    let cmd_joining_type =
+        SubCommand::with_name("joining-type")
+            .author(crate_authors!())
+            .version(crate_version!())
+            .template(TEMPLATE_SUB)
+            .about("Create the Joining_Type property tables.")
+            .before_help(ABOUT_JOINING_TYPE)
+            .arg(ucd_dir.clone())
+            .arg(flag_fst_dir.clone())
+            .arg(flag_name("JOINING_TYPE"))
+            .arg(flag_chars.clone())
+            .arg(flag_trie_set.clone())
+            .arg(Arg::with_name("enum").long("enum").help(
+                "Emit a single table that maps codepoints to joining type.",
+            ))
+            .arg(Arg::with_name("rust-enum").long("rust-enum").help(
+                "Emit a Rust enum and a table that maps codepoints to \
+                 joining type.",
+            ));
     let cmd_prop_bool = SubCommand::with_name("property-bool")
         .author(crate_authors!())
         .version(crate_version!())
@@ -612,6 +636,7 @@ pub fn app() -> App<'static, 'static> {
         .subcommand(cmd_general_category)
         .subcommand(cmd_script)
         .subcommand(cmd_script_extension)
+        .subcommand(cmd_joining_type)
         .subcommand(cmd_age)
         .subcommand(cmd_bidi_mirroring_glyph)
         .subcommand(cmd_prop_bool)
diff --git a/src/general_category.rs b/src/general_category.rs
index d3334e6..2e7fe00 100644
--- a/src/general_category.rs
+++ b/src/general_category.rs
@@ -1,6 +1,6 @@
 use std::collections::{BTreeMap, BTreeSet};
 
-use ucd_parse::{self, UnicodeDataExpander};
+use ucd_parse::{self, UnicodeData, UnicodeDataExpander};
 
 use args::ArgMatches;
 use error::Result;
@@ -18,29 +18,8 @@ pub fn command(args: ArgMatches) -> Result<()> {
         return print_property_values(&propvals, "General_Category");
     }
 
-    // Expand all of our UnicodeData rows. This results in one big list of
-    // all assigned codepoints.
-    let rows: Vec<_> = UnicodeDataExpander::new(unexpanded).collect();
+    let mut bycat = expand_into_categories(unexpanded, &propvals)?;
 
-    // Collect each general category into an ordered set.
-    let mut bycat: BTreeMap<String, BTreeSet<u32>> = BTreeMap::new();
-    let mut assigned = BTreeSet::new();
-    for row in rows {
-        assigned.insert(row.codepoint.value());
-        let gc = propvals.canonical("gc", &row.general_category)?.to_string();
-        bycat
-            .entry(gc)
-            .or_insert(BTreeSet::new())
-            .insert(row.codepoint.value());
-    }
-    // As a special case, collect all unassigned codepoints.
-    let unassigned_name = propvals.canonical("gc", "unassigned")?.to_string();
-    bycat.insert(unassigned_name.clone(), BTreeSet::new());
-    for cp in 0..=0x10FFFF {
-        if !assigned.contains(&cp) {
-            bycat.get_mut(&unassigned_name).unwrap().insert(cp);
-        }
-    }
     // As another special case, collect all "related" groups of categories.
     // But don't do this when printing an enumeration, because in an
     // enumeration each codepoint should belong to exactly one category, which
@@ -74,6 +53,38 @@ pub fn command(args: ArgMatches) -> Result<()> {
     Ok(())
 }
 
+/// Expand a list of UnicodeData rows and group by category.
+pub fn expand_into_categories(
+    unexpanded: Vec<UnicodeData>,
+    propvals: &PropertyValues,
+) -> Result<BTreeMap<String, BTreeSet<u32>>> {
+    // Expand all of our UnicodeData rows. This results in one big list of
+    // all assigned codepoints.
+    let rows: Vec<_> = UnicodeDataExpander::new(unexpanded).collect();
+
+    // Collect each general category into an ordered set.
+    let mut bycat: BTreeMap<String, BTreeSet<u32>> = BTreeMap::new();
+    let mut assigned = BTreeSet::new();
+    for row in rows {
+        assigned.insert(row.codepoint.value());
+        let gc = propvals.canonical("gc", &row.general_category)?.to_string();
+        bycat
+            .entry(gc)
+            .or_insert(BTreeSet::new())
+            .insert(row.codepoint.value());
+    }
+    // As a special case, collect all unassigned codepoints.
+    let unassigned_name = propvals.canonical("gc", "unassigned")?.to_string();
+    bycat.insert(unassigned_name.clone(), BTreeSet::new());
+    for cp in 0..=0x10FFFF {
+        if !assigned.contains(&cp) {
+            bycat.get_mut(&unassigned_name).unwrap().insert(cp);
+        }
+    }
+
+    Ok(bycat)
+}
+
 /// Related returns a set of sets of codepoints corresponding to the "related"
 /// groups of categories defined by Table 12 in UAX#44 S5.7.1.
 ///
diff --git a/src/joining_type.rs b/src/joining_type.rs
new file mode 100644
index 0000000..39eb006
--- /dev/null
+++ b/src/joining_type.rs
@@ -0,0 +1,93 @@
+use std::collections::{BTreeMap, BTreeSet};
+
+use ucd_parse::{self, ArabicShaping};
+
+use args::ArgMatches;
+use error::Result;
+use general_category;
+use util::PropertyValues;
+
+/// Run the `joining-type` sub-command.
+///
+/// Groups every codepoint by its Joining_Type: explicitly listed codepoints
+/// come from the parsed `ArabicShaping` rows, and all remaining codepoints
+/// are classified as Transparent or Non_Joining based on their general
+/// category. The tables are then written in the format selected by the
+/// `enum` / `rust-enum` flags.
+pub fn command(args: ArgMatches) -> Result<()> {
+    let dir = args.ucd_dir()?;
+    let propvals = PropertyValues::from_ucd_dir(&dir)?;
+    let rows: Vec<ArabicShaping> = ucd_parse::parse(&dir)?;
+    let unexpanded_gc = ucd_parse::parse(&dir)?;
+    let gc =
+        general_category::expand_into_categories(unexpanded_gc, &propvals)?;
+
+    // Collect each joining type into an ordered set.
+    let mut by_type: BTreeMap<String, BTreeSet<u32>> = BTreeMap::new();
+    let mut assigned = BTreeSet::new();
+    for row in rows {
+        assigned.insert(row.codepoint.value());
+        let jt =
+            propvals.canonical("jt", row.joining_type.as_str())?.to_string();
+        by_type
+            .entry(jt)
+            .or_insert(BTreeSet::new())
+            .insert(row.codepoint.value());
+    }
+    // Process the codepoints that are not listed as per the note in
+    // ArabicShaping.txt:
+    //
+    // Note: Code points that are not explicitly listed in this file are either
+    // of joining type T or U:
+    //
+    // - Those that are not explicitly listed and that are of General Category
+    //   Mn, Me, or Cf have joining type T.
+    // - All others not explicitly listed have joining type U.
+    let transparent_name = propvals.canonical("jt", "transparent")?;
+    let non_joining_name = propvals.canonical("jt", "non_joining")?;
+    let transparent_categories = ["Mn", "Me", "Cf"]
+        .iter()
+        .map(|cat| propvals.canonical("gc", cat).map(|name| &gc[&name]))
+        .collect::<Result<Vec<_>>>()?;
+    // Accumulate the implicit codepoints locally and merge them afterwards,
+    // so a joining type with no explicitly listed codepoint cannot trigger a
+    // panic on a missing `by_type` entry.
+    let mut implicit_transparent: BTreeSet<u32> = BTreeSet::new();
+    let mut implicit_non_joining: BTreeSet<u32> = BTreeSet::new();
+    for cp in 0..=0x10FFFF {
+        if assigned.contains(&cp) {
+            continue;
+        }
+        // See if the code point is in any of the general categories that
+        // map to the Transparent joining type. Otherwise add to the
+        // Non_Joining type.
+        if transparent_categories.iter().any(|cat| cat.contains(&cp)) {
+            implicit_transparent.insert(cp);
+        } else {
+            implicit_non_joining.insert(cp);
+        }
+    }
+    by_type
+        .entry(transparent_name)
+        .or_insert_with(BTreeSet::new)
+        .extend(implicit_transparent);
+    by_type
+        .entry(non_joining_name)
+        .or_insert_with(BTreeSet::new)
+        .extend(implicit_non_joining);
+
+    let mut wtr = args.writer("joining_type")?;
+    if args.is_present("enum") {
+        wtr.ranges_to_enum(args.name(), &by_type)?;
+    } else if args.is_present("rust-enum") {
+        let variants = by_type.keys().map(String::as_str).collect::<Vec<_>>();
+        wtr.ranges_to_rust_enum(args.name(), &variants, &by_type)?;
+    } else {
+        wtr.names(by_type.keys())?;
+        for (name, set) in by_type {
+            wtr.ranges(&name, &set)?;
+        }
+    }
+
+    Ok(())
+}
diff --git a/src/main.rs b/src/main.rs
index 5ab3e80..3332085 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -35,6 +35,7 @@ mod case_folding;
 mod case_mapping;
 mod general_category;
 mod jamo_short_name;
+mod joining_type;
 mod names;
 mod property_bool;
 mod regex;
@@ -74,6 +75,7 @@ fn run() -> Result<()> {
         ("jamo-short-name", Some(m)) => {
             jamo_short_name::command(ArgMatches::new(m))
         }
+        ("joining-type", Some(m)) => joining_type::command(ArgMatches::new(m)),
         ("names", Some(m)) => names::command(ArgMatches::new(m)),
         ("property-names", Some(m)) => cmd_property_names(ArgMatches::new(m)),
         ("property-values", Some(m)) => {
diff --git a/ucd-parse/src/arabic_shaping.rs b/ucd-parse/src/arabic_shaping.rs
index ee6929a..6dbf4b7 100644
--- a/ucd-parse/src/arabic_shaping.rs
+++ b/ucd-parse/src/arabic_shaping.rs
@@ -37,6 +37,21 @@ pub enum JoiningType {
     Transparent,
 }
 
+impl JoiningType {
+    /// Returns the single-letter abbreviation of this joining type, e.g. `R`
+    /// for `RightJoining`.
+    pub fn as_str(&self) -> &'static str {
+        match self {
+            JoiningType::RightJoining => "R",
+            JoiningType::LeftJoining => "L",
+            JoiningType::DualJoining => "D",
+            JoiningType::JoinCausing => "C",
+            JoiningType::NonJoining => "U",
+            JoiningType::Transparent => "T",
+        }
+    }
+}
+
 impl Default for JoiningType {
     fn default() -> JoiningType {
         JoiningType::NonJoining