Skip to content

Commit

Permalink
cli: add joining-type sub-command
Browse files Browse the repository at this point in the history
PR #24
  • Loading branch information
wezm authored and BurntSushi committed Jan 16, 2020
1 parent b0ae9e2 commit d88aa1a
Show file tree
Hide file tree
Showing 5 changed files with 147 additions and 23 deletions.
25 changes: 25 additions & 0 deletions src/app.rs
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,11 @@ script-extension produces one table of Unicode codepoint ranges for each
possible Script_Extension value.
";

// Long-form help text for the `joining-type` sub-command. It is handed to
// `before_help` when that sub-command is constructed in `app()`.
const ABOUT_JOINING_TYPE: &'static str = "\
joining-type produces one table of Unicode codepoint ranges for each
possible Joining_Type value.
";

const ABOUT_AGE: &'static str = "\
age produces a table for each discrete Unicode age. Each table includes the
codepoints that were added for that age. Tables can be emitted as a sorted
Expand Down Expand Up @@ -345,6 +350,25 @@ pub fn app() -> App<'static, 'static> {
"List the properties that can be generated with this \
command.",
));
// The `joining-type` sub-command. It is registered on the top-level app with
// `.subcommand(cmd_joining_type)` and dispatched to `joining_type::command`.
let cmd_joining_type =
SubCommand::with_name("joining-type")
.author(crate_authors!())
.version(crate_version!())
.template(TEMPLATE_SUB)
.about("Create the Joining_Type property tables.")
.before_help(ABOUT_JOINING_TYPE)
.arg(ucd_dir.clone())
.arg(flag_fst_dir.clone())
.arg(flag_name("JOINING_TYPE"))
.arg(flag_chars.clone())
.arg(flag_trie_set.clone())
// `--enum` and `--rust-enum` select alternative output shapes. When
// neither is given, `joining_type::command` writes one range table per
// joining type.
.arg(Arg::with_name("enum").long("enum").help(
"Emit a single table that maps codepoints to joining type.",
))
.arg(Arg::with_name("rust-enum").long("rust-enum").help(
"Emit a Rust enum and a table that maps codepoints to \
joining type.",
));
let cmd_prop_bool = SubCommand::with_name("property-bool")
.author(crate_authors!())
.version(crate_version!())
Expand Down Expand Up @@ -612,6 +636,7 @@ pub fn app() -> App<'static, 'static> {
.subcommand(cmd_general_category)
.subcommand(cmd_script)
.subcommand(cmd_script_extension)
.subcommand(cmd_joining_type)
.subcommand(cmd_age)
.subcommand(cmd_bidi_mirroring_glyph)
.subcommand(cmd_prop_bool)
Expand Down
57 changes: 34 additions & 23 deletions src/general_category.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
use std::collections::{BTreeMap, BTreeSet};

use ucd_parse::{self, UnicodeDataExpander};
use ucd_parse::{self, UnicodeData, UnicodeDataExpander};

use args::ArgMatches;
use error::Result;
Expand All @@ -18,29 +18,8 @@ pub fn command(args: ArgMatches) -> Result<()> {
return print_property_values(&propvals, "General_Category");
}

// Expand all of our UnicodeData rows. This results in one big list of
// all assigned codepoints.
let rows: Vec<_> = UnicodeDataExpander::new(unexpanded).collect();
let mut bycat = expand_into_categories(unexpanded, &propvals)?;

// Collect each general category into an ordered set.
let mut bycat: BTreeMap<String, BTreeSet<u32>> = BTreeMap::new();
let mut assigned = BTreeSet::new();
for row in rows {
assigned.insert(row.codepoint.value());
let gc = propvals.canonical("gc", &row.general_category)?.to_string();
bycat
.entry(gc)
.or_insert(BTreeSet::new())
.insert(row.codepoint.value());
}
// As a special case, collect all unassigned codepoints.
let unassigned_name = propvals.canonical("gc", "unassigned")?.to_string();
bycat.insert(unassigned_name.clone(), BTreeSet::new());
for cp in 0..=0x10FFFF {
if !assigned.contains(&cp) {
bycat.get_mut(&unassigned_name).unwrap().insert(cp);
}
}
// As another special case, collect all "related" groups of categories.
// But don't do this when printing an enumeration, because in an
// enumeration each codepoint should belong to exactly one category, which
Expand Down Expand Up @@ -74,6 +53,38 @@ pub fn command(args: ArgMatches) -> Result<()> {
Ok(())
}

/// Group every codepoint by the canonical name of its General_Category.
///
/// The `unexpanded` rows are run through `UnicodeDataExpander`, and each
/// resulting row's codepoint is filed under the canonical form of that row's
/// general category. Every codepoint in `0..=0x10FFFF` that did not show up
/// in an expanded row is then stored under the canonical name of the
/// `unassigned` category.
///
/// Returns an error if `propvals` cannot canonicalize a category name.
pub fn expand_into_categories(
    unexpanded: Vec<UnicodeData>,
    propvals: &PropertyValues,
) -> Result<BTreeMap<String, BTreeSet<u32>>> {
    let mut categories: BTreeMap<String, BTreeSet<u32>> = BTreeMap::new();
    // Every codepoint that appears in an expanded row.
    let mut seen = BTreeSet::new();

    // Walk the expanded rows (one per assigned codepoint) and file each
    // codepoint under its category.
    for row in UnicodeDataExpander::new(unexpanded) {
        let cp = row.codepoint.value();
        seen.insert(cp);
        let name =
            propvals.canonical("gc", &row.general_category)?.to_string();
        categories.entry(name).or_insert(BTreeSet::new()).insert(cp);
    }

    // Special case: whatever was not seen above is unassigned. The set
    // replaces any entry that might already exist under that name.
    let unassigned_key =
        propvals.canonical("gc", "unassigned")?.to_string();
    let unassigned: BTreeSet<u32> =
        (0..=0x10FFFF).filter(|cp| !seen.contains(cp)).collect();
    categories.insert(unassigned_key, unassigned);

    Ok(categories)
}

/// Related returns a set of sets of codepoints corresponding to the "related"
/// groups of categories defined by Table 12 in UAX#44 S5.7.1.
///
Expand Down
73 changes: 73 additions & 0 deletions src/joining_type.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
use std::collections::{BTreeMap, BTreeSet};

use ucd_parse::{self, ArabicShaping};

use args::ArgMatches;
use error::Result;
use general_category;
use util::PropertyValues;

pub fn command(args: ArgMatches) -> Result<()> {
let dir = args.ucd_dir()?;
let propvals = PropertyValues::from_ucd_dir(&dir)?;
let rows: Vec<ArabicShaping> = ucd_parse::parse(&dir)?;
let unexpanded_gc = ucd_parse::parse(&dir)?;
let gc =
general_category::expand_into_categories(unexpanded_gc, &propvals)?;

// Collect each joining type into an ordered set.
let mut by_type: BTreeMap<String, BTreeSet<u32>> = BTreeMap::new();
let mut assigned = BTreeSet::new();
for row in rows {
assigned.insert(row.codepoint.value());
let jt =
propvals.canonical("jt", row.joining_type.as_str())?.to_string();
by_type
.entry(jt)
.or_insert(BTreeSet::new())
.insert(row.codepoint.value());
}
// Process the codepoints that are not listed as per the note in
// ArabicShaping.txt:
//
// Note: Code points that are not explicitly listed in this file are either
// of joining type T or U:
//
// - Those that are not explicitly listed and that are of General Category
// Mn, Me, or Cf have joining type T.
// - All others not explicitly listed have joining type U.
let transparent_name = propvals.canonical("jt", "transparent")?;
let non_joining_name = propvals.canonical("jt", "non_joining")?;
let transparent_categories = ["Mn", "Me", "Cf"]
.iter()
.map(|cat| propvals.canonical("gc", cat).map(|name| &gc[&name]))
.collect::<Result<Vec<_>>>()?;
for cp in 0..=0x10FFFF {
if assigned.contains(&cp) {
continue;
}
// See if the code point is in any of the general categories that
// map to the Transparent joining type. Otherwise add to the
// Non_Joining type.
if transparent_categories.iter().any(|cat| cat.contains(&cp)) {
by_type.get_mut(&transparent_name).unwrap().insert(cp);
} else {
by_type.get_mut(&non_joining_name).unwrap().insert(cp);
}
}

let mut wtr = args.writer("joining_type")?;
if args.is_present("enum") {
wtr.ranges_to_enum(args.name(), &by_type)?;
} else if args.is_present("rust-enum") {
let variants = by_type.keys().map(String::as_str).collect::<Vec<_>>();
wtr.ranges_to_rust_enum(args.name(), &variants, &by_type)?;
} else {
wtr.names(by_type.keys())?;
for (name, set) in by_type {
wtr.ranges(&name, &set)?;
}
}

Ok(())
}
2 changes: 2 additions & 0 deletions src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@ mod case_folding;
mod case_mapping;
mod general_category;
mod jamo_short_name;
mod joining_type;
mod names;
mod property_bool;
mod regex;
Expand Down Expand Up @@ -74,6 +75,7 @@ fn run() -> Result<()> {
("jamo-short-name", Some(m)) => {
jamo_short_name::command(ArgMatches::new(m))
}
("joining-type", Some(m)) => joining_type::command(ArgMatches::new(m)),
("names", Some(m)) => names::command(ArgMatches::new(m)),
("property-names", Some(m)) => cmd_property_names(ArgMatches::new(m)),
("property-values", Some(m)) => {
Expand Down
13 changes: 13 additions & 0 deletions ucd-parse/src/arabic_shaping.rs
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,19 @@ pub enum JoiningType {
Transparent,
}

impl JoiningType {
    /// Return the single-letter abbreviation for this joining type.
    ///
    /// The mapping is: `RightJoining` => `"R"`, `LeftJoining` => `"L"`,
    /// `DualJoining` => `"D"`, `JoinCausing` => `"C"`, `NonJoining` => `"U"`
    /// and `Transparent` => `"T"`.
    ///
    /// The returned string is a literal, so it is `'static` and may outlive
    /// the `JoiningType` value it was obtained from.
    pub fn as_str(&self) -> &'static str {
        match *self {
            JoiningType::RightJoining => "R",
            JoiningType::LeftJoining => "L",
            JoiningType::DualJoining => "D",
            JoiningType::JoinCausing => "C",
            JoiningType::NonJoining => "U",
            JoiningType::Transparent => "T",
        }
    }
}

impl Default for JoiningType {
fn default() -> JoiningType {
JoiningType::NonJoining
Expand Down

0 comments on commit d88aa1a

Please sign in to comment.