Skip to content

Commit d88aa1a

Browse files
wezm authored and BurntSushi committed
cli: add joining-type sub-command
PR #24
1 parent b0ae9e2 commit d88aa1a

File tree

5 files changed

+147
-23
lines changed

5 files changed

+147
-23
lines changed

src/app.rs

+25
Original file line numberDiff line numberDiff line change
@@ -57,6 +57,11 @@ script-extension produces one table of Unicode codepoint ranges for each
5757
possible Script_Extension value.
5858
";
5959

60+
const ABOUT_JOINING_TYPE: &'static str = "\
61+
joining-type produces one table of Unicode codepoint ranges for each
62+
possible Joining_Type value.
63+
";
64+
6065
const ABOUT_AGE: &'static str = "\
6166
age produces a table for each discrete Unicode age. Each table includes the
6267
codepoints that were added for that age. Tables can be emitted as a sorted
@@ -345,6 +350,25 @@ pub fn app() -> App<'static, 'static> {
345350
"List the properties that can be generated with this \
346351
command.",
347352
));
353+
let cmd_joining_type =
354+
SubCommand::with_name("joining-type")
355+
.author(crate_authors!())
356+
.version(crate_version!())
357+
.template(TEMPLATE_SUB)
358+
.about("Create the Joining_Type property tables.")
359+
.before_help(ABOUT_JOINING_TYPE)
360+
.arg(ucd_dir.clone())
361+
.arg(flag_fst_dir.clone())
362+
.arg(flag_name("JOINING_TYPE"))
363+
.arg(flag_chars.clone())
364+
.arg(flag_trie_set.clone())
365+
.arg(Arg::with_name("enum").long("enum").help(
366+
"Emit a single table that maps codepoints to joining type.",
367+
))
368+
.arg(Arg::with_name("rust-enum").long("rust-enum").help(
369+
"Emit a Rust enum and a table that maps codepoints to \
370+
joining type.",
371+
));
348372
let cmd_prop_bool = SubCommand::with_name("property-bool")
349373
.author(crate_authors!())
350374
.version(crate_version!())
@@ -612,6 +636,7 @@ pub fn app() -> App<'static, 'static> {
612636
.subcommand(cmd_general_category)
613637
.subcommand(cmd_script)
614638
.subcommand(cmd_script_extension)
639+
.subcommand(cmd_joining_type)
615640
.subcommand(cmd_age)
616641
.subcommand(cmd_bidi_mirroring_glyph)
617642
.subcommand(cmd_prop_bool)

src/general_category.rs

+34-23
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
use std::collections::{BTreeMap, BTreeSet};
22

3-
use ucd_parse::{self, UnicodeDataExpander};
3+
use ucd_parse::{self, UnicodeData, UnicodeDataExpander};
44

55
use args::ArgMatches;
66
use error::Result;
@@ -18,29 +18,8 @@ pub fn command(args: ArgMatches) -> Result<()> {
1818
return print_property_values(&propvals, "General_Category");
1919
}
2020

21-
// Expand all of our UnicodeData rows. This results in one big list of
22-
// all assigned codepoints.
23-
let rows: Vec<_> = UnicodeDataExpander::new(unexpanded).collect();
21+
let mut bycat = expand_into_categories(unexpanded, &propvals)?;
2422

25-
// Collect each general category into an ordered set.
26-
let mut bycat: BTreeMap<String, BTreeSet<u32>> = BTreeMap::new();
27-
let mut assigned = BTreeSet::new();
28-
for row in rows {
29-
assigned.insert(row.codepoint.value());
30-
let gc = propvals.canonical("gc", &row.general_category)?.to_string();
31-
bycat
32-
.entry(gc)
33-
.or_insert(BTreeSet::new())
34-
.insert(row.codepoint.value());
35-
}
36-
// As a special case, collect all unassigned codepoints.
37-
let unassigned_name = propvals.canonical("gc", "unassigned")?.to_string();
38-
bycat.insert(unassigned_name.clone(), BTreeSet::new());
39-
for cp in 0..=0x10FFFF {
40-
if !assigned.contains(&cp) {
41-
bycat.get_mut(&unassigned_name).unwrap().insert(cp);
42-
}
43-
}
4423
// As another special case, collect all "related" groups of categories.
4524
// But don't do this when printing an enumeration, because in an
4625
// enumeration each codepoint should belong to exactly one category, which
@@ -74,6 +53,38 @@ pub fn command(args: ArgMatches) -> Result<()> {
7453
Ok(())
7554
}
7655

56+
/// Expand a list of UnicodeData rows and group by category.
57+
pub fn expand_into_categories(
58+
unexpanded: Vec<UnicodeData>,
59+
propvals: &PropertyValues,
60+
) -> Result<BTreeMap<String, BTreeSet<u32>>> {
61+
// Expand all of our UnicodeData rows. This results in one big list of
62+
// all assigned codepoints.
63+
let rows: Vec<_> = UnicodeDataExpander::new(unexpanded).collect();
64+
65+
// Collect each general category into an ordered set.
66+
let mut bycat: BTreeMap<String, BTreeSet<u32>> = BTreeMap::new();
67+
let mut assigned = BTreeSet::new();
68+
for row in rows {
69+
assigned.insert(row.codepoint.value());
70+
let gc = propvals.canonical("gc", &row.general_category)?.to_string();
71+
bycat
72+
.entry(gc)
73+
.or_insert(BTreeSet::new())
74+
.insert(row.codepoint.value());
75+
}
76+
// As a special case, collect all unassigned codepoints.
77+
let unassigned_name = propvals.canonical("gc", "unassigned")?.to_string();
78+
bycat.insert(unassigned_name.clone(), BTreeSet::new());
79+
for cp in 0..=0x10FFFF {
80+
if !assigned.contains(&cp) {
81+
bycat.get_mut(&unassigned_name).unwrap().insert(cp);
82+
}
83+
}
84+
85+
Ok(bycat)
86+
}
87+
7788
/// Related returns a set of sets of codepoints corresponding to the "related"
7889
/// groups of categories defined by Table 12 in UAX#44 S5.7.1.
7990
///

src/joining_type.rs

+73
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,73 @@
1+
use std::collections::{BTreeMap, BTreeSet};
2+
3+
use ucd_parse::{self, ArabicShaping};
4+
5+
use args::ArgMatches;
6+
use error::Result;
7+
use general_category;
8+
use util::PropertyValues;
9+
10+
pub fn command(args: ArgMatches) -> Result<()> {
11+
let dir = args.ucd_dir()?;
12+
let propvals = PropertyValues::from_ucd_dir(&dir)?;
13+
let rows: Vec<ArabicShaping> = ucd_parse::parse(&dir)?;
14+
let unexpanded_gc = ucd_parse::parse(&dir)?;
15+
let gc =
16+
general_category::expand_into_categories(unexpanded_gc, &propvals)?;
17+
18+
// Collect each joining type into an ordered set.
19+
let mut by_type: BTreeMap<String, BTreeSet<u32>> = BTreeMap::new();
20+
let mut assigned = BTreeSet::new();
21+
for row in rows {
22+
assigned.insert(row.codepoint.value());
23+
let jt =
24+
propvals.canonical("jt", row.joining_type.as_str())?.to_string();
25+
by_type
26+
.entry(jt)
27+
.or_insert(BTreeSet::new())
28+
.insert(row.codepoint.value());
29+
}
30+
// Process the codepoints that are not listed as per the note in
31+
// ArabicShaping.txt:
32+
//
33+
// Note: Code points that are not explicitly listed in this file are either
34+
// of joining type T or U:
35+
//
36+
// - Those that are not explicitly listed and that are of General Category
37+
// Mn, Me, or Cf have joining type T.
38+
// - All others not explicitly listed have joining type U.
39+
let transparent_name = propvals.canonical("jt", "transparent")?;
40+
let non_joining_name = propvals.canonical("jt", "non_joining")?;
41+
let transparent_categories = ["Mn", "Me", "Cf"]
42+
.iter()
43+
.map(|cat| propvals.canonical("gc", cat).map(|name| &gc[&name]))
44+
.collect::<Result<Vec<_>>>()?;
45+
for cp in 0..=0x10FFFF {
46+
if assigned.contains(&cp) {
47+
continue;
48+
}
49+
// See if the code point is in any of the general categories that
50+
// map to the Transparent joining type. Otherwise add to the
51+
// Non_Joining type.
52+
if transparent_categories.iter().any(|cat| cat.contains(&cp)) {
53+
by_type.get_mut(&transparent_name).unwrap().insert(cp);
54+
} else {
55+
by_type.get_mut(&non_joining_name).unwrap().insert(cp);
56+
}
57+
}
58+
59+
let mut wtr = args.writer("joining_type")?;
60+
if args.is_present("enum") {
61+
wtr.ranges_to_enum(args.name(), &by_type)?;
62+
} else if args.is_present("rust-enum") {
63+
let variants = by_type.keys().map(String::as_str).collect::<Vec<_>>();
64+
wtr.ranges_to_rust_enum(args.name(), &variants, &by_type)?;
65+
} else {
66+
wtr.names(by_type.keys())?;
67+
for (name, set) in by_type {
68+
wtr.ranges(&name, &set)?;
69+
}
70+
}
71+
72+
Ok(())
73+
}

src/main.rs

+2
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,7 @@ mod case_folding;
3535
mod case_mapping;
3636
mod general_category;
3737
mod jamo_short_name;
38+
mod joining_type;
3839
mod names;
3940
mod property_bool;
4041
mod regex;
@@ -74,6 +75,7 @@ fn run() -> Result<()> {
7475
("jamo-short-name", Some(m)) => {
7576
jamo_short_name::command(ArgMatches::new(m))
7677
}
78+
("joining-type", Some(m)) => joining_type::command(ArgMatches::new(m)),
7779
("names", Some(m)) => names::command(ArgMatches::new(m)),
7880
("property-names", Some(m)) => cmd_property_names(ArgMatches::new(m)),
7981
("property-values", Some(m)) => {

ucd-parse/src/arabic_shaping.rs

+13
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,19 @@ pub enum JoiningType {
3737
Transparent,
3838
}
3939

40+
impl JoiningType {
41+
pub fn as_str(&self) -> &str {
42+
match self {
43+
JoiningType::RightJoining => "R",
44+
JoiningType::LeftJoining => "L",
45+
JoiningType::DualJoining => "D",
46+
JoiningType::JoinCausing => "C",
47+
JoiningType::NonJoining => "U",
48+
JoiningType::Transparent => "T",
49+
}
50+
}
51+
}
52+
4053
impl Default for JoiningType {
4154
fn default() -> JoiningType {
4255
JoiningType::NonJoining

0 commit comments

Comments
 (0)