Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Tweaks to Unihan property handling #1022

Merged
merged 15 commits into from
Jan 30, 2025
84 changes: 0 additions & 84 deletions UnicodeJsps/src/main/java/org/unicode/jsp/XPropertyFactory.java
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@
import com.google.common.collect.Multimap;
import com.google.common.collect.TreeMultimap;
import com.ibm.icu.impl.UnicodeMap;
import com.ibm.icu.lang.UCharacter;
import com.ibm.icu.lang.UProperty.NameChoice;
import com.ibm.icu.text.CollationElementIterator;
import com.ibm.icu.text.Normalizer;
Expand Down Expand Up @@ -568,89 +567,6 @@ protected String _getValue(int codepoint) {
}
}

/**
 * Enumerated property backed by ICU: the property name, the list of value names, and the value
 * for each code point are all obtained from {@link UCharacter} using the ICU property number.
 */
private static class IcuEnumProperty extends XEnumUnicodeProperty {
    /** ICU property number passed to every {@link UCharacter} lookup. */
    final int propNum;

    /**
     * Creates the property using the long ICU property name and the long names of all its values.
     *
     * @param propNum ICU property number
     */
    public IcuEnumProperty(int propNum) {
        super(UCharacter.getPropertyName(propNum, NameChoice.LONG), getValues(propNum).toArray());
        this.propNum = propNum;
    }

    /**
     * Collects the long value name for every integer from the property's minimum value through
     * its maximum value, inclusive, in ascending order.
     */
    private static List<String> getValues(int propNum) {
        final List<String> names = new ArrayList<>();
        int value = UCharacter.getIntPropertyMinValue(propNum);
        while (value <= UCharacter.getIntPropertyMaxValue(propNum)) {
            names.add(UCharacter.getPropertyValueName(propNum, value, NameChoice.LONG));
            value++;
        }
        return names;
    }

    /**
     * Returns the long value name for the code point, or {@code "n/a"} when the name lookup
     * throws.
     */
    @Override
    protected String _getValue(int codepoint) {
        final int icuValue = UCharacter.getIntPropertyValue(codepoint, propNum);
        String name;
        try {
            name = UCharacter.getPropertyValueName(propNum, icuValue, NameChoice.LONG);
        } catch (Exception e) {
            // Best effort: a value without a resolvable name is reported as "n/a".
            name = "n/a";
        }
        return name;
    }
}

// private static class IcuBidiPairedBracket extends SimpleProperty {
// final int propNum;
// public IcuBidiPairedBracket() {
// setName(UCharacter.getPropertyName(UProperty.BIDI_PAIRED_BRACKET,
// NameChoice.LONG));
// this.propNum = UProperty.BIDI_PAIRED_BRACKET;
// }
// @Override
// public List _getNameAliases(List result) {
// return Arrays.asList(UCharacter.getPropertyName(propNum, NameChoice.LONG),
// UCharacter.getPropertyName(propNum, NameChoice.SHORT));
// }
//
// @Override
// protected String _getValue(int codepoint) {
// return UTF16.valueOf(UCharacter.getBidiPairedBracket(codepoint));
// }
// @Override
// protected UnicodeMap _getUnicodeMap() {
// // TODO Auto-generated method stub
// return super._getUnicodeMap();
// }
// }

// private static class Usage extends XEnumUnicodeProperty {
// enum UsageValues {common, historic, deprecated, liturgical, limited, symbol,
// punctuation, na;
// public static UsageValues getValue(int codepoint) {
// if (UnicodeProperty.SPECIALS.contains(codepoint)) return na;
// if (UnicodeUtilities.DEPRECATED.contains(codepoint)) return deprecated;
// if (UnicodeUtilities.LITURGICAL.contains(codepoint)) return liturgical;
// //if (ScriptCategoriesCopy.ARCHAIC.contains(codepoint)) return historic;
// //if (UnicodeUtilities.LIM.contains(codepoint)) return archaic;
// if (UnicodeUtilities.COMMON_USE_SCRIPTS.contains(codepoint)) {
// if (UnicodeUtilities.SYMBOL.contains(codepoint)) return symbol;
// if (UnicodeUtilities.PUNCTUATION.contains(codepoint)) return punctuation;
// return common;
// }
// return limited;
// }
// }
// public Usage() {
// super("Usage", UsageValues.values());
// setType(UnicodeProperty.EXTENDED_ENUMERATED);
// }
//
// @Override
// protected String _getValue(int codepoint) {
// return UsageValues.getValue(codepoint).toString();
// }
// }

static class HanType extends XEnumUnicodeProperty {
enum HanTypeValues {
na,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@
import java.util.EnumMap;
import java.util.EnumSet;
import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Locale;
Expand Down Expand Up @@ -263,7 +264,7 @@ public UnicodeMap<Double> loadDouble(UcdProperty prop2) {
|| prop2 == UcdProperty.kAccountingNumeric
|| prop2 == UcdProperty.kOtherNumeric) {
// Unicode 15.1+: A character may have multiple Unihan numeric values.
pos = v.indexOf(' ');
pos = v.indexOf('|');
if (pos >= 0) {
v = value.substring(0, pos);
}
Expand Down Expand Up @@ -839,11 +840,21 @@ public List<String> _getNameAliases(List result) {
}

@Override
protected List<String> _getAvailableValues(List result) {
protected List<String> _getAvailableValues(List<String> result) {
if (stringToNamedEnum != null) {
result.addAll(enumValueNames);
return result;
}
if (isMultivalued()) {
HashSet<String> valueSet = new HashSet<>();
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

A linked hash set (LinkedHashSet) preserves the original order

for (var value : _getUnicodeMap().getAvailableValues()) {
for (var part : delimiterSplitter.split(value)) {
valueSet.add(part);
}
}
result.addAll(valueSet);
return result;
}
return _getUnicodeMap().getAvailableValues(result);
}

Expand Down
34 changes: 22 additions & 12 deletions unicodetools/src/main/java/org/unicode/props/UcdProperty.java
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ public enum UcdProperty {
Numeric_Value(PropertyType.Numeric, "nv"),
kAccountingNumeric(PropertyType.Numeric, "cjkAccountingNumeric"),
kOtherNumeric(PropertyType.Numeric, "cjkOtherNumeric"),
kPrimaryNumeric(PropertyType.Numeric, "cjkPrimaryNumeric"),
kPrimaryNumeric(PropertyType.Numeric, null, ValueCardinality.Ordered, "cjkPrimaryNumeric"),

// String
Bidi_Mirroring_Glyph(PropertyType.String, "bmg"),
Expand Down Expand Up @@ -91,7 +91,11 @@ public enum UcdProperty {
Named_Sequences_Prov(PropertyType.Miscellaneous, "NSP"),
Standardized_Variant(PropertyType.Miscellaneous, null, ValueCardinality.Unordered, "SV"),
Unicode_1_Name(PropertyType.Miscellaneous, "na1"),
kAlternateTotalStrokes(PropertyType.Miscellaneous, "cjkAlternateTotalStrokes"),
kAlternateTotalStrokes(
PropertyType.Miscellaneous,
null,
ValueCardinality.Unordered,
"cjkAlternateTotalStrokes"),
kBigFive(PropertyType.Miscellaneous, "cjkBigFive"),
kCCCII(PropertyType.Miscellaneous, "cjkCCCII"),
kCNS1986(PropertyType.Miscellaneous, "cjkCNS1986"),
Expand All @@ -114,7 +118,7 @@ public enum UcdProperty {
kEH_IFAO(PropertyType.Miscellaneous, "kEH_IFAO"),
kEH_JSesh(PropertyType.Miscellaneous, "kEH_JSesh"),
kEH_UniK(PropertyType.Miscellaneous, "kEH_UniK"),
kFanqie(PropertyType.Miscellaneous, "cjkFanqie"),
kFanqie(PropertyType.Miscellaneous, null, ValueCardinality.Unordered, "cjkFanqie"),
kFenn(PropertyType.Miscellaneous, null, ValueCardinality.Unordered, "cjkFenn"),
kFennIndex(PropertyType.Miscellaneous, null, ValueCardinality.Unordered, "cjkFennIndex"),
kFourCornerCode(
Expand Down Expand Up @@ -154,7 +158,7 @@ public enum UcdProperty {
kIRG_VSource(PropertyType.Miscellaneous, "cjkIRG_VSource"),
kJIS0213(PropertyType.Miscellaneous, "cjkJIS0213"),
kJa(PropertyType.Miscellaneous, "cjkJa"),
kJapanese(PropertyType.Miscellaneous, "cjkJapanese"),
kJapanese(PropertyType.Miscellaneous, null, ValueCardinality.Unordered, "cjkJapanese"),
kJapaneseKun(PropertyType.Miscellaneous, null, ValueCardinality.Unordered, "cjkJapaneseKun"),
kJapaneseOn(PropertyType.Miscellaneous, null, ValueCardinality.Unordered, "cjkJapaneseOn"),
kJinmeiyoKanji(
Expand All @@ -180,7 +184,7 @@ public enum UcdProperty {
kMandarin(PropertyType.Miscellaneous, null, ValueCardinality.Ordered, "cjkMandarin"),
kMatthews(PropertyType.Miscellaneous, "cjkMatthews"),
kMeyerWempe(PropertyType.Miscellaneous, null, ValueCardinality.Unordered, "cjkMeyerWempe"),
kMojiJoho(PropertyType.Miscellaneous, "cjkMojiJoho"),
kMojiJoho(PropertyType.Miscellaneous, null, ValueCardinality.Unordered, "cjkMojiJoho"),
kMorohashi(PropertyType.Miscellaneous, "cjkMorohashi"),
kNelson(PropertyType.Miscellaneous, null, ValueCardinality.Unordered, "cjkNelson"),
kPhonetic(PropertyType.Miscellaneous, null, ValueCardinality.Unordered, "cjkPhonetic"),
Expand All @@ -201,31 +205,37 @@ public enum UcdProperty {
"URS"),
kReading(PropertyType.Miscellaneous, "kReading"),
kSBGY(PropertyType.Miscellaneous, null, ValueCardinality.Unordered, "cjkSBGY"),
kSMSZD2003Index(PropertyType.Miscellaneous, "cjkSMSZD2003Index"),
kSMSZD2003Readings(PropertyType.Miscellaneous, "cjkSMSZD2003Readings"),
kSMSZD2003Index(
PropertyType.Miscellaneous, null, ValueCardinality.Unordered, "cjkSMSZD2003Index"),
kSMSZD2003Readings(
PropertyType.Miscellaneous, null, ValueCardinality.Unordered, "cjkSMSZD2003Readings"),
kSemanticVariant(
PropertyType.Miscellaneous, null, ValueCardinality.Unordered, "cjkSemanticVariant"),
kSpecializedSemanticVariant(
PropertyType.Miscellaneous,
null,
ValueCardinality.Unordered,
"cjkSpecializedSemanticVariant"),
kSpoofingVariant(PropertyType.Miscellaneous, "cjkSpoofingVariant"),
kSpoofingVariant(
PropertyType.Miscellaneous, null, ValueCardinality.Unordered, "cjkSpoofingVariant"),
kSrc_NushuDuben(PropertyType.Miscellaneous, "kSrc_NushuDuben"),
kStrange(PropertyType.Miscellaneous, "cjkStrange"),
kStrange(PropertyType.Miscellaneous, null, ValueCardinality.Unordered, "cjkStrange"),
kTGH(PropertyType.Miscellaneous, null, ValueCardinality.Unordered, "cjkTGH"),
kTGHZ2013(PropertyType.Miscellaneous, "cjkTGHZ2013"),
kTGHZ2013(PropertyType.Miscellaneous, null, ValueCardinality.Unordered, "cjkTGHZ2013"),
kTGT_MergedSrc(PropertyType.Miscellaneous, "kTGT_MergedSrc"),
kTaiwanTelegraph(PropertyType.Miscellaneous, "cjkTaiwanTelegraph"),
kTang(PropertyType.Miscellaneous, null, ValueCardinality.Unordered, "cjkTang"),
kTotalStrokes(PropertyType.Miscellaneous, null, ValueCardinality.Ordered, "cjkTotalStrokes"),
kUnihanCore2020(PropertyType.Miscellaneous, "cjkUnihanCore2020"),
kVietnamese(PropertyType.Miscellaneous, null, ValueCardinality.Unordered, "cjkVietnamese"),
kVietnameseNumeric(PropertyType.Miscellaneous, "cjkVietnameseNumeric"),
kVietnameseNumeric(
PropertyType.Miscellaneous, null, ValueCardinality.Unordered, "cjkVietnameseNumeric"),
kXHC1983(PropertyType.Miscellaneous, null, ValueCardinality.Unordered, "cjkXHC1983"),
kXerox(PropertyType.Miscellaneous, "cjkXerox"),
kZVariant(PropertyType.Miscellaneous, "cjkZVariant"),
kZhuangNumeric(PropertyType.Miscellaneous, "cjkZhuangNumeric"),
kZhuang(PropertyType.Miscellaneous, null, ValueCardinality.Unordered, "cjkZhuang"),
kZhuangNumeric(
PropertyType.Miscellaneous, null, ValueCardinality.Unordered, "cjkZhuangNumeric"),

// Catalog
Age(PropertyType.Catalog, Age_Values.class, null, "age"),
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1579,6 +1579,7 @@ public static kEH_Core_Values forName(String name) {
// kVietnameseNumeric
// kXerox
// kXHC1983
// kZhuang
// kZhuangNumeric
// kZVariant
public enum Line_Break_Values implements Named {
Expand Down
13 changes: 11 additions & 2 deletions unicodetools/src/main/java/org/unicode/props/UnicodeProperty.java
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.HashSet;
Expand Down Expand Up @@ -157,7 +158,7 @@ public static synchronized void ResetCacheProperties() {
private boolean isMultivalued = false;

private String delimiter = ",";
private Splitter delimiterSplitter = Splitter.on(delimiter);
protected Splitter delimiterSplitter = Splitter.on(delimiter);

public UnicodeProperty setMultivalued(boolean value) {
isMultivalued = value;
Expand Down Expand Up @@ -263,6 +264,12 @@ public String getVersion() {
return _getVersion();
}

/**
 * Returns the value(s) of this property for the given code point as separate strings.
 *
 * <p>For a multivalued property, the string from {@link #getValue(int)} is split with the
 * delimiter splitter; otherwise the result is a singleton holding that string unchanged.
 *
 * @param codepoint the code point to look up
 * @return the individual values for {@code codepoint}
 */
public Iterable<String> getValues(int codepoint) {
    final String joined = getValue(codepoint);
    if (!isMultivalued) {
        return Collections.singleton(joined);
    }
    return delimiterSplitter.split(joined);
}

public String getValue(int codepoint) {
if (DEBUG && CHECK_VALUE == codepoint && CHECK_NAME.equals(getName())) {
String value = _getValue(codepoint);
Expand Down Expand Up @@ -290,8 +297,10 @@ public List<String> getValueAliases(String valueAlias, List<String> result) {
if (result == null) result = new ArrayList<>(1);
result = _getValueAliases(valueAlias, result);
if (!result.contains(valueAlias)) { // FIX && type < NUMERIC
if (type == MISC) {
if (type == MISC || type == NUMERIC) {
// Unihan has multivalued properties but does not use aliases.
// The concept of aliases does not really apply to numeric properties,
// but we should apply UAX44-LM1. We don’t, though.
result.add(valueAlias);
} else {
result = _getValueAliases(valueAlias, result); // for debugging
Expand Down
55 changes: 21 additions & 34 deletions unicodetools/src/main/java/org/unicode/text/UCD/UCD.java
Original file line number Diff line number Diff line change
Expand Up @@ -522,41 +522,28 @@ static class HanException {
}

private void populateHanExceptions(UnicodeProperty numeric) {
for (String value : numeric.getAvailableValues()) {
if (value == null || value.equals("NaN")) {
continue;
}
String propertyValue = Utility.replace(value, ",", "");
final int hack = propertyValue.indexOf(' ');
if (hack >= 0) {
Utility.fixDot();
if (SHOW_LOADING) {
System.out.println("BAD NUMBER: " + value);
}
propertyValue = propertyValue.substring(0, hack);
}

for (String s : numeric.getSet(value)) {
final int code = s.codePointAt(0);
// Unicode 15.1:
// This code had these two exceptions, but now U+4EAC actually has value
// 10000000000000000
// and we want to see that in DerivedNumericValues.txt,
// so we stop making these exceptions.
if (compositeVersion < 0xf0100 && (code == 0x5793 || code == 0x4EAC)) {
continue; // two exceptions!!
}

HanException except = (HanException) hanExceptions.get(code);
if (except != null) {
throw new IllegalArgumentException(
"Duplicate Numeric Value for U+" + Utility.hex(code));
}
except = new HanException();
hanExceptions.put(code, except);
except.numericValue = Double.parseDouble(propertyValue);
except.numericType = NUMERIC;
for (final int code : numeric.getSet("NaN").complement().codePoints()) {
// Unicode 15.1:
// This code had these two exceptions, but now U+4EAC actually has value
// 10000000000000000
// and we want to see that in DerivedNumericValues.txt,
// so we stop making these exceptions.
// NOTE(egg): These two exceptions (we are in a function called exceptions, so these are
// exceptions to the broader exception that is Han numeric values) were made irrelevant
// sometime before Unicode 5.2. See L2/03-094 for background.
if (compositeVersion < 0xf0100 && (code == 0x5793 || code == 0x4EAC)) {
continue; // two exceptions!!
}

HanException except = (HanException) hanExceptions.get(code);
if (except != null && false) {
throw new IllegalArgumentException(
"Duplicate Numeric Value for U+" + Utility.hex(code));
}
except = new HanException();
hanExceptions.put(code, except);
except.numericValue = Double.parseDouble(numeric.getValues(code).iterator().next());
except.numericType = NUMERIC;
}
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -167,6 +167,7 @@ cjkVietnameseNumeric ; kVietnameseNumeric
cjkZhuangNumeric ; kZhuangNumeric
# 16.0
cjkFanqie ; kFanqie
cjkZhuang ; kZhuang

kTGT_MergedSrc ; kTGT_MergedSrc
kRSTUnicode ; kRSTUnicode
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -105,7 +105,7 @@ kCCCII ; EXTENSIBLE ; [0-9A-F]{6}
kEACC ; SINGLE_VALUED ; [0-9A-F]{6}
kAccountingNumeric ; SINGLE_VALUED ; [0-9]+
kOtherNumeric ; SINGLE_VALUED ; [0-9]+
kPrimaryNumeric ; SINGLE_VALUED ; [0-9]+
kPrimaryNumeric ; ORDERED ; [0-9]+
kFenn ; MULTI_VALUED ; [0-9]+a?[A-KP*]
kCowles ; MULTI_VALUED ; [0-9]{1,4}(\.[0-9]{1,2})?
kXerox ; SINGLE_VALUED ; [0-9]{3}:[0-9]{3}
Expand Down Expand Up @@ -176,11 +176,29 @@ kKoreanEducationHanja ; MULTI_VALUED ; 20[0-9]{2}
kKoreanName ; MULTI_VALUED ; (20[0-9]{2})(:U\+2?[0-9A-F]{4})*
kTGH ; MULTI_VALUED ; 20[0-9]{2}:[1-9][0-9]{0,3}


kIRG_UKSource ; SINGLE_VALUED ; V[0-4]-[0-9A-F]{4}
kIRG_SSource ; SINGLE_VALUED ; V[0-4]-[0-9A-F]{4}


# Unihan properties from 13.0 and later. No regexes for now.
# TODO(egg): We should automate the updating of the regexes from UAX #38.
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Agreed.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ideally the fields from the table would be in a machine-readable format, and the table generated from them, and our usage also.

I initially generated it by dumping the table into a spreadsheet, then using formulæ to transform it a bit, e.g.:

Property kJis1
Status Provisional
Category Other Mappings
Introduced 2
Delimiter space
Syntax [0-9]{4}
Description The JIS X 0212-1990 mapping for this ideograph in row-cell form.

=>

kJapaneseKun Status Provisional
kJapaneseKun Category Readings
kJapaneseKun Introduced 2
kJapaneseKun Delimiter space
kJapaneseKun Syntax [A-Z]+
kJapaneseKun Description The Japanese pronunciation(s) of this ideograph in the Hepburn romanization.

Then extract the delimiter and syntax for each property; but then also check the text for the ones with delimiters to see whether they were ordered or not.

However, I didn't keep it up to date (obviously), so it needs a better process.

kSpoofingVariant ; MULTI_VALUED ; .*
kTGHZ2013 ; MULTI_VALUED ; .*
kUnihanCore2020 ; SINGLE_VALUED ; .*
# 14.0
kStrange ; MULTI_VALUED ; .*
# 15.0
kAlternateTotalStrokes ; MULTI_VALUED ; .*
# 15.1
kJapanese ; MULTI_VALUED ; .*
kMojiJoho ; MULTI_VALUED ; .*
kSMSZD2003Index ; MULTI_VALUED ; .*
kSMSZD2003Readings ; MULTI_VALUED ; .*
kVietnameseNumeric ; MULTI_VALUED ; .*
kZhuangNumeric ; MULTI_VALUED ; .*
# 16.0
kFanqie ; MULTI_VALUED ; .*
kZhuang ; MULTI_VALUED ; .*

# =============================
# Catalog/Enum/Binary Properties
# All not listed are SINGLE_VALUED ; null
Expand Down
Loading
Loading