diff --git a/tools/cldr-code/src/main/java/org/unicode/cldr/api/CldrDataSupplier.java b/tools/cldr-code/src/main/java/org/unicode/cldr/api/CldrDataSupplier.java index d75c01e898e..8e0da28354e 100644 --- a/tools/cldr-code/src/main/java/org/unicode/cldr/api/CldrDataSupplier.java +++ b/tools/cldr-code/src/main/java/org/unicode/cldr/api/CldrDataSupplier.java @@ -9,9 +9,9 @@ import com.google.common.collect.ImmutableSetMultimap; import com.google.common.collect.LinkedHashMultimap; import com.google.common.collect.Multimap; +import com.ibm.icu.util.ICUUncheckedIOException; import java.io.File; import java.io.IOException; -import java.io.UncheckedIOException; import java.nio.file.Files; import java.nio.file.Path; import java.util.Set; @@ -268,7 +268,7 @@ private static Stream listXmlFiles(Path dir) { try { return Files.walk(dir).filter(IS_XML_FILE); } catch (IOException e) { - throw new UncheckedIOException(e); + throw new ICUUncheckedIOException(e); } } diff --git a/tools/cldr-code/src/main/java/org/unicode/cldr/api/XmlDataSource.java b/tools/cldr-code/src/main/java/org/unicode/cldr/api/XmlDataSource.java index 3bc55b94dc8..6490dc43a22 100644 --- a/tools/cldr-code/src/main/java/org/unicode/cldr/api/XmlDataSource.java +++ b/tools/cldr-code/src/main/java/org/unicode/cldr/api/XmlDataSource.java @@ -14,9 +14,9 @@ import com.google.common.collect.ImmutableSortedMap; import com.google.common.collect.Maps; import com.google.common.collect.Multiset; +import com.ibm.icu.util.ICUUncheckedIOException; import java.io.IOException; import java.io.Reader; -import java.io.UncheckedIOException; import java.nio.file.Files; import java.nio.file.Path; import java.util.ArrayList; @@ -139,7 +139,7 @@ private static Reader openFile(Path p) { try { return Files.newBufferedReader(p); } catch (IOException e) { - throw new UncheckedIOException(e); + throw new ICUUncheckedIOException(e); } } @@ -158,7 +158,7 @@ private void read(ValueVisitor visitor, CldrDataType dtdType, boolean validating 
src.setSystemId(p.toString()); parseXml(xmlReader, src, p); } catch (IOException e) { - throw new UncheckedIOException(e); + throw new ICUUncheckedIOException(e); } } } diff --git a/tools/cldr-code/src/main/java/org/unicode/cldr/tool/GenerateAdditionalLikely.java b/tools/cldr-code/src/main/java/org/unicode/cldr/tool/GenerateAdditionalLikely.java deleted file mode 100644 index d1d1d73ff14..00000000000 --- a/tools/cldr-code/src/main/java/org/unicode/cldr/tool/GenerateAdditionalLikely.java +++ /dev/null @@ -1,540 +0,0 @@ -package org.unicode.cldr.tool; - -import com.google.common.base.Splitter; -import com.google.common.collect.ImmutableMap; -import com.google.common.collect.ImmutableSet; -import com.google.common.collect.LinkedHashMultimap; -import com.google.common.collect.Multimap; -import com.google.common.collect.TreeMultimap; -import com.ibm.icu.impl.Row; -import com.ibm.icu.lang.UScript; -import com.ibm.icu.text.UnicodeSet; -import com.ibm.icu.util.Output; -import java.io.IOException; -import java.io.UncheckedIOException; -import java.nio.file.Files; -import java.nio.file.Path; -import java.nio.file.Paths; -import java.util.Collection; -import java.util.List; -import java.util.Map; -import java.util.Map.Entry; -import java.util.Set; -import java.util.TreeMap; -import java.util.TreeSet; -import java.util.regex.Matcher; -import java.util.regex.Pattern; -import org.unicode.cldr.util.CLDRConfig; -import org.unicode.cldr.util.CLDRFile; -import org.unicode.cldr.util.CLDRFile.ExemplarType; -import org.unicode.cldr.util.CLDRPaths; -import org.unicode.cldr.util.CLDRTool; -import org.unicode.cldr.util.Factory; -import org.unicode.cldr.util.Iso639Data; -import org.unicode.cldr.util.Iso639Data.Type; -import org.unicode.cldr.util.LanguageTagParser; -import org.unicode.cldr.util.StandardCodes.LstrType; -import org.unicode.cldr.util.Validity; -import org.unicode.cldr.util.Validity.Status; - -/** TODO: Merge into GenerateMaximalLocales, see CLDR-16380 */ -@CLDRTool( - 
description = "Generate additional likely subtag data, see CLDR-16380", - url = "https://unicode-org.atlassian.net/browse/CLDR-16380", - alias = "generate-additional-likely") -public class GenerateAdditionalLikely { - - private static final String SIL = "sil1"; - private static final boolean ADD_SEED_EXEMPLARS = false; - - private static final CLDRConfig CLDR_CONFIG = CLDRConfig.getInstance(); - private static final Splitter UNDERBAR = Splitter.on('_'); - private static final Splitter TAB_SPLITTER = Splitter.on('\t'); - - private static final Factory factory = CLDR_CONFIG.getExemplarsFactory(); - private static final CLDRFile english = CLDR_CONFIG.getEnglish(); - private static final LanguageTagParser ltpFull = new LanguageTagParser(); - private static final LanguageTagParser ltpTag = new LanguageTagParser(); - private static final Validity validity = Validity.getInstance(); - - private static final Set LANGUAGE_REGULAR = - validity.getStatusToCodes(LstrType.language).get(Status.regular); - private static final Set SCRIPT_REGULAR = - validity.getStatusToCodes(LstrType.script).get(Status.regular); - private static final Set REGION_REGULAR = - validity.getStatusToCodes(LstrType.region).get(Status.regular); - - private static final Set LIKELY_SPECIALS = - ImmutableSet.of("in", "iw", "ji", "jw", "mo"); - private static final Set FIX_VALIDITY = ImmutableSet.of("Zanb"); - private static final Set FIX_COUNTRY = ImmutableSet.of("yi"); - - static class LSRSource implements Comparable { - final Row.R4 data; - - LSRSource(String lang, String script, String region, String source) { - if (script.contains("Soyo") || region.contains("Soyo")) { - int debug = 0; - } - data = Row.of(lang, script, region, source); - data.freeze(); - } - - @Override - public String toString() { - return combineLSR(data.get0(), data.get1(), data.get2()) + " // " + data.get3(); - } - - @Override - public int compareTo(LSRSource o) { - return data.compareTo(o.data); - } - - @Override - public int 
hashCode() { - return data.hashCode(); - } - - @Override - public boolean equals(Object obj) { - return data.equals(obj); - } - - public String line(String source) { - // TODO Auto-generated method stub - // - // - final String target = combineLSR(data.get0(), data.get1(), data.get2()); - final String origin = data.get3(); - final String result = - "" - + "\t"; - return result; - } - - public static String combineLSR(String lang, String script, String region) { - return lang - + (script.isEmpty() ? "" : "_" + script) - + (region.isEmpty() ? "" : "_" + region); - } - } - - private static boolean isOk( - String lang, String script, String region, Map errors) { - errors.clear(); - if (!LIKELY_SPECIALS.contains(lang)) { - check(LstrType.language, lang, errors); - } - if (!FIX_VALIDITY.contains(script)) { - check(LstrType.script, script, errors); - } - if (region.equals("001") && Iso639Data.getType(lang) == Type.Constructed) { - // ok - } else { - check(LstrType.region, region, errors); - } - return errors.isEmpty(); - } - - private static void check(LstrType lstrType, String lang, Map errors) { - final Status status = validity.getCodeToStatus(lstrType).get(lang); - if (status != Status.regular) { - errors.put(lstrType, status); - } - } - - private static class LikelySources { - private static LikelySources SINGLETON = new LikelySources(); - - public static Set getSources() { - return SINGLETON.alreadyLangs; - } - - final ImmutableSet alreadyLangs; - - private LikelySources() { - Map errors = new TreeMap<>(); - Map likely = CLDR_CONFIG.getSupplementalDataInfo().getLikelySubtags(); - Set _alreadyLangs = new TreeSet<>(); - _alreadyLangs.add("und"); - likely.forEach( - (key, value) -> { - String lang = ltpFull.set(value).getLanguage(); - String script = ltpFull.set(value).getScript(); - String region = ltpFull.set(value).getRegion(); - _alreadyLangs.add(lang); - if (!isOk(lang, script, region, errors)) { - showSkip("Skipping scope, CLDR", key, value, errors); - } - }); - 
System.out.println(); - alreadyLangs = ImmutableSet.copyOf(_alreadyLangs); - } - } - - static Multimap langToRegion; - - public static void main(String[] args) { - - Map result = new TreeMap<>(); - Map errors = new TreeMap<>(); - - Errors processErrors = new Errors(); - - langToRegion = readWikidata(LikelySources.getSources()); - readJson(LikelySources.getSources(), result, processErrors); - - processErrors.printAll(); - - if (ADD_SEED_EXEMPLARS) { - - for (String locale : factory.getAvailable()) { - CLDRFile file = factory.make(locale, false); - UnicodeSet exemplars = file.getExemplarSet(ExemplarType.main, null); - String lang = ltpFull.set(locale).getLanguage(); - if (!LikelySources.getSources().contains(lang)) { - String script = getScript(exemplars); - Collection regions = langToRegion.get(lang); - for (String region : regions) { - addIfOk(result, lang, lang, script, region, "wiki+exemplars", errors); - } - } - } - } - System.out.println(); - - Multimap defects = LinkedHashMultimap.create(); - - for (Entry entry : result.entrySet()) { - String source = entry.getKey(); - LSRSource lsrs = entry.getValue(); - String tagLang = ltpTag.set(source).getLanguage(); - if (!result.containsKey(tagLang)) { - defects.put(source, tagLang); - showError("Missing lang record", source, lsrs.toString(), "Needs\t" + tagLang); - } - } - - System.out.println("\nData to add: " + (result.entrySet().size() - defects.size()) + "\n"); - - for (Entry entry : result.entrySet()) { - String source = entry.getKey(); - if (defects.containsKey(source)) { - continue; - } - LSRSource lsrs = entry.getValue(); - System.out.println("\t\t" + lsrs.line(source)); - } - - // Multimap likelyAdditions = TreeMultimap.create(); - // System.out.println("\nAdd"); - // likelyAdditions.asMap().entrySet().forEach(x -> { - // String key = x.getKey(); - // if (x.getValue().size() == 1) { - // for (String value : x.getValue()) { - // System.out.println(key + "\t" + value + "\t" + infoFields(value)); - // } - // } - 
// } - // ); - // - // System.out.println("\nFix & Add"); - // - // likelyAdditions.asMap().entrySet().forEach(x -> { - // String key = x.getKey(); - // if (x.getValue().size() != 1) { - // for (String value : x.getValue()) { - // System.out.println(key + "\t" + value + "\t" + infoFields(value)); - // } - // System.out.println(); - // } - // } - // ); - - } - - static ImmutableMap remap = ImmutableMap.of("iw", "he", "jw", "jv"); - - private static void list(String string) { - for (String code : string.split(" ")) { - ltpFull.set(code.replace("-", "_")); - String lang = ltpFull.getLanguage(); - String cldrLang = remap.get(lang); - if (cldrLang != null) { - lang = cldrLang; - } - - System.out.println( - code - + "\t" - + english.getName(code) - + "\t" - + Iso639Data.getType(lang) - + "\t" - + Iso639Data.getScope(lang)); - } - System.out.println(); - } - - public static void showSkip( - String message, String source, String target, Map errors) { - showError(message, source, target, infoFields(target) + "\t" + errors); - } - - public static void showError(String message, String source, String target, String errors) { - System.out.println( - message + "\t" + source + " ➡ " + target + (errors.isEmpty() ? "" : "\t" + errors)); - } - - private static String infoFields(String value) { - int under = value.indexOf('_'); - String lang = under < 0 ? 
value : value.substring(0, under); - return english.getName(value) - + "\t" - + Iso639Data.getScope(lang) - + "\t" - + Iso639Data.getType(lang); - } - - // add , status - - // private static void handle(Entry original, Multimap - // likelyAdditions) { - // String source = original.getKey(); - // LSRSource lsr = original.getValue(); - // if (source.contains("_")) { - // int debug = 0; - // } - // // it is ok if there is a single LSR, eg - // // eg aaa Ghotuo {Latn={NG=[sil]}} - // // eg aak Ankave {Latn={PG=[sil, wiki+exemplars]}} - // - // for (Entry, String> entry : lsr.data) { - // addKeys(source, entry.getKey(), entry.getValue(), likelyAdditions); - // } - // } - - // private static void addKeys(String source, R3 r3, String comment, - // Multimap likelyAdditions) { - // likelyAdditions.put(source, r3.get0() + "_" + r3.get1() + "_" + r3.get2() + comment); - // } - - static final Pattern fullTagMatch = Pattern.compile("\\s*\"(full|tag)\": \"([^\"]+)\","); - - private static class Errors { - public enum Type { - ill_formed_tags("Ill-formed tags"), - already_CLDR("Language already in CLDR"), - tag_not_in_full("tag ⊄ full"), - exception("exception"); - private final String printable; - - private Type(String printable) { - this.printable = printable; - } - } - - public Multimap data = TreeMultimap.create(); - - public void put( - Type illFormedTags, String tagValue, String fullValue, String errorMessage) { - data.put( - illFormedTags, - tagValue - + " ➡ " - + fullValue - + (errorMessage == null || errorMessage.isEmpty() - ? 
"" - : "\t—\t" + errorMessage)); - } - - public void printAll() { - for (Entry> entry : data.asMap().entrySet()) { - Type type = entry.getKey(); - System.out.println(); - for (String message : entry.getValue()) { - System.out.println(type + "\t" + message); - } - } - } - } - - private static Map readJson( - Set alreadyLangs, Map result, Errors processErrors) { - Path path = Paths.get(CLDRPaths.BIRTH_DATA_DIR, "/../external/langtags.json"); - Matcher full = fullTagMatch.matcher(""); - Map errors = new TreeMap<>(); - - Output lastFull = new Output<>(); - try { - Files.lines(path) - .forEach( - x -> { - if (full.reset(x).matches()) { - final String key = full.group(1); - final String value = full.group(2).replace("-", "_"); - if (value.startsWith("aai")) { - int debug = 0; - } - switch (key) { - case "full": - lastFull.value = value; - break; - case "tag": - try { - String fullLang = - ltpFull.set(lastFull.value).getLanguage(); - if (alreadyLangs.contains(fullLang)) { - processErrors.put( - Errors.Type.already_CLDR, - value, - lastFull.value, - ""); - break; - } else if (isIllFormed(lastFull.value, ltpFull) - || isIllFormed(value, ltpTag.set(value))) { - processErrors.put( - Errors.Type.ill_formed_tags, - value, - lastFull.value, - ""); - } else { - String reference = SIL; - final String fullScript = ltpFull.getScript(); - String fullRegion = ltpFull.getRegion(); - if (fullRegion.equals("ZZ") - || fullRegion.equals("001")) { - Collection tempRegions = - langToRegion.get( - fullLang); // synthesize - if (!tempRegions.isEmpty()) { - fullRegion = - tempRegions.iterator().next(); - reference += " wikidata"; - } - } - - String tagLang = ltpTag.getLanguage(); - String tagScript = ltpTag.getScript(); - String tagRegion = ltpTag.getRegion(); - - if (!tagLang.equals(fullLang) - || (!tagScript.isEmpty() - && !tagScript.equals( - fullScript)) - || (!tagRegion.isEmpty() - && !tagRegion.equals( - fullRegion))) { - processErrors.put( - Errors.Type.tag_not_in_full, - value, - 
lastFull.value, - ""); - } else { - addIfOk( - result, - value, - fullLang, - fullScript, - fullRegion, - reference, - errors); - } - } - } catch (Exception e) { - processErrors.put( - Errors.Type.exception, - value, - lastFull.value, - e.getMessage()); - } - break; - default: - throw new IllegalArgumentException(); // never happens - } - } - }); - return result; - } catch (IOException ex) { - throw new UncheckedIOException(ex); - } - } - - private static boolean isIllFormed(String source, LanguageTagParser languageTagParser) { - return languageTagParser.getLanguage().isEmpty() - || !languageTagParser.getVariants().isEmpty() - || !languageTagParser.getExtensions().isEmpty() - || !languageTagParser.getLocaleExtensions().isEmpty() - || source.contains("@"); - } - - private static void addIfOk( - Map result, - String source, - String lang, - final String script, - final String region, - String reference, - Map errors) { - if (isOk(lang, script, region, errors)) { - add(result, source, lang, script, region, reference); - } else { - showSkip("Skipping scope, SIL", source, ltpFull.toString(), errors); - } - } - - private static Multimap readWikidata(Set alreadyLangs) { - Multimap result = TreeMultimap.create(); - Path path = Paths.get(CLDRPaths.BIRTH_DATA_DIR, "/../external/wididata_lang_region.tsv"); - try { - Files.lines(path) - .forEach( - x -> { - if (!x.startsWith("#")) { - List list = TAB_SPLITTER.splitToList(x); - String lang = list.get(1); - String region = list.get(3); - result.put(lang, region); - } - }); - } catch (IOException ex) { - throw new UncheckedIOException(ex); - } - return result; - } - - private static void add( - Map result, - String source, - String lang, - final String script, - final String region, - String reference) { - LSRSource old = result.get(source); - LSRSource newVersion = new LSRSource(lang, script, region, reference); - if (old != null && !old.equals(newVersion)) { - throw new IllegalArgumentException( - "Data already exists for " + 
source + ": old=" + old + ", new: " + newVersion); - } - result.put(source, newVersion); - } - - private static String getScript(UnicodeSet exemplars) { - for (String s : exemplars) { - int scriptNum = UScript.getScript(s.codePointAt(0)); - if (scriptNum != UScript.COMMON && scriptNum != UScript.INHERITED) { - return UScript.getShortName(scriptNum); - } - } - return "Zxxx"; - } -} diff --git a/tools/cldr-code/src/main/java/org/unicode/cldr/tool/GenerateLikelySubtagTests.java b/tools/cldr-code/src/main/java/org/unicode/cldr/tool/GenerateLikelySubtagTests.java index 73c40e3561b..6bc9f8b388d 100644 --- a/tools/cldr-code/src/main/java/org/unicode/cldr/tool/GenerateLikelySubtagTests.java +++ b/tools/cldr-code/src/main/java/org/unicode/cldr/tool/GenerateLikelySubtagTests.java @@ -6,17 +6,21 @@ import java.util.Map; import java.util.Set; import org.unicode.cldr.draft.FileUtilities; -import org.unicode.cldr.tool.GenerateMaximalLocales.OutputStyle; +import org.unicode.cldr.tool.GenerateLikelySubtags.OutputStyle; +import org.unicode.cldr.util.CLDRConfig; import org.unicode.cldr.util.CLDRFile; import org.unicode.cldr.util.CLDRPaths; import org.unicode.cldr.util.CldrUtility; import org.unicode.cldr.util.LanguageTagParser; +import org.unicode.cldr.util.LocaleNames; import org.unicode.cldr.util.SupplementalDataInfo; public class GenerateLikelySubtagTests { private static final String SEPARATOR = CldrUtility.LINE_SEPARATOR; private static final OutputStyle OUTPUT_STYLE = OutputStyle.XML; private static PrintWriter out; + private static CLDRConfig CONFIG = CLDRConfig.getInstance(); + private static CLDRFile ENGLISH = CONFIG.getEnglish(); private static final String VERSION = CLDRFile.GEN_VERSION; @@ -133,7 +137,7 @@ private static void writeTestLine2( } private static String printNameOrError(final String maxFrom) { - String result = GenerateMaximalLocales.printingName(maxFrom, ""); + String result = printingName(maxFrom, ""); if (result == null) { return "ERROR"; } @@ -141,7 
+145,7 @@ private static String printNameOrError(final String maxFrom) { } private static String getNameOrError(final String from) { - String result = GenerateMaximalLocales.toAlt(from, true); + String result = toAlt(from, true); if (result == null) { return "ERROR"; } @@ -155,4 +159,58 @@ private static String getItem(String from) { } return "\"" + toAlt + "\""; } + + private static final String[][] ALT_REVERSAL = { + // { "no", "nb" }, + // { "nb", "no" }, + {"he", "iw"}, + {"iw", "he"}, + }; + + public static String toAlt(String locale, boolean change) { + if (!change || locale == null) { + return locale; + } + String firstTag = getFirstTag(locale); + for (String[] pair : ALT_REVERSAL) { + if (firstTag.equals(pair[0])) { + locale = pair[1] + locale.substring(pair[1].length()); + break; + } + } + locale = locale.replace("_", "-"); + return locale; + } + + private static String getFirstTag(String locale) { + int pos = locale.indexOf('_'); + return pos < 0 ? locale : locale.substring(0, pos); + } + + public static String printingName(String locale, String spacing) { + if (locale == null) { + return null; + } + LanguageTagParser parser = new LanguageTagParser().set(locale); + String lang = parser.getLanguage(); + String script = parser.getScript(); + String region = parser.getRegion(); + return "{" + + spacing + + (lang.equals(LocaleNames.UND) + ? "?" + : ENGLISH.getName(CLDRFile.LANGUAGE_NAME, lang)) + + ";" + + spacing + + (script == null || script.equals("") + ? "?" + : ENGLISH.getName(CLDRFile.SCRIPT_NAME, script)) + + ";" + + spacing + + (region == null || region.equals("") + ? "?" 
+ : ENGLISH.getName(CLDRFile.TERRITORY_NAME, region)) + + spacing + + "}"; + } } diff --git a/tools/cldr-code/src/main/java/org/unicode/cldr/tool/GenerateLikelySubtags.java b/tools/cldr-code/src/main/java/org/unicode/cldr/tool/GenerateLikelySubtags.java index 73cb1248901..5619deb1886 100644 --- a/tools/cldr-code/src/main/java/org/unicode/cldr/tool/GenerateLikelySubtags.java +++ b/tools/cldr-code/src/main/java/org/unicode/cldr/tool/GenerateLikelySubtags.java @@ -31,8 +31,6 @@ import org.unicode.cldr.draft.FileUtilities; import org.unicode.cldr.draft.ScriptMetadata; import org.unicode.cldr.draft.ScriptMetadata.Info; -import org.unicode.cldr.tool.GenerateMaximalLocales.LocaleOverride; -import org.unicode.cldr.tool.GenerateMaximalLocales.LocaleStringComparator; import org.unicode.cldr.tool.LangTagsData.Errors; import org.unicode.cldr.tool.Option.Options; import org.unicode.cldr.tool.Option.Params; @@ -63,6 +61,18 @@ */ public class GenerateLikelySubtags { + public enum OutputStyle { + PLAINTEXT, + C, + C_ALT, + XML + } + + public enum LocaleOverride { + KEEP_EXISTING, + REPLACE_EXISTING + } + private static final Joiner JOIN_TAB = Joiner.on('\t').useForNull("∅"); private static final CLDRConfig CLDR_CONFIG = CLDRConfig.getInstance(); @@ -626,11 +636,8 @@ public static String getPart(CLDRLocale loc, LstrType type) { } } - /** - * Compare locales, first by count of components (handling und), then by language, script, and - * finally region - */ - static Comparator LOCALE_SOURCE = + /** Compare locales, putting und.* last. 
*/ + public static Comparator LOCALE_SOURCE = new Comparator<>() { @Override @@ -640,7 +647,6 @@ public int compare(String locale1, String locale2) { // sort items with 0 components first, then 1, then 2 (there won't be 3) int result = ComparisonChain.start() - // .compare(getMask(l1), getMask(l2)) .compare(getLanguage(l1), getLanguage(l2)) .compare(getScript(l1), getScript(l2)) .compare(getRegion(l1), getRegion(l2)) @@ -686,13 +692,6 @@ public static String getNameSafe(String oldValue) { return "n/a"; } - enum OutputStyle { - PLAINTEXT, - C, - C_ALT, - XML - } - private static OutputStyle OUTPUT_STYLE = OutputStyle.valueOf(CldrUtility.getProperty("OutputStyle", "XML", "XML").toUpperCase()); @@ -1710,7 +1709,7 @@ public static void printLine( Map origins, boolean first, PrintWriter out) { - Set keys = new TreeSet<>(new LocaleStringComparator()); + Set keys = new TreeSet<>(LOCALE_SOURCE); keys.addAll(toPrint.keySet()); boolean noUndYet = true; for (String printingLocale : keys) { diff --git a/tools/cldr-code/src/main/java/org/unicode/cldr/tool/GenerateMaximalLocales.java b/tools/cldr-code/src/main/java/org/unicode/cldr/tool/GenerateMaximalLocales.java deleted file mode 100644 index b74ed4bc018..00000000000 --- a/tools/cldr-code/src/main/java/org/unicode/cldr/tool/GenerateMaximalLocales.java +++ /dev/null @@ -1,2480 +0,0 @@ -package org.unicode.cldr.tool; - -import com.google.common.base.Joiner; -import com.google.common.collect.ImmutableList; -import com.google.common.collect.ImmutableMap; -import com.google.common.collect.ImmutableSet; -import com.ibm.icu.impl.Relation; -import com.ibm.icu.impl.Row; -import com.ibm.icu.impl.Row.R2; -import com.ibm.icu.impl.Row.R3; -import com.ibm.icu.impl.Row.R4; -import com.ibm.icu.lang.UScript; -import com.ibm.icu.text.Collator; -import com.ibm.icu.text.NumberFormat; -import com.ibm.icu.text.UTF16; -import com.ibm.icu.text.UnicodeSet; -import com.ibm.icu.text.UnicodeSetIterator; -import com.ibm.icu.util.ULocale; -import 
java.io.BufferedReader; -import java.io.File; -import java.io.IOException; -import java.io.PrintWriter; -import java.nio.file.Files; -import java.util.Arrays; -import java.util.BitSet; -import java.util.Collection; -import java.util.Comparator; -import java.util.HashMap; -import java.util.HashSet; -import java.util.LinkedHashSet; -import java.util.List; -import java.util.Map; -import java.util.Map.Entry; -import java.util.Set; -import java.util.TreeMap; -import java.util.TreeSet; -import org.unicode.cldr.draft.FileUtilities; -import org.unicode.cldr.draft.ScriptMetadata; -import org.unicode.cldr.draft.ScriptMetadata.Info; -import org.unicode.cldr.util.Builder; -import org.unicode.cldr.util.CLDRConfig; -import org.unicode.cldr.util.CLDRFile; -import org.unicode.cldr.util.CLDRLocale; -import org.unicode.cldr.util.CLDRPaths; -import org.unicode.cldr.util.CldrUtility; -import org.unicode.cldr.util.Containment; -import org.unicode.cldr.util.Counter; -import org.unicode.cldr.util.Factory; -import org.unicode.cldr.util.Iso3166Data; -import org.unicode.cldr.util.Iso639Data; -import org.unicode.cldr.util.Iso639Data.Scope; -import org.unicode.cldr.util.LanguageTagParser; -import org.unicode.cldr.util.LocaleIDParser; -import org.unicode.cldr.util.LocaleNames; -import org.unicode.cldr.util.Organization; -import org.unicode.cldr.util.PatternCache; -import org.unicode.cldr.util.SimpleFactory; -import org.unicode.cldr.util.StandardCodes; -import org.unicode.cldr.util.StandardCodes.LstrType; -import org.unicode.cldr.util.SupplementalDataInfo; -import org.unicode.cldr.util.SupplementalDataInfo.BasicLanguageData; -import org.unicode.cldr.util.SupplementalDataInfo.BasicLanguageData.Type; -import org.unicode.cldr.util.SupplementalDataInfo.OfficialStatus; -import org.unicode.cldr.util.SupplementalDataInfo.PopulationData; -import org.unicode.cldr.util.Validity; -import org.unicode.cldr.util.Validity.Status; - -/** - * Problems: "und_Hani", "zh_Hani" "und_Sinh", "si_Sinh" - * - * @author 
markdavis - */ -public class GenerateMaximalLocales { - - private static final Map LANGUAGE_CODE_TO_STATUS = - Validity.getInstance().getCodeToStatus(LstrType.language); - - private static final String TEMP_UNKNOWN_REGION = "XZ"; - - private static final String DEBUG_ADD_KEY = "und_Latn_ZA"; - - private static final boolean SHOW_ADD = - CldrUtility.getProperty("GenerateMaximalLocalesDebug", false); - private static final boolean SUPPRESS_CHANGES = - CldrUtility.getProperty("GenerateMaximalLocalesSuppress", false); - private static final boolean SHOW_CONTAINERS = false; - - private static final boolean SHOW_ALL_LANGUAGE_CODES = false; - private static final boolean SHOW_DETAILED = false; - private static final boolean SHOW_INCLUDED_EXCLUDED = false; - - enum OutputStyle { - PLAINTEXT, - C, - C_ALT, - XML - } - - private static OutputStyle OUTPUT_STYLE = - OutputStyle.valueOf(CldrUtility.getProperty("OutputStyle", "XML", "XML").toUpperCase()); - - // set based on above - private static final String SEPARATOR = - OUTPUT_STYLE == OutputStyle.C || OUTPUT_STYLE == OutputStyle.C_ALT - ? CldrUtility.LINE_SEPARATOR - : "\t"; - private static final String TAG_SEPARATOR = OUTPUT_STYLE == OutputStyle.C_ALT ? 
"-" : "_"; - // private static final boolean FAVOR_REGION = true; // OUTPUT_STYLE == OutputStyle.C_ALT; - - private static final boolean tryDifferent = true; - - private static final File list[] = { - new File(CLDRPaths.MAIN_DIRECTORY), - new File(CLDRPaths.SEED_DIRECTORY), - new File(CLDRPaths.EXEMPLARS_DIRECTORY) - }; - - private static Factory factory = SimpleFactory.make(list, ".*"); - private static Factory mainFactory = CLDRConfig.getInstance().getCldrFactory(); - private static SupplementalDataInfo supplementalData = - SupplementalDataInfo.getInstance(CLDRPaths.SUPPLEMENTAL_DIRECTORY); - private static StandardCodes standardCodes = StandardCodes.make(); - private static CLDRFile english = factory.make("en", false); - static Relation cldrContainerToLanguages = - Relation.of(new HashMap>(), HashSet.class); - - static { - for (CLDRLocale locale : - ToolConfig.getToolInstance().getCldrFactory().getAvailableCLDRLocales()) { - String region = locale.getCountry(); - if (region == null || region.isEmpty() || Containment.isLeaf(region)) { - continue; - } - cldrContainerToLanguages.put(region, locale.getLanguage()); - } - cldrContainerToLanguages.freeze(); - System.out.println("Keep containers " + cldrContainerToLanguages); - } - - private static final List KEEP_TARGETS = - Arrays.asList("und_Arab_PK", "und_Latn_ET", "hi_Latn"); - private static final ImmutableSet deprecatedISONotInLST = ImmutableSet.of("scc", "scr"); - - /** - * This is the simplest way to override, by supplying the max value. It gets a very low weight, - * so doesn't override any stronger value. 
- */ - private static final String[] MAX_ADDITIONS = - new String[] { - "bss_Latn_CM", - "gez_Ethi_ET", - "ken_Latn_CM", - "und_Arab_PK", - "wa_Latn_BE", - "fub_Arab_CM", - "fuf_Latn_GN", - "kby_Arab_NE", - "kdh_Latn_TG", - "apd_Arab_TG", - "zlm_Latn_TG", - "cr_Cans_CA", - "hif_Latn_FJ", - "gon_Telu_IN", - "lzz_Latn_TR", - "lif_Deva_NP", - "unx_Beng_IN", - "unr_Beng_IN", - "ttt_Latn_AZ", - "pnt_Grek_GR", - "tly_Latn_AZ", - "tkr_Latn_AZ", - "bsq_Bass_LR", - "ccp_Cakm_BD", - "blt_Tavt_VN", - "rhg_Arab_MM", - "rhg_Rohg_MM", - "clc_Latn_CA", - "crg_Latn_CA", - "hur_Latn_CA", - "kwk_Latn_CA", - "lil_Latn_CA", - "ojs_Cans_CA", - "oka_Latn_CA", - "pqm_Latn_CA", - "hi_Latn_IN", - "no_Latn_NO", - "tok_Latn_001", - "prg_Latn_PL", - "ie_Latn_EE", - }; - - /** - * The following overrides MASH the final values, so they may not result in consistent results. - * Safer is to add to MAX_ADDITIONS. However, if you add, add both the language and - * language+script mappings. - */ - // Many of the overrides below can be removed once the language/pop/country data is updated. 
- private static final Map LANGUAGE_OVERRIDES = - CldrUtility.asMap( - new String[][] { - {"cic", "cic_Latn_US"}, - {"cic_Latn", "cic_Latn_US"}, - {"eo", "eo_Latn_001"}, - {"eo_Latn", "eo_Latn_001"}, - {"es", "es_Latn_ES"}, - {"es_Latn", "es_Latn_ES"}, - {"ff_BF", "ff_Latn_BF"}, - {"ff_GM", "ff_Latn_GM"}, - {"ff_GH", "ff_Latn_GH"}, - {"ff_GW", "ff_Latn_GW"}, - {"ff_LR", "ff_Latn_LR"}, - {"ff_NE", "ff_Latn_NE"}, - {"ff_NG", "ff_Latn_NG"}, - {"ff_SL", "ff_Latn_SL"}, - {"ff_Adlm", "ff_Adlm_GN"}, - {"ia", "ia_Latn_001"}, - {"ia_Latn", "ia_Latn_001"}, - {"io", "io_Latn_001"}, - {"io_Latn", "io_Latn_001"}, - {"jbo", "jbo_Latn_001"}, - {"jbo_Latn", "jbo_Latn_001"}, - {"ku_Arab", "ku_Arab_IQ"}, - {"lrc", "lrc_Arab_IR"}, - {"lrc_Arab", "lrc_Arab_IR"}, - {"man", "man_Latn_GM"}, - {"man_Latn", "man_Latn_GM"}, - {"mas", "mas_Latn_KE"}, - {"mas_Latn", "mas_Latn_KE"}, - {"mn", "mn_Cyrl_MN"}, - {"mn_Cyrl", "mn_Cyrl_MN"}, - {"mro", "mro_Mroo_BD"}, - {"mro_BD", "mro_Mroo_BD"}, - {"ms_Arab", "ms_Arab_MY"}, - {"pap", "pap_Latn_CW"}, - {"pap_Latn", "pap_Latn_CW"}, - { - "rif", "rif_Latn_MA" - }, // https://unicode-org.atlassian.net/browse/CLDR-14962?focusedCommentId=165053 - {"rif_Latn", "rif_Latn_MA"}, - {"rif_Tfng", "rif_Tfng_MA"}, - {"rif_MA", "rif_Latn_MA"}, // Ibid - {"shi", "shi_Tfng_MA"}, - {"shi_Tfng", "shi_Tfng_MA"}, - {"shi_MA", "shi_Tfng_MA"}, - {"sr_Latn", "sr_Latn_RS"}, - {"ss", "ss_Latn_ZA"}, - {"ss_Latn", "ss_Latn_ZA"}, - {"swc", "swc_Latn_CD"}, - {"ti", "ti_Ethi_ET"}, - {"ti_Ethi", "ti_Ethi_ET"}, - {LocaleNames.UND, "en_Latn_US"}, - {"und_Adlm", "ff_Adlm_GN"}, - {"und_Adlm_GN", "ff_Adlm_GN"}, - {"und_Arab", "ar_Arab_EG"}, - {"und_Arab_PK", "ur_Arab_PK"}, - {"und_Bopo", "zh_Bopo_TW"}, - {"und_Deva_FJ", "hif_Deva_FJ"}, - {"und_EZ", "de_Latn_EZ"}, - {"und_Hani", "zh_Hani_CN"}, - {"und_Hani_CN", "zh_Hani_CN"}, - {"und_Kana", "ja_Kana_JP"}, - {"und_Kana_JP", "ja_Kana_JP"}, - {"und_Latn", "en_Latn_US"}, - {"und_001", "en_Latn_US"}, // to not be overridden by tok_Latn_001 - 
{"und_Latn_001", "en_Latn_US"}, // to not be overridden by tok_Latn_001 - {"und_Latn_ET", "en_Latn_ET"}, - {"und_Latn_NE", "ha_Latn_NE"}, - {"und_Latn_PH", "fil_Latn_PH"}, - {"und_ML", "bm_Latn_ML"}, - {"und_Latn_ML", "bm_Latn_ML"}, - {"und_MU", "mfe_Latn_MU"}, - {"und_NE", "ha_Latn_NE"}, - {"und_PH", "fil_Latn_PH"}, - {"und_PK", "ur_Arab_PK"}, - {"und_SO", "so_Latn_SO"}, - {"und_SS", "en_Latn_SS"}, - {"und_TK", "tkl_Latn_TK"}, - {"und_UN", "en_Latn_UN"}, - {"und_005", "pt_Latn_BR"}, - {"vo", "vo_Latn_001"}, - {"vo_Latn", "vo_Latn_001"}, - {"yi", "yi_Hebr_001"}, - {"yi_Hebr", "yi_Hebr_001"}, - {"yue", "yue_Hant_HK"}, - {"yue_Hant", "yue_Hant_HK"}, - {"yue_Hans", "yue_Hans_CN"}, - {"yue_CN", "yue_Hans_CN"}, - {"zh_Hani", "zh_Hani_CN"}, - {"zh_Bopo", "zh_Bopo_TW"}, - {"ccp", "ccp_Cakm_BD"}, - {"ccp_Cakm", "ccp_Cakm_BD"}, - {"und_Cakm", "ccp_Cakm_BD"}, - {"cu_Glag", "cu_Glag_BG"}, - {"sd_Khoj", "sd_Khoj_IN"}, - {"lif_Limb", "lif_Limb_IN"}, - {"grc_Linb", "grc_Linb_GR"}, - {"arc_Nbat", "arc_Nbat_JO"}, - {"arc_Palm", "arc_Palm_SY"}, - {"pal_Phlp", "pal_Phlp_CN"}, - {"en_Shaw", "en_Shaw_GB"}, - {"sd_Sind", "sd_Sind_IN"}, - {"und_Brai", "fr_Brai_FR"}, // hack - {"und_Hanb", "zh_Hanb_TW"}, // Special script code - {"zh_Hanb", "zh_Hanb_TW"}, // Special script code - {"und_Jamo", "ko_Jamo_KR"}, // Special script code - - // {"und_Cyrl_PL", "be_Cyrl_PL"}, - - // {"cr", "cr_Cans_CA"}, - // {"hif", "hif_Latn_FJ"}, - // {"gon", "gon_Telu_IN"}, - // {"lzz", "lzz_Latn_TR"}, - // {"lif", "lif_Deva_NP"}, - // {"unx", "unx_Beng_IN"}, - // {"unr", "unr_Beng_IN"}, - // {"ttt", "ttt_Latn_AZ"}, - // {"pnt", "pnt_Grek_GR"}, - // {"tly", "tly_Latn_AZ"}, - // {"tkr", "tkr_Latn_AZ"}, - // {"bsq", "bsq_Bass_LR"}, - // {"ccp", "ccp_Cakm_BD"}, - // {"blt", "blt_Tavt_VN"}, - // { "mis_Medf", "mis_Medf_NG" }, - - {"ku_Yezi", "ku_Yezi_GE"}, - {"und_EU", "en_Latn_IE"}, - {"hnj", "hnj_Hmnp_US"}, // preferred lang/script in CLDR - {"hnj_Hmnp", "hnj_Hmnp_US"}, - {"und_Hmnp", "hnj_Hmnp_US"}, - {"rhg", 
"rhg_Rohg_MM"}, // preferred lang/script in CLDR - {"rhg_Arab", "rhg_Arab_MM"}, - {"und_Arab_MM", "rhg_Arab_MM"}, - {"sd_IN", "sd_Deva_IN"}, // preferred in CLDR - // { "sd_Deva", "sd_Deva_IN"}, - {"und_Cpmn", "und_Cpmn_CY"}, - {"oc_ES", "oc_Latn_ES"}, - {"os", "os_Cyrl_GE"}, - {"os_Cyrl", "os_Cyrl_GE"}, - }); - - /** - * The following supplements the suppress-script. It overrides info from exemplars and the - * locale info. - */ - private static String[][] SpecialScripts = { - {"zh", "Hans"}, // Hans (not Hani) - {"yue", "Hant"}, // Hans (not Hani) - {"chk", "Latn"}, // Chuukese (Micronesia) - {"fil", "Latn"}, // Filipino (Philippines)" - {"ko", "Kore"}, // Korean (North Korea) - {"ko_KR", "Kore"}, // Korean (North Korea) - {"pap", "Latn"}, // Papiamento (Netherlands Antilles) - {"pau", "Latn"}, // Palauan (Palau) - {"su", "Latn"}, // Sundanese (Indonesia) - {"tet", "Latn"}, // Tetum (East Timor) - {"tk", "Latn"}, // Turkmen (Turkmenistan) - {"ty", "Latn"}, // Tahitian (French Polynesia) - {"ja", "Jpan"}, // Special script for japan - {LocaleNames.UND, "Latn"}, // Ultimate fallback - }; - - private static Map localeToScriptCache = new TreeMap<>(); - - static { - for (String language : standardCodes.getAvailableCodes("language")) { - Map info = standardCodes.getLangData("language", language); - String script = info.get("Suppress-Script"); - if (script != null) { - localeToScriptCache.put(language, script); - } - } - for (String[] pair : SpecialScripts) { - localeToScriptCache.put(pair[0], pair[1]); - } - } - - private static Map FALLBACK_SCRIPTS; - - static { - LanguageTagParser additionLtp = new LanguageTagParser(); - Map _FALLBACK_SCRIPTS = new TreeMap<>(); - for (String addition : MAX_ADDITIONS) { - additionLtp.set(addition); - String lan = additionLtp.getLanguage(); - _FALLBACK_SCRIPTS.put(lan, additionLtp.getScript()); - } - FALLBACK_SCRIPTS = ImmutableMap.copyOf(_FALLBACK_SCRIPTS); - } - - private static int errorCount; - - public static void main(String[] 
args) throws IOException { - if (true) { - throw new IllegalArgumentException("Don't run this tool until it is fixed"); - } - - printDefaultLanguagesAndScripts(); - - Map toMaximized = new TreeMap<>(); - - tryDifferentAlgorithm(toMaximized); - - minimize(toMaximized); - - // HACK TEMP_UNKNOWN_REGION - // this is to get around the removal of items with ZZ in minimize. - // probably cleaner way to do it, but this provides control over just those we want to - // retain. - Set toRemove = new TreeSet<>(); - Map toFix = new TreeMap<>(); - for (Entry entry : toMaximized.entrySet()) { - String key = entry.getKey(); - String value = entry.getValue(); - if (key.contains(TEMP_UNKNOWN_REGION)) { - toRemove.add(key); - } else if (value.contains(TEMP_UNKNOWN_REGION)) { - toFix.put(key, value.replace(TEMP_UNKNOWN_REGION, UNKNOWN_REGION)); - } - } - for (String key : toRemove) { - toMaximized.remove(key); - } - toMaximized.putAll(toFix); - - Map oldLikely = SupplementalDataInfo.getInstance().getLikelySubtags(); - Set changes = - compareMapsAndFixNew( - "*WARNING* Likely Subtags: ", - oldLikely, - toMaximized, - "ms_Arab", - "ms_Arab_ID"); - System.out.println(Joiner.on("\n").join(changes)); - - if (OUTPUT_STYLE == OutputStyle.C_ALT) { - doAlt(toMaximized); - } - - if (SHOW_ADD) - System.out.println( - "/*" - + CldrUtility.LINE_SEPARATOR - + " To Maximize:" - + CldrUtility.LINE_SEPARATOR - + " If using raw strings, make sure the input language/locale uses the right separator, and has the right casing." - + CldrUtility.LINE_SEPARATOR - + " Remove the script Zzzz and the region ZZ if they occur; change an empty language subtag to 'und'." - + CldrUtility.LINE_SEPARATOR - + " Get the language, region, and script from the cleaned-up tag, plus any variants/extensions" - + CldrUtility.LINE_SEPARATOR - + " Try each of the following in order (where the field exists)" - + CldrUtility.LINE_SEPARATOR - + " Lookup language-script-region. 
If in the table, return the result + variants" - + CldrUtility.LINE_SEPARATOR - + " Lookup language-script. If in the table, return the result (substituting the original region if it exists) + variants" - + CldrUtility.LINE_SEPARATOR - + " Lookup language-region. If in the table, return the result (substituting the original script if it exists) + variants" - + CldrUtility.LINE_SEPARATOR - + " Lookup language. If in the table, return the result (substituting the original region and script if either or both exist) + variants" - + CldrUtility.LINE_SEPARATOR - + CldrUtility.LINE_SEPARATOR - + " Example: Input is zh-ZZZZ-SG." - + CldrUtility.LINE_SEPARATOR - + " Normalize to zh-SG. Lookup in table. No match." - + CldrUtility.LINE_SEPARATOR - + " Remove SG, but remember it. Lookup zh, and get the match (zh-Hans-CN). Substitute SG, and return zh-Hans-SG." - + CldrUtility.LINE_SEPARATOR - + CldrUtility.LINE_SEPARATOR - + " To Minimize:" - + CldrUtility.LINE_SEPARATOR - + " First get max = maximize(input)." - + CldrUtility.LINE_SEPARATOR - + " Then for trial in {language, language-region, language-script}" - + CldrUtility.LINE_SEPARATOR - + " If maximize(trial) == max, then return trial." - + CldrUtility.LINE_SEPARATOR - + " If you don't get a match, return max." - + CldrUtility.LINE_SEPARATOR - + CldrUtility.LINE_SEPARATOR - + " Example: Input is zh-Hant. Maximize to get zh-Hant-TW." - + CldrUtility.LINE_SEPARATOR - + " zh => zh-Hans-CN. No match, so continue." - + CldrUtility.LINE_SEPARATOR - + " zh-TW => zh-Hans-TW. Match, so return zh-TW." - + CldrUtility.LINE_SEPARATOR - + CldrUtility.LINE_SEPARATOR - + " (A variant of this uses {language, language-script, language-region}): that is, tries script before language." 
- + CldrUtility.LINE_SEPARATOR - + " toMaximal size:\t" - + toMaximized.size() - + CldrUtility.LINE_SEPARATOR - + "*/"); - - final File newLikelySubtags = printLikelySubtags(toMaximized); - - printDefaultContent(toMaximized); - - // Do this here so the two "Copying…" messages show up together. - if (OUTPUT_STYLE == OutputStyle.XML) { - final File oldLikelySubtags = - CLDRConfig.getInstance().getEnglish().getSupplementalFile("likelySubtags.xml"); - System.out.println("Copying " + newLikelySubtags + " to " + oldLikelySubtags); - oldLikelySubtags.delete(); - Files.copy(newLikelySubtags.toPath(), oldLikelySubtags.toPath()); - System.err.println("TODO: Please revert removal of 'sil1' entries, see CLDR-16380"); - } - - System.out.println( - CldrUtility.LINE_SEPARATOR + "ERRORS:\t" + errorCount + CldrUtility.LINE_SEPARATOR); - - System.exit(errorCount > 0 ? 1 : 0); - } - - static class RowData implements Comparable { - OfficialStatus os; - String name; - Long pop; - - public RowData(OfficialStatus os, String name, Long pop) { - this.os = os; - this.name = name; - this.pop = pop; - } - - public OfficialStatus getStatus() { - // TODO Auto-generated method stub - return os; - } - - public CharSequence getName() { - // TODO Auto-generated method stub - return name; - } - - public Long getLiteratePopulation() { - // TODO Auto-generated method stub - return pop; - } - - @Override - public int compareTo(RowData o) { - // TODO Auto-generated method stub - int result = os.compareTo(o.os); - if (result != 0) return -result; - long result2 = pop - o.pop; - if (result2 != 0) return result2 < 0 ? 
1 : -1; - return name.compareTo(o.name); - } - - @Override - public boolean equals(Object o) { - return 0 == compareTo((RowData) o); - } - - @Override - public int hashCode() { - throw new UnsupportedOperationException(); - } - } - - private static void printDefaultLanguagesAndScripts() { - - final int minTotalPopulation = 10000000; - final int minTerritoryPopulation = 1000000; - final double minTerritoryPercent = 1.0 / 3; - Map> languageToReason = new TreeMap<>(); - Counter languageToLiteratePopulation = new Counter<>(); - NumberFormat nf = NumberFormat.getIntegerInstance(ULocale.ENGLISH); - nf.setGroupingUsed(true); - LanguageTagParser ltp = new LanguageTagParser(); - LikelySubtags likelySubtags = new LikelySubtags(); - /* - * A. X is a qualified language**, and at least one of the following is true: - * - * 1. X is has official status* in any country - * 2. X exceeds a threshold population† of literate users worldwide: 1M - * 3. X exceeds a threshold population† in some country Z: 100K and 20% of Z's population†. - * - * B. X is an exception explicitly approved by the committee or X has minimal - * language coverage‡ in CLDR itself. - * C. 
The language is in the CLDR-target locales - */ - OfficialStatus minimalStatus = - OfficialStatus.official_regional; // OfficialStatus.de_facto_official; - Map languages = new TreeMap<>(); - for (String language : standardCodes.getAvailableCodes("language")) { - String path = CLDRFile.getKey(CLDRFile.LANGUAGE_NAME, language); - String result = english.getStringValue(path); - if (result != null) { - languages.put(language, result); - } - } - - if (SHOW_ALL_LANGUAGE_CODES) { - for (String language : languages.keySet()) { - System.out.println(language + "\t" + languages.get(language)); - } - } else { - System.out.println( - "- GenerateMaximalLocales.java: SHOW_ALL_LANGUAGE_CODES=true to show all language codes"); - } - - // also CLDR-target locales - final Set CLDRMainLanguages = - new TreeSet<>(StandardCodes.make().getLocaleCoverageLocales(Organization.cldr)); - - for (String territory : supplementalData.getTerritoriesWithPopulationData()) { - if (Iso3166Data.isRegionCodeNotForTranslation(territory)) { - System.out.println( - "Iso3166Data.isRegionCodeNotForTranslation(" - + territory - + ") true, skipping"); - continue; - } - PopulationData territoryPop = supplementalData.getPopulationDataForTerritory(territory); - double territoryPopulation = territoryPop.getLiteratePopulation(); - for (String languageScript : - supplementalData.getLanguagesForTerritoryWithPopulationData(territory)) { - PopulationData popData = - supplementalData.getLanguageAndTerritoryPopulationData( - languageScript, territory); - ltp.set(languageScript); - String language = ltp.getLanguage(); - // if (ltp.getScript().isEmpty()) { - // String max = likelySubtags.maximize(languageScript); - // if (max != null) { - // ltp.set(max).setRegion(""); - // languageScript = ltp.toString(); - // } - // } - boolean add = false; - // #1 - OfficialStatus status = popData.getOfficialStatus(); - if (status.compareTo(minimalStatus) >= 0) { - add = true; - } - long literatePopulation = 
getWritingPopulation(popData); - // #2 - languageToLiteratePopulation.add(language, literatePopulation); - // #3 - if (literatePopulation > minTerritoryPopulation - && literatePopulation > minTerritoryPercent * territoryPopulation) { - add = true; - } - if (add == false && CLDRMainLanguages.contains(language)) { - add = true; - } - if (add) { - add(languageToReason, language, territory, status, literatePopulation); - Set containers = Containment.leafToContainer(territory); - if (containers == null) { - throw new NullPointerException( - "Containment.leafToContainer(" + territory + ") is null"); - } - // Add the containing regions - for (String container : containers) { - add( - languageToReason, - language, - container, - OfficialStatus.unknown, - literatePopulation); - } - } - } - } - // #2, now that we have the data - for (String language : languageToLiteratePopulation.keySet()) { - long totalPop = languageToLiteratePopulation.getCount(language); - if (totalPop > minTotalPopulation) { - add(languageToReason, language, "001", OfficialStatus.unknown, totalPop); - } - } - - // Specials - add(languageToReason, LocaleNames.UND, "001", OfficialStatus.unknown, 0); - - // for (String language : Iso639Data.getAvailable()) { - // Scope scope = Iso639Data.getScope(language); - // Type type = Iso639Data.getType(language); - // if (scope == Scope.Special) { - // add(languageToReason, language, "001", OfficialStatus.unknown, -1); - // } - // } - // print them - - System.out.println("Detailed - Including:\t" + languageToReason.size()); - - if (!SHOW_DETAILED) { - System.out.println( - "- GenerateMaximalLocales.java: SHOW_DETAILED=true to show more details"); - } else { - for (String language : languageToReason.keySet()) { - Set reasons = languageToReason.get(language); - - RowData lastReason = reasons.iterator().next(); - - System.out - .append(language) - .append("\t") - .append(english.getName(language)) - .append("\t") - .append(lastReason.getStatus().toShortString()) - 
.append("\t") - .append(nf.format(languageToLiteratePopulation.getCount(language))); - for (RowData reason : reasons) { - String status = reason.getStatus().toShortString(); - System.out - .append("\t") - .append(status) - .append("-") - .append(reason.getName()) - .append("-") - .append(nf.format(reason.getLiteratePopulation())); - } - System.out.append("\n"); - } - } - - // now list them - - Set others = new TreeSet<>(); - others.addAll(standardCodes.getGoodAvailableCodes("language")); - others.removeAll(languageToReason.keySet()); - System.out.println("\nIncluded Languages:\t" + languageToReason.keySet().size()); - if (SHOW_INCLUDED_EXCLUDED) { - showLanguages(languageToReason.keySet(), languageToReason); - } - System.out.println("\nExcluded Languages:\t" + others.size()); - if (SHOW_INCLUDED_EXCLUDED) { - showLanguages(others, languageToReason); - } else { - System.out.println( - " - GenerateMaximalLocales.java: set SHOW_INCLUDED_EXCLUDED=true to show reason details"); - } - } - - private static long getWritingPopulation(PopulationData popData) { - final double writingPopulation = popData.getWritingPopulation(); - if (!Double.isNaN(writingPopulation)) { - return (long) writingPopulation; - } - return (long) popData.getLiteratePopulation(); - } - - private static void showLanguages( - Set others, Map> languageToReason) { - Set sorted = new TreeSet<>(Collator.getInstance(ULocale.ENGLISH)); - for (String language : others) { - sorted.add(getLanguageName(language, languageToReason)); - } - char last = 0; - for (String language : sorted) { - final char curr = language.charAt(0); - if (last != curr) { - System.out.println(); - } else if (last != '\u0000') { - System.out.print(", "); - } - System.out.print(language); - last = curr; - } - System.out.println(); - } - - private static String getLanguageName( - String language, Map> languageToReason) { - OfficialStatus best = OfficialStatus.unknown; - Set reasons = languageToReason.get(language); - if (reasons != null) { 
- for (RowData reason : reasons) { - final OfficialStatus currentStatus = reason.getStatus(); - if (best.compareTo(currentStatus) < 0) { - best = currentStatus; - } - } - } - String status = best.toShortString(); - Scope scope = Iso639Data.getScope(language); - if (scope == Scope.Special) { - status = "S"; - } - String languageFormatted = english.getName(language) + " [" + language + "]-" + status; - return languageFormatted; - } - - private static void add( - Map> languageToReason, - String language, - String territoryRaw, - OfficialStatus status, - long population) { - String territory = english.getName("territory", territoryRaw) + " [" + territoryRaw + "]"; - Set set = languageToReason.get(language); - if (set == null) { - languageToReason.put(language, set = new TreeSet<>()); - } - set.add(new RowData(status, territory, population)); - } - - /** In computing the defaultContents, no and nb require special handling. */ - static final Map SPECIAL_CHILD_TO_PARENT = - ImmutableMap.of("nb", "no", "nb_NO", "nb"); - - /* - * Compute the defaultContent values for supplemental data. - * It uses the maximization data and the simpleParent (truncation). - * We can't use the normal "getParent" because that messes up the logic - * used to handle inconsistencies in scripts in CLDR.
- * That is, there are three situations:<ul>
- * <li>all children have explicit scripts;</li>
- * <li>no children have scripts; and</li>
- * <li>some do and some don't</li></ul>
- */ - - private static void printDefaultContent(Map toMaximized) throws IOException { - - Set defaultLocaleContent = new TreeSet<>(); - - // go through all the cldr locales, and add default contents - // now computed from toMaximized - Set available = factory.getAvailable(); - Relation toSimpleChildren = - Relation.of(new TreeMap>(), TreeSet.class); - LanguageTagParser ltp = new LanguageTagParser(); - - // System.out.println(maximize("az_Latn_AZ", toMaximized)); - Set hasSimpleChildWithScript = new TreeSet<>(); - - // first get a mapping to children - for (String locale : available) { - if (locale.equals(LocaleNames.ROOT)) { - continue; - } - if (ltp.set(locale).getVariants().size() != 0) { - continue; - } - String parent = SPECIAL_CHILD_TO_PARENT.get(locale); - if (parent == null) { - parent = - LocaleIDParser.getSimpleParent( - locale); // we can't use the regular getParent (see above) - } - - if (ltp.getScript().length() != 0) { - hasSimpleChildWithScript.add(parent); - } - if (parent.equals(LocaleNames.ROOT)) { - continue; - } - toSimpleChildren.put(parent, locale); - } - - // Suppress script for locales for which we only have one locale in common/main. See ticket - // #7834. - Set suppressScriptLocales = - new HashSet<>( - Arrays.asList( - "bm_ML", "en_US", "ha_NG", "iu_CA", "ms_MY", "mn_MN", "byn_ER", - "ff_SN", "dyo_SN", "kk_KZ", "ku_TR", "ky_KG", "ml_IN", "so_SO", - "sw_TZ", "wo_SN", "yo_NG", "dje_NE", "blt_VN", "hi_IN", "nv_US", - "doi_IN")); - - // if any have a script, then throw out any that don't have a script (unless they're - // specifically included.) 
- Set toRemove = new TreeSet<>(); - for (String locale : hasSimpleChildWithScript) { - toRemove.clear(); - Set children = toSimpleChildren.getAll(locale); - for (String child : children) { - if (ltp.set(child).getScript().length() == 0 - && !suppressScriptLocales.contains(child)) { - toRemove.add(child); - } - } - if (toRemove.size() != 0) { - System.out.println( - "\tRemoving:\t" + locale + "\t" + toRemove + "\tfrom\t" + children); - toSimpleChildren.removeAll(locale, toRemove); - } - } - - // we add a child as a default locale if it has the same maximization - main: - for (String locale : toSimpleChildren.keySet()) { - String maximized = maximize(locale, toMaximized); - if (maximized == null) { - if (SHOW_ADD) System.out.println("Missing maximized:\t" + locale); - continue; - } - Set children = toSimpleChildren.getAll(locale); - Map debugStuff = new TreeMap<>(); - for (String child : children) { - String maximizedChild = maximize(child, toMaximized); - if (maximized.equals(maximizedChild)) { - defaultLocaleContent.add(child); - continue main; - } - debugStuff.put(child, maximizedChild); - } - if (SHOW_ADD) - System.out.println( - "Can't find maximized: " - + locale - + "=" - + maximized - + "\tin\t" - + debugStuff); - } - - for (String specialChild : SPECIAL_CHILD_TO_PARENT.keySet()) { - defaultLocaleContent.add(specialChild); - } - defaultLocaleContent.remove("und_ZZ"); // und_ZZ isn't ever a real locale. (old sandbox) - defaultLocaleContent.remove("mul_ZZ"); // mul_ZZ isn't ever a real locale. 
- - showDefaultContentDifferencesAndFix(defaultLocaleContent); - - final File genSuppDir = new File(CLDRPaths.GEN_DIRECTORY, "supplemental"); - final File genSuppMetadataFile = new File(genSuppDir, "supplementalMetadata.xml"); - final File oldSuppMetadataFile = - new File(CLDRPaths.SUPPLEMENTAL_DIRECTORY, "supplementalMetadata.xml"); - - try (PrintWriter genFile = FileUtilities.openUTF8Writer(genSuppMetadataFile); - BufferedReader oldFile = FileUtilities.openUTF8Reader(oldSuppMetadataFile); ) { - CldrUtility.copyUpTo( - oldFile, - PatternCache.get("\\s*"); - - // genFile.println(""); - CldrUtility.copyUpTo( - oldFile, - PatternCache.get("\\s*/>\\s*(" - + CldrUtility.LINE_SEPARATOR - + "" - + CldrUtility.LINE_SEPARATOR - + "" - + CldrUtility.LINE_SEPARATOR - + " " - + CldrUtility.LINE_SEPARATOR - + " "; - String footer = - OUTPUT_STYLE != OutputStyle.XML - ? SEPARATOR + "};" - : " " - + CldrUtility.LINE_SEPARATOR - + ""; - out.println(header); - boolean first = true; - Set keys = new TreeSet<>(new LocaleStringComparator()); - keys.addAll(fluffup.keySet()); - for (String printingLocale : keys) { - String printingTarget = fluffup.get(printingLocale); - String comment = - printingName(printingLocale, spacing) - + spacing - + "=>" - + spacing - + printingName(printingTarget, spacing); - - if (OUTPUT_STYLE == OutputStyle.XML) { - out.println( - "\t\t" - + CldrUtility.LINE_SEPARATOR - + "\t\t" - + ""); - } else { - if (first) { - first = false; - } else { - out.print(","); - } - if (comment.length() > 70 && SEPARATOR.equals(CldrUtility.LINE_SEPARATOR)) { - comment = - printingName(printingLocale, spacing) - + SEPARATOR - + " // " - + spacing - + "=>" - + spacing - + printingName(printingTarget, spacing); - } - out.print( - " {" - + SEPARATOR - + " // " - + comment - + SEPARATOR - + " \"" - + printingLocale - + "\"," - + SEPARATOR - + " \"" - + printingTarget - + "\"" - + CldrUtility.LINE_SEPARATOR - + " }"); - } - } - out.println(footer); - out.close(); - } - return 
genFile; - } - - public static String printingName(String locale, String spacing) { - if (locale == null) { - return null; - } - LanguageTagParser parser = new LanguageTagParser().set(locale); - String lang = parser.getLanguage(); - String script = parser.getScript(); - String region = parser.getRegion(); - return "{" - + spacing - + (lang.equals(LocaleNames.UND) - ? "?" - : english.getName(CLDRFile.LANGUAGE_NAME, lang)) - + ";" - + spacing - + (script == null || script.equals("") - ? "?" - : english.getName(CLDRFile.SCRIPT_NAME, script)) - + ";" - + spacing - + (region == null || region.equals("") - ? "?" - : english.getName(CLDRFile.TERRITORY_NAME, region)) - + spacing - + "}"; - } - - private static final String[][] ALT_REVERSAL = { - // { "no", "nb" }, - // { "nb", "no" }, - {"he", "iw"}, - {"iw", "he"}, - }; - - public static String toAlt(String locale, boolean change) { - if (!change || locale == null) { - return locale; - } - String firstTag = getFirstTag(locale); - for (String[] pair : ALT_REVERSAL) { - if (firstTag.equals(pair[0])) { - locale = pair[1] + locale.substring(pair[1].length()); - break; - } - } - locale = locale.replace("_", "-"); - return locale; - } - - private static String getFirstTag(String locale) { - int pos = locale.indexOf('_'); - return pos < 0 ? 
locale : locale.substring(0, pos); - } - - // private static Map getBackMapping(Map fluffup) { - // Relation backMap = new Relation(new TreeMap(), TreeSet.class, - // BEST_LANGUAGE_COMPARATOR); - // for (String source : fluffup.keySet()) { - // if (source.startsWith(LocaleNames.UND)) { - // continue; - // } - // String maximized = fluffup.get(source); - // backMap.put(maximized, source); // put in right order - // } - // Map returnBackMap = new TreeMap(); - // for (String maximized : backMap.keySet()) { - // final Set all = backMap.getAll(maximized); - // final String minimized = all.iterator().next(); - // returnBackMap.put(maximized, minimized); - // } - // return returnBackMap; - // } - - /** - * Language tags are presumed to share the first language, except possibly LocaleNames.UND. Best - * is least - */ - // private static Comparator BEST_LANGUAGE_COMPARATOR = new Comparator() { - // LanguageTagParser p1 = new LanguageTagParser(); - // LanguageTagParser p2 = new LanguageTagParser(); - // public int compare(String o1, String o2) { - // if (o1.equals(o2)) return 0; - // p1.set(o1); - // p2.set(o2); - // String lang1 = p1.getLanguage(); - // String lang2 = p2.getLanguage(); - // - // // compare languages first - // // put und at the end - // int result = lang1.compareTo(lang2); - // if (result != 0) { - // if (lang1.equals(LocaleNames.UND)) return 1; - // if (lang2.equals(LocaleNames.UND)) return -1; - // return result; - // } - // - // // now scripts and regions. - // // if they have different numbers of fields, the shorter wins. - // // If there are two fields, region is lowest. 
- // // The simplest way is to just compare scripts first - // // so zh-TW < zh-Hant, because we first compare "" to Hant - // String script1 = p1.getScript(); - // String script2 = p2.getScript(); - // int scriptOrder = script1.compareTo(script2); - // if (scriptOrder != 0) return scriptOrder; - // - // String region1 = p1.getRegion(); - // String region2 = p2.getRegion(); - // int regionOrder = region1.compareTo(region2); - // if (regionOrder != 0) return regionOrder; - // - // return o1.compareTo(o2); - // } - // - // }; - - public static void minimize(Map fluffup) { - LanguageTagParser parser = new LanguageTagParser(); - LanguageTagParser targetParser = new LanguageTagParser(); - Set removals = new TreeSet<>(); - while (true) { - removals.clear(); - for (String locale : fluffup.keySet()) { - String target = fluffup.get(locale); - if (targetParser.set(target).getRegion().equals(UNKNOWN_REGION)) { - removals.add(locale); - if (SHOW_ADD) - System.out.println( - "Removing:\t" - + getName(locale) - + "\t=>\t" - + getName(target) - + "\t\t - Unknown Region in target"); - continue; - } - if (targetParser.getScript().equals(UNKNOWN_SCRIPT)) { - removals.add(locale); - if (SHOW_ADD) - System.out.println( - "Removing:\t" - + getName(locale) - + "\t=>\t" - + getName(target) - + "\t\t - Unknown Script in target"); - continue; - } - - String region = parser.set(locale).getRegion(); - if (region.length() != 0) { - if (region.equals(UNKNOWN_REGION)) { - removals.add(locale); - if (SHOW_ADD) - System.out.println( - "Removing:\t" - + getName(locale) - + "\t=>\t" - + getName(target) - + "\t\t - Unknown Region in source"); - continue; - } - parser.setRegion(""); - String newLocale = parser.toString(); - String newTarget = fluffup.get(newLocale); - if (newTarget != null) { - newTarget = targetParser.set(newTarget).setRegion(region).toString(); - if (target.equals(newTarget) && !KEEP_TARGETS.contains(locale)) { - removals.add(locale); - if (SHOW_ADD) - System.out.println( - 
"Removing:\t" - + locale - + "\t=>\t" - + target - + "\t\tRedundant with " - + newLocale); - continue; - } - } - } - String script = parser.set(locale).getScript(); - if (locale.equals(DEBUG_ADD_KEY)) { - System.out.println("*debug*"); - } - if (script.length() != 0) { - if (script.equals(UNKNOWN_SCRIPT)) { - removals.add(locale); - if (SHOW_ADD) - System.out.println( - "Removing:\t" - + locale - + "\t=>\t" - + target - + "\t\t - Unknown Script"); - continue; - } - parser.setScript(""); - String newLocale = parser.toString(); - String newTarget = fluffup.get(newLocale); - if (newTarget != null) { - newTarget = targetParser.set(newTarget).setScript(script).toString(); - if (target.equals(newTarget) && !KEEP_TARGETS.contains(locale)) { - removals.add(locale); - if (SHOW_ADD) - System.out.println( - "Removing:\t" - + locale - + "\t=>\t" - + target - + "\t\tRedundant with " - + newLocale); - continue; - } - } - } - } - if (removals.size() == 0) { - break; - } - for (String locale : removals) { - fluffup.remove(locale); - } - } - } - - // private static void addLanguageScript(Map fluffup, LanguageTagParser parser) - // { - // // add script - // Map temp = new TreeMap(); - // while (true) { - // temp.clear(); - // for (String target : new TreeSet(fluffup.values())) { - // parser.set(target); - // final String territory = parser.getRegion(); - // if (territory.length() == 0) { - // continue; - // } - // parser.setRegion(""); - // String possibleSource = parser.toString(); - // if (fluffup.containsKey(possibleSource)) { - // continue; - // } - // String other = temp.get(possibleSource); - // if (other != null) { - // if (!target.equals(other)) { - // System.out.println("**Failure with multiple sources in addLanguageScript: " - // + possibleSource + "\t=>\t" + target + ", " + other); - // } - // continue; - // } - // temp.put(possibleSource, target); - // if (SHOW_ADD) System.out.println("Adding:\t" + possibleSource + "\t=>\t" + target + - // "\t\tLanguage-Script"); - // } 
- // if (temp.size() == 0) { - // break; - // } - // fluffup.putAll(temp); - // } - // - // } - - // private static void addLanguageCountry(Map fluffup, LanguageTagParser parser) - // { - // // add script - // Map temp = new TreeMap(); - // while (true) { - // temp.clear(); - // for (String target : new TreeSet(fluffup.values())) { - // parser.set(target); - // String script = parser.getScript(); - // if (script.length() == 0) { - // continue; - // } - // parser.setScript(""); - // String possibleSource = parser.toString(); - // if (fluffup.containsKey(possibleSource)) { - // continue; - // } - // String other = temp.get(possibleSource); - // - // if (other != null) { - // if (!target.equals(other)) { - // script = getScriptForLocale(possibleSource); - // if (script == null) { - // System.out.println("**Failure with multiple sources in addLanguageCountry: " - // + possibleSource + "\t=>\t" + target + ", " + other); - // continue; // error message in routine - // } - // parser.setScript(script); - // target = parser.toString(); - // } - // } - // - // temp.put(possibleSource, target); - // if (SHOW_ADD) System.out.println("Adding:\t" + possibleSource + "\t=>\t" + target + - // "\t\tLanguageCountry"); - // } - // if (temp.size() == 0) { - // break; - // } - // fluffup.putAll(temp); - // } - // - // } - - // private static void addScript(Map fluffup, LanguageTagParser parser) { - // // add script - // Map temp = new TreeMap(); - // while (true) { - // temp.clear(); - // Set skipTarget = fluffup.keySet(); - // for (String locale : fluffup.keySet()) { - // String target = fluffup.get(locale); - // parser.set(target); - // if (parser.getScript().length() != 0) { - // continue; - // } - // String script = getScriptForLocale(target); - // - // if (script == null) { - // continue; // error message in routine - // } - // parser.setScript(script); - // String furtherTarget = parser.toString(); - // addIfNotIn(target, furtherTarget, temp, fluffup, "Script"); - // } - // if 
(temp.size() == 0) { - // break; - // } - // fluffup.putAll(temp); - // } - // } - - // private static String getScriptForLocale(String locale) { - // String result = getScriptForLocale2(locale); - // if (result != null) return result; - // int pos = locale.indexOf('_'); - // if (pos >= 0) { - // result = getScriptForLocale2(locale.substring(0,pos)); - // } - // return result; - // } - - private static String UNKNOWN_SCRIPT = "Zzzz"; - private static String UNKNOWN_REGION = "ZZ"; - - private static String getScriptForLocale2(String locale) { - String result = localeToScriptCache.get(locale); - if (result != null) { - return result; - } - if (locale.equals("ky")) { - int debug = 0; - } - try { - Map data = supplementalData.getBasicLanguageDataMap(locale); - if (data != null) { - for (BasicLanguageData datum : data.values()) { - final Set scripts = datum.getScripts(); - boolean isPrimary = datum.getType() == BasicLanguageData.Type.primary; - if (scripts.size() != 1) { - if (scripts.size() > 1 && isPrimary) { - break; - } - continue; - } - String script = scripts.iterator().next(); - if (isPrimary) { - return result = script; - } else if (result == null) { - result = script; - } - } - if (result != null) { - return result; - } - } - CLDRFile cldrFile; - try { - cldrFile = factory.make(locale, true); - } catch (RuntimeException e) { - result = FALLBACK_SCRIPTS.get(locale); - if (result == null) { - System.err.println( - "***Failed to find script in L-S-R or MAX_ADDITIONS for: " - + locale - + "\t" - + english.getName(locale)); - return result = UNKNOWN_SCRIPT; - } else { - return result; - } - } - UnicodeSet exemplars = getExemplarSet(cldrFile, ""); - Set CLDRScripts = getScriptsFromUnicodeSet(exemplars); - CLDRScripts.remove(UNKNOWN_SCRIPT); - if (CLDRScripts.size() == 1) { - return result = CLDRScripts.iterator().next(); - } else if (CLDRScripts.size() == 0) { - System.out.println("**Failed to get script for:\t" + locale); - return result = UNKNOWN_SCRIPT; - } else { 
- System.out.println( - "**Failed, too many scripts for:\t" + locale + ", " + CLDRScripts); - return result = UNKNOWN_SCRIPT; - } - } finally { - if (result.equals(UNKNOWN_SCRIPT)) { - String temp = LANGUAGE_OVERRIDES.get(locale); - if (temp != null) { - result = new LanguageTagParser().set(temp).getScript(); - System.err.println( - "***Warning, Getting script from LANGUAGE_OVERRIDES for " - + locale - + " => " - + result); - } - } - localeToScriptCache.put(locale, result); - if (SHOW_ADD) - System.out.println( - "Script:\t" - + locale - + "\t" - + english.getName(locale) - + "\t=>\t" - + result - + "\t" - + english.getName(CLDRFile.SCRIPT_NAME, result)); - } - } - - // private static Map closeMapping(Map fluffup) { - // if (SHOW_ADD) System.out.flush(); - // Map temp = new TreeMap(); - // while (true) { - // temp.clear(); - // for (String locale : fluffup.keySet()) { - // String target = fluffup.get(locale); - // if (target.equals("si_Sinh") || target.equals("zh-Hani")) { - // System.out.println("????"); - // } - // String furtherTarget = fluffup.get(target); - // if (furtherTarget == null) { - // continue; - // } - // addIfNotIn(locale, furtherTarget, temp, null, "Close"); - // } - // if (temp.size() == 0) { - // break; - // } - // fluffup.putAll(temp); - // } - // if (SHOW_ADD) System.out.flush(); - // return temp; - // } - - public static Set getScriptsFromUnicodeSet(UnicodeSet exemplars) { - // use bits first, since that's faster - BitSet scriptBits = new BitSet(); - boolean show = false; - for (UnicodeSetIterator it = new UnicodeSetIterator(exemplars); it.next(); ) { - if (show) System.out.println(Integer.toHexString(it.codepoint)); - if (it.codepoint != UnicodeSetIterator.IS_STRING) { - scriptBits.set(UScript.getScript(it.codepoint)); - } else { - int cp; - for (int i = 0; i < it.string.length(); i += UTF16.getCharCount(cp)) { - scriptBits.set(UScript.getScript(cp = UTF16.charAt(it.string, i))); - } - } - } - scriptBits.clear(UScript.COMMON); - 
scriptBits.clear(UScript.INHERITED); - Set scripts = new TreeSet<>(); - for (int j = 0; j < scriptBits.size(); ++j) { - if (scriptBits.get(j)) { - scripts.add(UScript.getShortName(j)); - } - } - return scripts; - } - - public static UnicodeSet getExemplarSet(CLDRFile cldrfile, String type) { - if (type.length() != 0) type = "[@type=\"" + type + "\"]"; - String v = cldrfile.getStringValue("//ldml/characters/exemplarCharacters" + type); - if (v == null) return new UnicodeSet(); - return new UnicodeSet(v); - } - - // private static String[][] SpecialCases = { - // { "zh_Hani", "zh_Hans_CN"}, - // { "si_Sinh", "si_Sinh_LK"}, - // { "ii", "ii_CN"}, // Sichuan Yi (Yi) - // { "iu", "iu_CA"}, // Inuktitut (Unified Canadian Aboriginal Syllabics) - // { LocaleNames.UND, "en"}, // English default - // }; - - static void showDefaultContentDifferencesAndFix(Set defaultLocaleContent) { - Set errors = new LinkedHashSet<>(); - Map oldDefaultContent = - SupplementalDataInfo.makeLocaleToDefaultContents( - ConvertLanguageData.supplementalData.getDefaultContentLocales(), - new TreeMap(), - errors); - if (!errors.isEmpty()) { - System.out.println(Joiner.on("\n").join(errors)); - errors.clear(); - } - Map newDefaultContent = - SupplementalDataInfo.makeLocaleToDefaultContents( - defaultLocaleContent, new TreeMap(), errors); - if (!errors.isEmpty()) { - System.out.println("Default Content errors: " + Joiner.on("\n").join(errors)); - errors.clear(); - } - Set changes = - compareMapsAndFixNew( - "*WARNING* Default Content: ", - oldDefaultContent, - newDefaultContent, - "ar", - "ar_001"); - System.out.println(Joiner.on("\n").join(changes)); - defaultLocaleContent.clear(); - defaultLocaleContent.addAll(newDefaultContent.values()); - newDefaultContent = - SupplementalDataInfo.makeLocaleToDefaultContents( - defaultLocaleContent, new TreeMap(), errors); - if (!errors.isEmpty()) { - System.out.println("***New Errors: " + Joiner.on("\n").join(errors)); - } - } - - private static Set 
compareMapsAndFixNew( - String title, - Map oldContent, - Map newContent, - String... allowedOverrideValues) { - Map allowedOverrideValuesTest = new HashMap<>(); - for (int i = 0; i < allowedOverrideValues.length; i += 2) { - allowedOverrideValuesTest.put(allowedOverrideValues[i], allowedOverrideValues[i + 1]); - } - Set changes = new TreeSet<>(); - for (String parent : - Builder.with(new TreeSet()) - .addAll(newContent.keySet()) - .addAll(oldContent.keySet()) - .get()) { - String oldValue = oldContent.get(parent); - String newValue = newContent.get(parent); - String overrideValue = allowedOverrideValuesTest.get(parent); - if (overrideValue != null) { - newContent.put(parent, overrideValue); - newValue = overrideValue; - } - if (CldrUtility.equals(oldValue, newValue)) { - continue; - } - String message; - if (oldValue == null) { - message = - "Adding " - + ConvertLanguageData.getLanguageCodeAndName(parent) - + " => " - + ConvertLanguageData.getLanguageCodeAndName(newValue); - newContent.put(parent, newValue); - } else if (newValue == null) { - if (SUPPRESS_CHANGES) { - message = - "Suppressing removal of " - + ConvertLanguageData.getLanguageCodeAndName(parent) - + " => " - + ConvertLanguageData.getLanguageCodeAndName(oldValue); - newContent.put(parent, oldValue); - } else { - message = - "Removing " - + ConvertLanguageData.getLanguageCodeAndName(parent) - + " => " - + ConvertLanguageData.getLanguageCodeAndName(oldValue); - newContent.remove(oldValue); - } - } else { - if (SUPPRESS_CHANGES) { - message = - "Suppressing change of " - + ConvertLanguageData.getLanguageCodeAndName(parent) - + " => " - + ConvertLanguageData.getLanguageCodeAndName(oldValue) - + " to " - + ConvertLanguageData.getLanguageCodeAndName(newValue); - newContent.remove(newValue); - newContent.put(parent, oldValue); - } else { - message = - "Changing " - + ConvertLanguageData.getLanguageCodeAndName(parent) - + " => " - + ConvertLanguageData.getLanguageCodeAndName(oldValue) - + " to " - + 
ConvertLanguageData.getLanguageCodeAndName(newValue); - newContent.remove(oldValue); - newContent.put(parent, newValue); - } - } - changes.add(title + message); - } - return changes; - } - - public static class LocaleStringComparator implements Comparator { - LanguageTagParser ltp0 = new LanguageTagParser(); - LanguageTagParser ltp1 = new LanguageTagParser(); - - @Override - public int compare(String arg0, String arg1) { - ltp0.set(arg0); - ltp1.set(arg1); - String s0 = ltp0.getLanguage(); - String s1 = ltp1.getLanguage(); - int result = s0.compareTo(s1); - if (result != 0) { - return s0.equals(LocaleNames.UND) ? 1 : s1.equals(LocaleNames.UND) ? -1 : result; - } - s0 = ltp0.getScript(); - s1 = ltp1.getScript(); - result = s0.compareTo(s1); - if (result != 0) { - return result; - } - s0 = ltp0.getRegion(); - s1 = ltp1.getRegion(); - result = s0.compareTo(s1); - if (result != 0) { - return result; - } - return arg0.compareTo(arg1); // just in case - } - } -} diff --git a/tools/cldr-code/src/main/java/org/unicode/cldr/tool/GenerateProductionData.java b/tools/cldr-code/src/main/java/org/unicode/cldr/tool/GenerateProductionData.java index 786515b6069..83e03d9e2e2 100644 --- a/tools/cldr-code/src/main/java/org/unicode/cldr/tool/GenerateProductionData.java +++ b/tools/cldr-code/src/main/java/org/unicode/cldr/tool/GenerateProductionData.java @@ -7,12 +7,12 @@ import com.google.common.collect.Multimap; import com.google.common.collect.Sets; import com.google.common.io.Files; +import com.ibm.icu.util.ICUUncheckedIOException; import com.ibm.icu.util.Output; import java.io.File; import java.io.FileNotFoundException; import java.io.IOException; import java.io.PrintWriter; -import java.io.UncheckedIOException; import java.util.Arrays; import java.util.Collection; import java.util.HashSet; @@ -530,7 +530,7 @@ private static boolean copyFilesAndReturnIsEmpty( stats.retained += toRetain.size(); stats.remaining += count; } catch (FileNotFoundException e) { - throw new 
UncheckedIOException( + throw new ICUUncheckedIOException( "Can't copy " + sourceFile + " to " + destinationFile + " — ", e); } return !gotOne; diff --git a/tools/cldr-code/src/main/java/org/unicode/cldr/tool/GenerateUnitTestData.java b/tools/cldr-code/src/main/java/org/unicode/cldr/tool/GenerateUnitTestData.java index faafdc93e88..1db08be57da 100644 --- a/tools/cldr-code/src/main/java/org/unicode/cldr/tool/GenerateUnitTestData.java +++ b/tools/cldr-code/src/main/java/org/unicode/cldr/tool/GenerateUnitTestData.java @@ -6,10 +6,10 @@ import com.google.common.collect.ImmutableSet; import com.google.common.collect.Multimap; import com.google.common.collect.TreeMultimap; +import com.ibm.icu.util.ICUUncheckedIOException; import com.ibm.icu.util.Output; import com.ibm.icu.util.ULocale; import java.io.IOException; -import java.io.UncheckedIOException; import java.math.BigInteger; import java.math.MathContext; import java.nio.file.Files; @@ -205,7 +205,7 @@ public void generateUnitLocalePreferences() { formatLocaleLine( "byte-per-millisecond", Rational.of(123), "default", "en", "", seen); } catch (IOException e) { - throw new UncheckedIOException(e); + throw new ICUUncheckedIOException(e); } } } diff --git a/tools/cldr-code/src/main/java/org/unicode/cldr/tool/ShowPlurals.java b/tools/cldr-code/src/main/java/org/unicode/cldr/tool/ShowPlurals.java index ea529cc3aca..51403c1608d 100644 --- a/tools/cldr-code/src/main/java/org/unicode/cldr/tool/ShowPlurals.java +++ b/tools/cldr-code/src/main/java/org/unicode/cldr/tool/ShowPlurals.java @@ -14,7 +14,6 @@ import java.io.IOException; import java.io.PrintWriter; import java.io.StringWriter; -import java.io.UncheckedIOException; import java.util.LinkedHashSet; import java.util.List; import java.util.Set; @@ -53,7 +52,7 @@ public static void main(String[] args) { try { new ShowPlurals().printPlurals(english, null, pw, cldrFactory); } catch (IOException e) { - throw new UncheckedIOException(e); + throw new ICUUncheckedIOException(e); } 
} diff --git a/tools/cldr-code/src/main/java/org/unicode/cldr/tool/VerifyConverterResults.java b/tools/cldr-code/src/main/java/org/unicode/cldr/tool/VerifyConverterResults.java index b1304d0a4c9..2782cc05a0c 100644 --- a/tools/cldr-code/src/main/java/org/unicode/cldr/tool/VerifyConverterResults.java +++ b/tools/cldr-code/src/main/java/org/unicode/cldr/tool/VerifyConverterResults.java @@ -5,10 +5,10 @@ import com.google.common.collect.TreeMultimap; import com.google.gson.JsonElement; import com.google.gson.JsonStreamParser; +import com.ibm.icu.util.ICUUncheckedIOException; import java.io.File; import java.io.IOException; import java.io.Reader; -import java.io.UncheckedIOException; import java.nio.charset.Charset; import java.util.ArrayList; import java.util.Map.Entry; @@ -277,7 +277,7 @@ private static void processJson(File target, Set accummulatedValues) { JsonStreamParser gsonParser = new JsonStreamParser(reader); gsonParser.forEachRemaining((JsonElement x) -> process(x, accummulatedValues)); } catch (IOException e) { - throw new UncheckedIOException(e); + throw new ICUUncheckedIOException(e); } } diff --git a/tools/cldr-code/src/main/java/org/unicode/cldr/util/CLDRTransforms.java b/tools/cldr-code/src/main/java/org/unicode/cldr/util/CLDRTransforms.java index 92cef19658c..2bcee0f7dd9 100644 --- a/tools/cldr-code/src/main/java/org/unicode/cldr/util/CLDRTransforms.java +++ b/tools/cldr-code/src/main/java/org/unicode/cldr/util/CLDRTransforms.java @@ -15,7 +15,6 @@ import com.ibm.icu.util.ICUUncheckedIOException; import java.io.File; import java.io.IOException; -import java.io.UncheckedIOException; import java.io.Writer; import java.util.Arrays; import java.util.Collection; @@ -927,7 +926,7 @@ public static T showTransliterator( } } } catch (IOException e) { - throw new UncheckedIOException(e); + throw new ICUUncheckedIOException(e); } return output; } diff --git a/tools/cldr-code/src/main/java/org/unicode/cldr/util/FileReaders.java 
b/tools/cldr-code/src/main/java/org/unicode/cldr/util/FileReaders.java index e60d3cc2e6a..bfe64fab2e9 100644 --- a/tools/cldr-code/src/main/java/org/unicode/cldr/util/FileReaders.java +++ b/tools/cldr-code/src/main/java/org/unicode/cldr/util/FileReaders.java @@ -1,5 +1,6 @@ package org.unicode.cldr.util; +import com.ibm.icu.util.ICUUncheckedIOException; import java.io.BufferedReader; import java.io.File; import java.io.FileInputStream; @@ -7,7 +8,6 @@ import java.io.IOException; import java.io.InputStream; import java.io.InputStreamReader; -import java.io.UncheckedIOException; import java.net.URL; import java.nio.charset.Charset; import java.nio.charset.StandardCharsets; @@ -105,7 +105,7 @@ public String next() { try { return bufferedReader.readLine(); } catch (IOException e) { - throw new UncheckedIOException(e); + throw new ICUUncheckedIOException(e); } } } diff --git a/tools/cldr-code/src/main/java/org/unicode/cldr/util/LocalePathValueListMatcher.java b/tools/cldr-code/src/main/java/org/unicode/cldr/util/LocalePathValueListMatcher.java index 4fc22cc4fdd..80f9dd4e1e6 100644 --- a/tools/cldr-code/src/main/java/org/unicode/cldr/util/LocalePathValueListMatcher.java +++ b/tools/cldr-code/src/main/java/org/unicode/cldr/util/LocalePathValueListMatcher.java @@ -2,8 +2,8 @@ import com.google.common.base.Splitter; import com.google.common.collect.ImmutableList; +import com.ibm.icu.util.ICUUncheckedIOException; import java.io.IOException; -import java.io.UncheckedIOException; import java.nio.file.Files; import java.nio.file.Path; import java.util.ArrayList; @@ -70,7 +70,7 @@ public static LocalePathValueListMatcher load(Path path) { try { return load(Files.lines(path)); } catch (IOException ex) { - throw new UncheckedIOException(ex); + throw new ICUUncheckedIOException(ex); } } diff --git a/tools/cldr-code/src/test/java/org/unicode/cldr/unittest/TestDataTest.java b/tools/cldr-code/src/test/java/org/unicode/cldr/unittest/TestDataTest.java index c339eaa262f..3d51ece92c4 
100644 --- a/tools/cldr-code/src/test/java/org/unicode/cldr/unittest/TestDataTest.java +++ b/tools/cldr-code/src/test/java/org/unicode/cldr/unittest/TestDataTest.java @@ -2,8 +2,8 @@ import com.google.common.base.Joiner; import com.google.common.base.Splitter; +import com.ibm.icu.util.ICUUncheckedIOException; import java.io.IOException; -import java.io.UncheckedIOException; import java.nio.file.DirectoryStream; import java.nio.file.Files; import java.nio.file.Path; @@ -52,7 +52,7 @@ public void testPersonNameTests() { try (DirectoryStream filepath = Files.newDirectoryStream(PERSON_NAMES_DIR)) { filepath.forEach(x -> checkPersonNameTests(x)); } catch (IOException e) { - throw new UncheckedIOException(e); + throw new ICUUncheckedIOException(e); } FILES_CHECKED.add(PERSON_NAMES_DIR); } @@ -217,7 +217,7 @@ void checkLine(String line) { try (DirectoryStream filepath = Files.newDirectoryStream(Paths.get(TEST_DATA_DIR))) { filepath.forEach(x -> checkDirectories(x, missing)); } catch (IOException e) { - throw new UncheckedIOException(e); + throw new ICUUncheckedIOException(e); } // TODO once we get full tests, turn this on // if (!assertEquals("Files all tested", 0, missing.size())) { @@ -241,7 +241,7 @@ private void checkDirectories(Path filepath, Set missing) { try (DirectoryStream filepath2 = Files.newDirectoryStream(filepath)) { filepath2.forEach(x -> checkDirectories(x, missing)); } catch (IOException e) { - throw new UncheckedIOException(e); + throw new ICUUncheckedIOException(e); } } } else { diff --git a/tools/cldr-code/src/test/java/org/unicode/cldr/unittest/TestInheritance.java b/tools/cldr-code/src/test/java/org/unicode/cldr/unittest/TestInheritance.java index 5511341819f..fab3b9299a6 100644 --- a/tools/cldr-code/src/test/java/org/unicode/cldr/unittest/TestInheritance.java +++ b/tools/cldr-code/src/test/java/org/unicode/cldr/unittest/TestInheritance.java @@ -21,7 +21,6 @@ import java.util.regex.Matcher; import org.unicode.cldr.draft.ScriptMetadata; import 
org.unicode.cldr.draft.ScriptMetadata.Info; -import org.unicode.cldr.tool.GenerateMaximalLocales; import org.unicode.cldr.tool.LikelySubtags; import org.unicode.cldr.util.Builder; import org.unicode.cldr.util.CLDRConfig; @@ -886,50 +885,6 @@ public void TestCldrFileConsistency() { Matcher aliasMatcher = PatternCache.get("//ldml.*/alias.*").matcher(""); - private String minimize(Map likelySubtags, String locale) { - String result = GenerateMaximalLocales.minimize(locale, likelySubtags, false); - if (result == null) { - LanguageTagParser ltp3 = new LanguageTagParser().set(locale); - List variants = ltp3.getVariants(); - Map extensions = ltp3.getExtensions(); - Set emptySet = Collections.emptySet(); - ltp3.setVariants(emptySet); - Map emptyMap = Collections.emptyMap(); - ltp3.setExtensions(emptyMap); - String newLocale = ltp3.toString(); - result = GenerateMaximalLocales.minimize(newLocale, likelySubtags, false); - if (result != null) { - ltp3.set(result); - ltp3.setVariants(variants); - ltp3.setExtensions(extensions); - result = ltp3.toString(); - } - } - return result; - } - - private String maximize(Map likelySubtags, String locale) { - String result = GenerateMaximalLocales.maximize(locale, likelySubtags); - if (result == null) { - LanguageTagParser ltp3 = new LanguageTagParser().set(locale); - List variants = ltp3.getVariants(); - Map extensions = ltp3.getExtensions(); - Set emptySet = Collections.emptySet(); - ltp3.setVariants(emptySet); - Map emptyMap = Collections.emptyMap(); - ltp3.setExtensions(emptyMap); - String newLocale = ltp3.toString(); - result = GenerateMaximalLocales.maximize(newLocale, likelySubtags); - if (result != null) { - ltp3.set(result); - ltp3.setVariants(variants); - ltp3.setExtensions(extensions); - result = ltp3.toString(); - } - } - return result; - } - // TODO move this into central utilities public static boolean equals(CharSequence string, int codePoint) { if (string == null) { diff --git 
a/tools/cldr-code/src/test/java/org/unicode/cldr/unittest/TestUnits.java b/tools/cldr-code/src/test/java/org/unicode/cldr/unittest/TestUnits.java index ea84b5dd502..797de6f0b03 100644 --- a/tools/cldr-code/src/test/java/org/unicode/cldr/unittest/TestUnits.java +++ b/tools/cldr-code/src/test/java/org/unicode/cldr/unittest/TestUnits.java @@ -40,7 +40,6 @@ import java.io.IOException; import java.io.OutputStreamWriter; import java.io.PrintWriter; -import java.io.UncheckedIOException; import java.math.BigDecimal; import java.math.BigInteger; import java.math.MathContext; @@ -4173,7 +4172,7 @@ public void testUnitPreferencesTest() { warnln("Mixed unit identifiers not yet checked, count=" + warnings.size()); } } catch (IOException e) { - throw new UncheckedIOException(e); + throw new ICUUncheckedIOException(e); } } @@ -4247,7 +4246,7 @@ public void testUnitsTest() { Files.lines(Path.of(CLDRPaths.TEST_DATA + "units/unitsTest.txt")) .forEach(line -> checkUnitsTest(line)); } catch (IOException e) { - throw new UncheckedIOException(e); + throw new ICUUncheckedIOException(e); } } @@ -4292,7 +4291,7 @@ public void testUnitLocalePreferencesTest() { Files.lines(Path.of(CLDRPaths.TEST_DATA + "units/unitLocalePreferencesTest.txt")) .forEach(line -> checkUnitLocalePreferencesTest(line)); } catch (IOException e) { - throw new UncheckedIOException(e); + throw new ICUUncheckedIOException(e); } } @@ -4373,7 +4372,7 @@ public void testUnitLocalePreferencesTestIcu() { Files.lines(Path.of(CLDRPaths.TEST_DATA + "units/unitLocalePreferencesTest.txt")) .forEach(line -> checkUnitLocalePreferencesTestIcu(line)); } catch (IOException e) { - throw new UncheckedIOException(e); + throw new ICUUncheckedIOException(e); } } else { warnln("Skipping ICU test. 
To enable, set -DTestUnits:TEST_ICU"); diff --git a/tools/cldr-code/src/test/java/org/unicode/cldr/util/TestPathHeader.java b/tools/cldr-code/src/test/java/org/unicode/cldr/util/TestPathHeader.java index d5acf9df356..02b030350df 100644 --- a/tools/cldr-code/src/test/java/org/unicode/cldr/util/TestPathHeader.java +++ b/tools/cldr-code/src/test/java/org/unicode/cldr/util/TestPathHeader.java @@ -1,9 +1,10 @@ +package org.unicode.cldr.util; + import static org.junit.jupiter.api.Assertions.assertEquals; import static org.junit.jupiter.api.Assertions.assertNotEquals; import static org.junit.jupiter.api.Assertions.assertTrue; import org.junit.jupiter.api.Test; -import org.unicode.cldr.util.*; public class TestPathHeader { static final String GREGORIAN = "gregorian";