From ba66696c3e1cd43b444dc81360339cc77040a3d3 Mon Sep 17 00:00:00 2001 From: Kim Davies Date: Mon, 16 Sep 2024 13:20:27 -0700 Subject: [PATCH] WIP: Pre-compute UTS46 tests as part of make-libdata Rather than generate them dynamically, pre-compute them just like we do for the UTS46 data itself. --- tools/idna-data | 670 +++++++++++++++++++++++++++--------------------- 1 file changed, 384 insertions(+), 286 deletions(-) diff --git a/tools/idna-data b/tools/idna-data index 5c44ec1..5e4a057 100755 --- a/tools/idna-data +++ b/tools/idna-data @@ -4,83 +4,78 @@ import argparse, collections, datetime, os, re, sys, unicodedata from urllib.request import urlopen # Use intranges.intranges_from_list() from the sibling idna directory -sys.path.append( - os.path.join( - os.path.dirname(os.path.dirname(os.path.abspath(__file__))), - "idna" - ) -) +sys.path.append(os.path.join(os.path.dirname(os.path.dirname(os.path.abspath(__file__))), "idna")) from intranges import intranges_from_list if sys.version_info[0] < 3: print("Only Python 3 supported.") sys.exit(2) -PREFERRED_VERSION = '16.0.0' -UCD_URL = 'http://www.unicode.org/Public/{version}/ucd/{filename}' -UTS46_URL = 'http://www.unicode.org/Public/idna/{version}/{filename}' +PREFERRED_VERSION = "16.0.0" +UCD_URL = "http://www.unicode.org/Public/{version}/ucd/{filename}" +UTS46_URL = "http://www.unicode.org/Public/idna/{version}/{filename}" -DEFAULT_CACHE_DIR = '~/.cache/unidata' +DEFAULT_CACHE_DIR = "~/.cache/unidata" # Scripts affected by IDNA contextual rules -SCRIPT_WHITELIST = sorted(['Greek', 'Han', 'Hebrew', 'Hiragana', 'Katakana']) +SCRIPT_WHITELIST = sorted(["Greek", "Han", "Hebrew", "Hiragana", "Katakana"]) # Used to piece apart UTS#46 data for Jython compatibility UTS46_SEGMENT_SIZE = 100 UTS46_STATUSES = { - 'valid': ('V', False), - 'ignored': ('I', False), - 'mapped': ('M', True), - 'deviation': ('D', True), - 'disallowed': ('X', False), - 'disallowed_STD3_valid': ('3', False), - 'disallowed_STD3_mapped': ('3', True) 
+ "valid": ("V", False), + "ignored": ("I", False), + "mapped": ("M", True), + "deviation": ("D", True), + "disallowed": ("X", False), + "disallowed_STD3_valid": ("3", False), + "disallowed_STD3_mapped": ("3", True), } # Exceptions are manually assigned in Section 2.6 of RFC 5892. exceptions = { - 0x00DF: 'PVALID', # LATIN SMALL LETTER SHARP S - 0x03C2: 'PVALID', # GREEK SMALL LETTER FINAL SIGMA - 0x06FD: 'PVALID', # ARABIC SIGN SINDHI AMPERSAND - 0x06FE: 'PVALID', # ARABIC SIGN SINDHI POSTPOSITION MEN - 0x0F0B: 'PVALID', # TIBETAN MARK INTERSYLLABIC TSHEG - 0x3007: 'PVALID', # IDEOGRAPHIC NUMBER ZERO - 0x00B7: 'CONTEXTO', # MIDDLE DOT - 0x0375: 'CONTEXTO', # GREEK LOWER NUMERAL SIGN (KERAIA) - 0x05F3: 'CONTEXTO', # HEBREW PUNCTUATION GERESH - 0x05F4: 'CONTEXTO', # HEBREW PUNCTUATION GERSHAYIM - 0x30FB: 'CONTEXTO', # KATAKANA MIDDLE DOT - 0x0660: 'CONTEXTO', # ARABIC-INDIC DIGIT ZERO - 0x0661: 'CONTEXTO', # ARABIC-INDIC DIGIT ONE - 0x0662: 'CONTEXTO', # ARABIC-INDIC DIGIT TWO - 0x0663: 'CONTEXTO', # ARABIC-INDIC DIGIT THREE - 0x0664: 'CONTEXTO', # ARABIC-INDIC DIGIT FOUR - 0x0665: 'CONTEXTO', # ARABIC-INDIC DIGIT FIVE - 0x0666: 'CONTEXTO', # ARABIC-INDIC DIGIT SIX - 0x0667: 'CONTEXTO', # ARABIC-INDIC DIGIT SEVEN - 0x0668: 'CONTEXTO', # ARABIC-INDIC DIGIT EIGHT - 0x0669: 'CONTEXTO', # ARABIC-INDIC DIGIT NINE - 0x06F0: 'CONTEXTO', # EXTENDED ARABIC-INDIC DIGIT ZERO - 0x06F1: 'CONTEXTO', # EXTENDED ARABIC-INDIC DIGIT ONE - 0x06F2: 'CONTEXTO', # EXTENDED ARABIC-INDIC DIGIT TWO - 0x06F3: 'CONTEXTO', # EXTENDED ARABIC-INDIC DIGIT THREE - 0x06F4: 'CONTEXTO', # EXTENDED ARABIC-INDIC DIGIT FOUR - 0x06F5: 'CONTEXTO', # EXTENDED ARABIC-INDIC DIGIT FIVE - 0x06F6: 'CONTEXTO', # EXTENDED ARABIC-INDIC DIGIT SIX - 0x06F7: 'CONTEXTO', # EXTENDED ARABIC-INDIC DIGIT SEVEN - 0x06F8: 'CONTEXTO', # EXTENDED ARABIC-INDIC DIGIT EIGHT - 0x06F9: 'CONTEXTO', # EXTENDED ARABIC-INDIC DIGIT NINE - 0x0640: 'DISALLOWED', # ARABIC TATWEEL - 0x07FA: 'DISALLOWED', # NKO LAJANYALAN - 0x302E: 
'DISALLOWED', # HANGUL SINGLE DOT TONE MARK - 0x302F: 'DISALLOWED', # HANGUL DOUBLE DOT TONE MARK - 0x3031: 'DISALLOWED', # VERTICAL KANA REPEAT MARK - 0x3032: 'DISALLOWED', # VERTICAL KANA REPEAT WITH VOICED SOUND MARK - 0x3033: 'DISALLOWED', # VERTICAL KANA REPEAT MARK UPPER HALF - 0x3034: 'DISALLOWED', # VERTICAL KANA REPEAT WITH VOICED SOUND MARK UPPER HA - 0x3035: 'DISALLOWED', # VERTICAL KANA REPEAT MARK LOWER HALF - 0x303B: 'DISALLOWED', # VERTICAL IDEOGRAPHIC ITERATION MARK + 0x00DF: "PVALID", # LATIN SMALL LETTER SHARP S + 0x03C2: "PVALID", # GREEK SMALL LETTER FINAL SIGMA + 0x06FD: "PVALID", # ARABIC SIGN SINDHI AMPERSAND + 0x06FE: "PVALID", # ARABIC SIGN SINDHI POSTPOSITION MEN + 0x0F0B: "PVALID", # TIBETAN MARK INTERSYLLABIC TSHEG + 0x3007: "PVALID", # IDEOGRAPHIC NUMBER ZERO + 0x00B7: "CONTEXTO", # MIDDLE DOT + 0x0375: "CONTEXTO", # GREEK LOWER NUMERAL SIGN (KERAIA) + 0x05F3: "CONTEXTO", # HEBREW PUNCTUATION GERESH + 0x05F4: "CONTEXTO", # HEBREW PUNCTUATION GERSHAYIM + 0x30FB: "CONTEXTO", # KATAKANA MIDDLE DOT + 0x0660: "CONTEXTO", # ARABIC-INDIC DIGIT ZERO + 0x0661: "CONTEXTO", # ARABIC-INDIC DIGIT ONE + 0x0662: "CONTEXTO", # ARABIC-INDIC DIGIT TWO + 0x0663: "CONTEXTO", # ARABIC-INDIC DIGIT THREE + 0x0664: "CONTEXTO", # ARABIC-INDIC DIGIT FOUR + 0x0665: "CONTEXTO", # ARABIC-INDIC DIGIT FIVE + 0x0666: "CONTEXTO", # ARABIC-INDIC DIGIT SIX + 0x0667: "CONTEXTO", # ARABIC-INDIC DIGIT SEVEN + 0x0668: "CONTEXTO", # ARABIC-INDIC DIGIT EIGHT + 0x0669: "CONTEXTO", # ARABIC-INDIC DIGIT NINE + 0x06F0: "CONTEXTO", # EXTENDED ARABIC-INDIC DIGIT ZERO + 0x06F1: "CONTEXTO", # EXTENDED ARABIC-INDIC DIGIT ONE + 0x06F2: "CONTEXTO", # EXTENDED ARABIC-INDIC DIGIT TWO + 0x06F3: "CONTEXTO", # EXTENDED ARABIC-INDIC DIGIT THREE + 0x06F4: "CONTEXTO", # EXTENDED ARABIC-INDIC DIGIT FOUR + 0x06F5: "CONTEXTO", # EXTENDED ARABIC-INDIC DIGIT FIVE + 0x06F6: "CONTEXTO", # EXTENDED ARABIC-INDIC DIGIT SIX + 0x06F7: "CONTEXTO", # EXTENDED ARABIC-INDIC DIGIT SEVEN + 0x06F8: "CONTEXTO", # 
EXTENDED ARABIC-INDIC DIGIT EIGHT + 0x06F9: "CONTEXTO", # EXTENDED ARABIC-INDIC DIGIT NINE + 0x0640: "DISALLOWED", # ARABIC TATWEEL + 0x07FA: "DISALLOWED", # NKO LAJANYALAN + 0x302E: "DISALLOWED", # HANGUL SINGLE DOT TONE MARK + 0x302F: "DISALLOWED", # HANGUL DOUBLE DOT TONE MARK + 0x3031: "DISALLOWED", # VERTICAL KANA REPEAT MARK + 0x3032: "DISALLOWED", # VERTICAL KANA REPEAT WITH VOICED SOUND MARK + 0x3033: "DISALLOWED", # VERTICAL KANA REPEAT MARK UPPER HALF + 0x3034: "DISALLOWED", # VERTICAL KANA REPEAT WITH VOICED SOUND MARK UPPER HA + 0x3035: "DISALLOWED", # VERTICAL KANA REPEAT MARK LOWER HALF + 0x303B: "DISALLOWED", # VERTICAL IDEOGRAPHIC ITERATION MARK } backwardscompatible = {} @@ -88,33 +83,45 @@ backwardscompatible = {} def hexrange(start, end): return range(int(start, 16), int(end, 16) + 1) + def hexvalue(value): return int(value, 16) -class UnicodeVersion(object): +_RE_UNICODE = re.compile("\\\\u([0-9a-fA-F]{4})") +_RE_SURROGATE = re.compile("[\ud800-\udbff][\udc00-\udfff]") + + +def unicode_fixup(string): + """Replace backslash-u-XXXX with appropriate unicode characters.""" + return _RE_SURROGATE.sub( + lambda match: chr((ord(match.group(0)[0]) - 0xD800) * 0x400 + ord(match.group(0)[1]) - 0xDC00 + 0x10000), + _RE_UNICODE.sub(lambda match: chr(int(match.group(1), 16)), string), + ) + +class UnicodeVersion(object): def __init__(self, version): - result = re.match(r'^(?P<major>\d+)\.(?P<minor>\d+)\.(?P<patch>\d+)$', version) + result = re.match(r"^(?P<major>\d+)\.(?P<minor>\d+)\.(?P<patch>\d+)$", version) if result: - self.major = int(result.group('major')) - self.minor = int(result.group('minor')) - self.patch = int(result.group('patch')) + self.major = int(result.group("major")) + self.minor = int(result.group("minor")) + self.patch = int(result.group("patch")) self.numerical = (self.major << 8) + (self.minor << 4) + self.patch self.latest = False - elif version == 'latest': + elif version == "latest": self.latest = True else: - raise ValueError('Unrecognized Unicode version') + raise 
ValueError("Unrecognized Unicode version") def __repr__(self, with_date=True): if self.latest: if with_date: - return 'latest@{}'.format(datetime.datetime.now().strftime('%Y-%m-%d')) + return "latest@{}".format(datetime.datetime.now().strftime("%Y-%m-%d")) else: - return 'latest' + return "latest" else: - return '{}.{}.{}'.format(self.major, self.minor, self.patch) + return "{}.{}.{}".format(self.major, self.minor, self.patch) @property def tag(self): @@ -132,7 +139,6 @@ class UnicodeVersion(object): class UnicodeData(object): - def __init__(self, version, cache, args): self.version = UnicodeVersion(version) self.system_version = UnicodeVersion(unicodedata.unidata_version) @@ -141,8 +147,11 @@ class UnicodeData(object): self.max = 0 if self.system_version < self.version: - print('Warning: Character stability not guaranteed as Python Unicode data {}' - ' older than requested {}'.format(self.system_version, self.version)) + print( + "Warning: Character stability not guaranteed as Python Unicode data {} older than requested {}".format( + self.system_version, self.version + ) + ) self._load_unicodedata() self._load_proplist() @@ -153,174 +162,160 @@ class UnicodeData(object): self._load_arabicshaping() self._load_scripts() self._load_uts46mapping() + self._load_uts46testvectors() def _load_unicodedata(self): - - f_ud = self._ucdfile('UnicodeData.txt') + f_ud = self._ucdfile("UnicodeData.txt") self.ucd_data = {} range_begin = None for line in f_ud.splitlines(): - fields = line.split(';') + fields = line.split(";") value = int(fields[0], 16) - start_marker = re.match('^<(?P<name>.*?), First>$', fields[1]) - end_marker = re.match('^<(?P<name>.*?), Last>$', fields[1]) + start_marker = re.match("^<(?P<name>.*?), First>$", fields[1]) + end_marker = re.match("^<(?P<name>.*?), Last>$", fields[1]) if start_marker: range_begin = value elif end_marker: - for i in range(range_begin, value+1): - fields[1] = '<{}>'.format(end_marker.group('name')) + for i in range(range_begin, value + 1): + fields[1] = 
"<{}>".format(end_marker.group("name")) self.ucd_data[i] = fields[1:] range_begin = None else: self.ucd_data[value] = fields[1:] def _load_proplist(self): - - f_pl = self._ucdfile('PropList.txt') + f_pl = self._ucdfile("PropList.txt") self.ucd_props = collections.defaultdict(list) for line in f_pl.splitlines(): - result = re.match( - r'^(?P<start>[0-9A-F]{4,6})(|\.\.(?P<end>[0-9A-F]{4,6}))\s*;\s*(?P<prop>\S+)\s*(|\#.*)$', - line) + result = re.match(r"^(?P<start>[0-9A-F]{4,6})(|\.\.(?P<end>[0-9A-F]{4,6}))\s*;\s*(?P<prop>\S+)\s*(|\#.*)$", line) if result: - if result.group('end'): - for i in hexrange(result.group('start'), result.group('end')): - self.ucd_props[i].append(result.group('prop')) + if result.group("end"): + for i in hexrange(result.group("start"), result.group("end")): + self.ucd_props[i].append(result.group("prop")) else: - i = hexvalue(result.group('start')) - self.ucd_props[i].append(result.group('prop')) + i = hexvalue(result.group("start")) + self.ucd_props[i].append(result.group("prop")) def _load_derivedcoreprops(self): - - f_dcp = self._ucdfile('DerivedCoreProperties.txt') + f_dcp = self._ucdfile("DerivedCoreProperties.txt") for line in f_dcp.splitlines(): - result = re.match( - r'^(?P<start>[0-9A-F]{4,6})(|\.\.(?P<end>[0-9A-F]{4,6}))\s*;\s*(?P<prop>\S+)\s*(|\#.*)$', - line) + result = re.match(r"^(?P<start>[0-9A-F]{4,6})(|\.\.(?P<end>[0-9A-F]{4,6}))\s*;\s*(?P<prop>\S+)\s*(|\#.*)$", line) if result: - if result.group('end'): - for i in hexrange(result.group('start'), result.group('end')): - self.ucd_props[i].append(result.group('prop')) + if result.group("end"): + for i in hexrange(result.group("start"), result.group("end")): + self.ucd_props[i].append(result.group("prop")) else: - i = hexvalue(result.group('start')) - self.ucd_props[i].append(result.group('prop')) + i = hexvalue(result.group("start")) + self.ucd_props[i].append(result.group("prop")) def _load_blocks(self): - self.ucd_block = {} - f_b = self._ucdfile('Blocks.txt') + f_b = self._ucdfile("Blocks.txt") for line in f_b.splitlines(): - result = re.match( 
- r'^(?P<start>[0-9A-F]{4,6})\.\.(?P<end>[0-9A-F]{4,6})\s*;\s*(?P<block>.*)\s*$', - line) + result = re.match(r"^(?P<start>[0-9A-F]{4,6})\.\.(?P<end>[0-9A-F]{4,6})\s*;\s*(?P<block>.*)\s*$", line) if result: - for i in hexrange(result.group('start'), result.group('end')): - self.ucd_block[i] = result.group('block') + for i in hexrange(result.group("start"), result.group("end")): + self.ucd_block[i] = result.group("block") self.max = max(self.max, i) def _load_casefolding(self): - self.ucd_cf = {} - f_cf = self._ucdfile('CaseFolding.txt') + f_cf = self._ucdfile("CaseFolding.txt") for line in f_cf.splitlines(): - result = re.match( - r'^(?P<cp>[0-9A-F]{4,6})\s*;\s*(?P<type>\S+)\s*;\s*(?P<subst>[0-9A-F\s]+)\s*', - line) + result = re.match(r"^(?P<cp>[0-9A-F]{4,6})\s*;\s*(?P<type>\S+)\s*;\s*(?P<subst>[0-9A-F\s]+)\s*", line) if result: - if result.group('type') in ('C', 'F'): - self.ucd_cf[int(result.group('cp'), 16)] = \ - ''.join([chr(int(x, 16)) for x in result.group('subst').split(' ')]) + if result.group("type") in ("C", "F"): + self.ucd_cf[int(result.group("cp"), 16)] = "".join( + [chr(int(x, 16)) for x in result.group("subst").split(" ")] + ) def _load_hangulst(self): - self.ucd_hst = {} - f_hst = self._ucdfile('HangulSyllableType.txt') + f_hst = self._ucdfile("HangulSyllableType.txt") for line in f_hst.splitlines(): - result = re.match( - r'^(?P<start>[0-9A-F]{4,6})\.\.(?P<end>[0-9A-F]{4,6})\s*;\s*(?P<type>\S+)\s*(|\#.*)$', - line) + result = re.match(r"^(?P<start>[0-9A-F]{4,6})\.\.(?P<end>[0-9A-F]{4,6})\s*;\s*(?P<type>\S+)\s*(|\#.*)$", line) if result: - for i in hexrange(result.group('start'), result.group('end')): - self.ucd_hst[i] = result.group('type') + for i in hexrange(result.group("start"), result.group("end")): + self.ucd_hst[i] = result.group("type") def _load_arabicshaping(self): - self.ucd_as = {} - f_as = self._ucdfile('extracted/DerivedJoiningType.txt') + f_as = self._ucdfile("extracted/DerivedJoiningType.txt") for line in f_as.splitlines(): - result = re.match( - r'^(?P<start>[0-9A-F]{4,6})(|\.\.(?P<end>[0-9A-F]{4,6}))\s*;\s*(?P<jt>\S+)\s*(|\#.*)$', - line) + result = 
re.match(r"^(?P<start>[0-9A-F]{4,6})(|\.\.(?P<end>[0-9A-F]{4,6}))\s*;\s*(?P<jt>\S+)\s*(|\#.*)$", line) if result: - if result.group('end'): - for i in hexrange(result.group('start'), result.group('end')): - self.ucd_as[i] = result.group('jt') + if result.group("end"): + for i in hexrange(result.group("start"), result.group("end")): + self.ucd_as[i] = result.group("jt") else: - i = hexvalue(result.group('start')) - self.ucd_as[i] = result.group('jt') + i = hexvalue(result.group("start")) + self.ucd_as[i] = result.group("jt") def _load_scripts(self): - self.ucd_s = {} - f_s = self._ucdfile('Scripts.txt') + f_s = self._ucdfile("Scripts.txt") for line in f_s.splitlines(): - result = re.match( - r'^(?P[0-9A-F]{4,6})(|\.\.(?P[0-9A-F]{4,6}))\s*;\s*(?P