Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Ensure tables.rs passes rustfmt #96

Merged
merged 2 commits into from
Mar 4, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions .github/workflows/rust.yml
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,9 @@ jobs:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v3
- uses: actions/setup-python@v5
with:
python-version: '3.12'
- name: Regen
run: cd scripts && python3 unicode.py
- name: Diff tables
Expand Down
59 changes: 27 additions & 32 deletions scripts/unicode.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
# out-of-line and check the tables.rs and normalization_tests.rs files into git.
import collections
import urllib.request
from itertools import batched

UNICODE_VERSION = "15.1.0"
UCD_URL = "https://www.unicode.org/Public/%s/ucd/" % UNICODE_VERSION
Expand Down Expand Up @@ -354,20 +355,26 @@ def is_first_and_last(first, last):
return False
return first[1:-8] == last[1:-7]

def gen_mph_data(name, d, kv_type, kv_callback):
def gen_mph_data(name, d, kv_type, kv_callback, kv_row_width):
(salt, keys) = minimal_perfect_hash(d)
out.write("pub(crate) const %s_SALT: &[u16] = &[\n" % name.upper())
for s in salt:
out.write(" 0x{:x},\n".format(s))
out.write(f"\npub(crate) const {name.upper()}_SALT: &[u16] = &[\n")
for s_row in batched(salt, 13):
out.write(" ")
for s in s_row:
out.write(f" 0x{s:03X},")
out.write("\n")
out.write("];\n")
out.write(f"pub(crate) const {name.upper()}_KV: &[{kv_type}] = &[\n")
for k_row in batched(keys, kv_row_width):
out.write(" ")
for k in k_row:
out.write(f" {kv_callback(k)},")
out.write("\n")
out.write("];\n")
out.write("pub(crate) const {}_KV: &[{}] = &[\n".format(name.upper(), kv_type))
for k in keys:
out.write(" {},\n".format(kv_callback(k)))
out.write("];\n\n")

def gen_combining_class(combining_classes, out):
gen_mph_data('canonical_combining_class', combining_classes, 'u32',
lambda k: "0x{:X}".format(int(combining_classes[k]) | (k << 8)))
lambda k: f"0x{int(combining_classes[k]) | (k << 8):07X}", 8)

def gen_composition_table(canon_comp, out):
table = {}
Expand All @@ -376,7 +383,7 @@ def gen_composition_table(canon_comp, out):
table[(c1 << 16) | c2] = c3
(salt, keys) = minimal_perfect_hash(table)
gen_mph_data('COMPOSITION_TABLE', table, '(u32, char)',
lambda k: "(0x%s, '\\u{%s}')" % (hexify(k), hexify(table[k])))
lambda k: f"(0x{k:08X}, '\\u{{{table[k]:06X}}}')", 1)

out.write("pub(crate) fn composition_table_astral(c1: char, c2: char) -> Option<char> {\n")
out.write(" match (c1, c2) {\n")
Expand All @@ -403,7 +410,7 @@ def gen_decomposition_tables(canon_decomp, compat_decomp, cjk_compat_variants_de
assert offset < 65536
out.write("];\n")
gen_mph_data(name + '_decomposed', table, "(u32, (u16, u16))",
lambda k: "(0x{:x}, ({}, {}))".format(k, offsets[k], len(table[k])))
lambda k: f"(0x{k:05X}, (0x{offsets[k]:03X}, 0x{len(table[k]):X}))", 1)

def gen_qc_match(prop_table, out):
out.write(" match c {\n")
Expand All @@ -412,7 +419,7 @@ def gen_qc_match(prop_table, out):
assert data in ('N', 'M')
result = "No" if data == 'N' else "Maybe"
if high:
out.write(r" '\u{%s}'...'\u{%s}' => %s," % (low, high, result))
out.write(r" '\u{%s}'..='\u{%s}' => %s," % (low, high, result))
else:
out.write(r" '\u{%s}' => %s," % (low, result))
out.write("\n")
Expand All @@ -421,7 +428,7 @@ def gen_qc_match(prop_table, out):
out.write(" }\n")

def gen_nfc_qc(prop_tables, out):
out.write("#[inline]\n")
out.write("\n#[inline]\n")
out.write("#[allow(ellipsis_inclusive_range_patterns)]\n")
out.write("pub fn qc_nfc(c: char) -> IsNormalized {\n")
gen_qc_match(prop_tables['NFC_QC'], out)
Expand Down Expand Up @@ -450,7 +457,7 @@ def gen_nfkd_qc(prop_tables, out):

def gen_combining_mark(general_category_mark, out):
gen_mph_data('combining_mark', general_category_mark, 'u32',
lambda k: '0x{:04x}'.format(k))
lambda k: '0x{:05X}'.format(k), 10)

def gen_public_assigned(general_category_public_assigned, out):
# This could be done as a hash but the table is somewhat small.
Expand All @@ -464,17 +471,16 @@ def gen_public_assigned(general_category_public_assigned, out):
out.write(" ")
start = False
else:
out.write(" | ")
out.write("\n | ")
if first == last:
out.write("'\\u{%s}'\n" % hexify(first))
out.write("'\\u{%s}'" % hexify(first))
else:
out.write("'\\u{%s}'..='\\u{%s}'\n" % (hexify(first), hexify(last)))
out.write(" => true,\n")
out.write("'\\u{%s}'..='\\u{%s}'" % (hexify(first), hexify(last)))
out.write(" => true,\n")

out.write(" _ => false,\n")
out.write(" }\n")
out.write("}\n")
out.write("\n")

def gen_stream_safe(leading, trailing, out):
# This could be done as a hash but the table is very small.
Expand All @@ -488,10 +494,9 @@ def gen_stream_safe(leading, trailing, out):
out.write(" _ => 0,\n")
out.write(" }\n")
out.write("}\n")
out.write("\n")

gen_mph_data('trailing_nonstarters', trailing, 'u32',
lambda k: "0x{:X}".format(int(trailing[k]) | (k << 8)))
lambda k: f"0x{int(trailing[k]) | (k << 8):07X}", 8)

def gen_tests(tests, out):
out.write("""#[derive(Debug)]
Expand Down Expand Up @@ -579,43 +584,33 @@ def minimal_perfect_hash(d):
data = UnicodeData()
with open("tables.rs", "w", newline = "\n") as out:
out.write(PREAMBLE)
out.write("#![cfg_attr(rustfmt, rustfmt::skip)]\n")
out.write("use crate::quick_check::IsNormalized;\n")
out.write("use crate::quick_check::IsNormalized::*;\n")
out.write("\n")

version = "(%s, %s, %s)" % tuple(UNICODE_VERSION.split("."))
out.write("#[allow(unused)]\n")
out.write("pub const UNICODE_VERSION: (u8, u8, u8) = %s;\n\n" % version)
out.write("pub const UNICODE_VERSION: (u8, u8, u8) = %s;\n" % version)

gen_combining_class(data.combining_classes, out)
out.write("\n")

gen_composition_table(data.canon_comp, out)
out.write("\n")

gen_decomposition_tables(data.canon_fully_decomp, data.compat_fully_decomp, data.cjk_compat_variants_fully_decomp, out)

gen_combining_mark(data.general_category_mark, out)
out.write("\n")

gen_public_assigned(data.general_category_public_assigned, out)
out.write("\n")

gen_nfc_qc(data.norm_props, out)
out.write("\n")

gen_nfkc_qc(data.norm_props, out)
out.write("\n")

gen_nfd_qc(data.norm_props, out)
out.write("\n")

gen_nfkd_qc(data.norm_props, out)
out.write("\n")

gen_stream_safe(data.ss_leading, data.ss_trailing, out)
out.write("\n")

with open("normalization_tests.rs", "w", newline = "\n") as out:
out.write(PREAMBLE)
Expand Down
2 changes: 0 additions & 2 deletions src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -72,8 +72,6 @@ mod quick_check;
mod recompose;
mod replace;
mod stream_safe;

#[rustfmt::skip]
mod tables;

#[doc(hidden)]
Expand Down
10 changes: 5 additions & 5 deletions src/normalize.rs
Original file line number Diff line number Diff line change
Expand Up @@ -123,8 +123,8 @@ const L_LAST: u32 = L_BASE + L_COUNT - 1;
const V_LAST: u32 = V_BASE + V_COUNT - 1;
const T_LAST: u32 = T_BASE + T_COUNT - 1;

// Composition only occurs for `TPart`s in `U+11A8 ... U+11C2`,
// i.e. `T_BASE + 1 ... T_LAST`.
// Composition only occurs for `TPart`s in `U+11A8 ..= U+11C2`,
// i.e. `T_BASE + 1 ..= T_LAST`.
const T_FIRST: u32 = T_BASE + 1;

pub(crate) fn is_hangul_syllable(c: char) -> bool {
Expand Down Expand Up @@ -172,15 +172,15 @@ fn compose_hangul(a: char, b: char) -> Option<char> {
let (a, b) = (a as u32, b as u32);
match (a, b) {
// Compose a leading consonant and a vowel together into an LV_Syllable
(L_BASE...L_LAST, V_BASE...V_LAST) => {
(L_BASE..=L_LAST, V_BASE..=V_LAST) => {
let l_index = a - L_BASE;
let v_index = b - V_BASE;
let lv_index = l_index * N_COUNT + v_index * T_COUNT;
let s = S_BASE + lv_index;
Some(unsafe { char::from_u32_unchecked(s) })
}
// Compose an LV_Syllable and a trailing consonant into an LVT_Syllable
(S_BASE...S_LAST, T_FIRST...T_LAST) if (a - S_BASE) % T_COUNT == 0 => {
(S_BASE..=S_LAST, T_FIRST..=T_LAST) if (a - S_BASE) % T_COUNT == 0 => {
Some(unsafe { char::from_u32_unchecked(a + (b - T_BASE)) })
}
_ => None,
Expand All @@ -193,7 +193,7 @@ mod tests {

// Regression test from a bugfix where we were composing an LV_Syllable with
// T_BASE directly. (We should only compose an LV_Syllable with a character
// in the range `T_BASE + 1 ... T_LAST`.)
// in the range `T_BASE + 1 ..= T_LAST`.)
#[test]
fn test_hangul_composition() {
assert_eq!(compose_hangul('\u{c8e0}', '\u{11a7}'), None);
Expand Down
Loading
Loading