From a97388a047666d4a3f5c399b992071fb432e4d58 Mon Sep 17 00:00:00 2001 From: Manish Goregaokar Date: Tue, 7 May 2024 08:46:56 -0700 Subject: [PATCH] Add safety comments --- src/normalize.rs | 29 ++++++++++++++++++++++++++--- 1 file changed, 26 insertions(+), 3 deletions(-) diff --git a/src/normalize.rs b/src/normalize.rs index e878642..e59b667 100644 --- a/src/normalize.rs +++ b/src/normalize.rs @@ -71,6 +71,7 @@ where } #[inline] +#[allow(unsafe_code)] fn decompose(c: char, decompose_char: D, mut emit_char: F) where D: Fn(char) -> Option<&'static [char]>, @@ -84,7 +85,10 @@ where // Perform decomposition for Hangul if is_hangul_syllable(c) { - decompose_hangul(c, emit_char); + // Safety: Hangul Syllables invariant checked by is_hangul_syllable above + unsafe { + decompose_hangul(c, emit_char); + } return; } @@ -127,27 +131,37 @@ const T_LAST: u32 = T_BASE + T_COUNT - 1; // i.e. `T_BASE + 1 ..= T_LAST`. const T_FIRST: u32 = T_BASE + 1; +// Safety-usable invariant: This ensures that c is a valid Hangul Syllable character (U+AC00..U+D7AF) pub(crate) fn is_hangul_syllable(c: char) -> bool { + // Safety: This checks the range 0xAC00 (S_BASE) to 0xD7A4 (S_BASE + S_COUNT), upholding the safety-usable invariant (c as u32) >= S_BASE && (c as u32) < (S_BASE + S_COUNT) } // Decompose a precomposed Hangul syllable -#[allow(unsafe_code)] +// Safety: `s` MUST be a valid Hangul Syllable character, between U+AC00..U+D7AF +#[allow(unsafe_code, unused_unsafe)] #[inline(always)] -fn decompose_hangul(s: char, mut emit_char: F) +unsafe fn decompose_hangul(s: char, mut emit_char: F) where F: FnMut(char), { + // This will be at most 0x2baf, the size of the Hangul Syllables block let s_index = s as u32 - S_BASE; + // This will be at most 0x2baf / (21 * 28), 19 let l_index = s_index / N_COUNT; unsafe { + // Safety: L_BASE (0x1100) plus at most 19 is still going to be in range for a valid Unicode code point in the BMP (< 0xD800) emit_char(char::from_u32_unchecked(L_BASE + l_index)); + // Safety: This will be at most (N_COUNT - 1) / T_COUNT = (V*T - 1) / T, which gives us an upper bound of V_COUNT = 21 let v_index = (s_index % N_COUNT) / T_COUNT; + // Safety: V_BASE (0x1161) plus at most 21 is still going to be in range for a valid Unicode code point in the BMP (< 0xD800) emit_char(char::from_u32_unchecked(V_BASE + v_index)); + // Safety: This will be at most T_COUNT - 1 (27) let t_index = s_index % T_COUNT; if t_index > 0 { + // Safety: T_BASE (0x11A7) plus at most 27 is still going to be in range for a valid Unicode code point in the BMP (< 0xD800) emit_char(char::from_u32_unchecked(T_BASE + t_index)); } } @@ -173,14 +187,23 @@ fn compose_hangul(a: char, b: char) -> Option { match (a, b) { // Compose a leading consonant and a vowel together into an LV_Syllable (L_BASE..=L_LAST, V_BASE..=V_LAST) => { + // Safety: based on the above bounds, l_index will be less than or equal to L_COUNT (19) + // and v_index will be <= V_COUNT (21) let l_index = a - L_BASE; let v_index = b - V_BASE; + // Safety: This will be <= 19 * (20 * 21) + (21 * 20), which is 8400. let lv_index = l_index * N_COUNT + v_index * T_COUNT; + // Safety: This is between 0xAC00 and 0xCCD0, which are in range for Hangul Syllables (U+AC00..U+D7AF) and also in range + // for BMP unicode let s = S_BASE + lv_index; + // Safety: We've verified this is in-range Some(unsafe { char::from_u32_unchecked(s) }) } // Compose an LV_Syllable and a trailing consonant into an LVT_Syllable (S_BASE..=S_LAST, T_FIRST..=T_LAST) if (a - S_BASE) % T_COUNT == 0 => { + // Safety: a is between 0xAC00 and (0xAC00 + 19 * 21 * 28). b - T_BASE is between 0 and 19. + // Adding a number 0 to 19 to a number that is at largest 0xD7A4 will not go out of bounds to 0xD800 (where the + // surrogates start), so this is safe. Some(unsafe { char::from_u32_unchecked(a + (b - T_BASE)) }) } _ => None,