From 29893164c54cb9c917b85d66a85f3537c45f4002 Mon Sep 17 00:00:00 2001 From: arthurgousset <46296830+arthurgousset@users.noreply.github.com> Date: Thu, 29 May 2025 23:07:12 +0100 Subject: [PATCH] repro(DO NOT MERGE): cyrillic bug --- crates/milli/tests/cyrillic_bug_test.rs | 70 +++++++++++++++++++++++++ 1 file changed, 70 insertions(+) create mode 100644 crates/milli/tests/cyrillic_bug_test.rs diff --git a/crates/milli/tests/cyrillic_bug_test.rs b/crates/milli/tests/cyrillic_bug_test.rs new file mode 100644 index 0000000000..d7a7b2e76a --- /dev/null +++ b/crates/milli/tests/cyrillic_bug_test.rs @@ -0,0 +1,70 @@ +// Test to validate the Cyrillic typo tolerance bug exists by demonstrating the problematic logic +#[test] +fn test_cyrillic_char_count_bug() { + // Test the key insight from the RCA: word.len() vs word.chars().count() + + // ASCII word "doggy" (5 chars, 5 bytes) + let ascii_word = "doggy"; + let ascii_byte_len = ascii_word.len(); + let ascii_char_count = ascii_word.chars().count(); + + // Cyrillic word "собак" (5 chars, 10 bytes) + let cyrillic_word = "собак"; + let cyrillic_byte_len = cyrillic_word.len(); + let cyrillic_char_count = cyrillic_word.chars().count(); + + eprintln!("ASCII '{}': byte_len={}, char_count={}", ascii_word, ascii_byte_len, ascii_char_count); + eprintln!("Cyrillic '{}': byte_len={}, char_count={}", cyrillic_word, cyrillic_byte_len, cyrillic_char_count); + + // Simulate the buggy logic with default settings (oneTypo=5, twoTypos=9) + let min_len_one_typo = 5; + let min_len_two_typos = 9; + + // Current buggy implementation uses word.len() (byte count) + let ascii_typos_buggy = if ascii_byte_len < min_len_one_typo { + 0 + } else if ascii_byte_len < min_len_two_typos { + 1 + } else { + 2 + }; + + let cyrillic_typos_buggy = if cyrillic_byte_len < min_len_one_typo { + 0 + } else if cyrillic_byte_len < min_len_two_typos { + 1 + } else { + 2 + }; + + eprintln!("Buggy logic (using byte count):"); + eprintln!(" ASCII '{}' gets {} typos", ascii_word, ascii_typos_buggy); + eprintln!(" Cyrillic '{}' gets {} typos", cyrillic_word, cyrillic_typos_buggy); + + // Correct implementation should use word.chars().count() + let ascii_typos_correct = if ascii_char_count < min_len_one_typo { + 0 + } else if ascii_char_count < min_len_two_typos { + 1 + } else { + 2 + }; + + let cyrillic_typos_correct = if cyrillic_char_count < min_len_one_typo { + 0 + } else if cyrillic_char_count < min_len_two_typos { + 1 + } else { + 2 + }; + + eprintln!("Correct logic (using character count):"); + eprintln!(" ASCII '{}' gets {} typos", ascii_word, ascii_typos_correct); + eprintln!(" Cyrillic '{}' gets {} typos", cyrillic_word, cyrillic_typos_correct); + + // **THE BUG**: ASCII and Cyrillic should get same typos (both have 5 chars) + // But buggy implementation gives them different typos - THIS IS THE FAILING ASSERTION + assert_eq!(ascii_typos_buggy, cyrillic_typos_buggy, + "BUG REPRODUCED: ASCII and Cyrillic words with same character count get different typo tolerance due to byte counting bug. ASCII: {}, Cyrillic: {}", + ascii_typos_buggy, cyrillic_typos_buggy); +}