diff --git a/Cargo.lock b/Cargo.lock index d3a59a95d00..5a9af63f9cf 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -990,6 +990,17 @@ dependencies = [ "windows-sys 0.59.0", ] +[[package]] +name = "fixed_decimal" +version = "0.7.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "35943d22b2f19c0cb198ecf915910a8158e94541c89dcc63300d7799d46c2c5e" +dependencies = [ + "displaydoc", + "smallvec", + "writeable", +] + [[package]] name = "flate2" version = "1.1.2" @@ -1290,6 +1301,29 @@ dependencies = [ "zerovec", ] +[[package]] +name = "icu_decimal" +version = "2.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fec61c43fdc4e368a9f450272833123a8ef0d7083a44597660ce94d791b8a2e2" +dependencies = [ + "displaydoc", + "fixed_decimal", + "icu_decimal_data", + "icu_locale", + "icu_locale_core", + "icu_provider", + "tinystr", + "writeable", + "zerovec", +] + +[[package]] +name = "icu_decimal_data" +version = "2.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b70963bc35f9bdf1bc66a5c1f458f4991c1dc71760e00fa06016b2c76b2738d5" + [[package]] name = "icu_locale" version = "2.0.0" @@ -3946,7 +3980,9 @@ dependencies = [ "glob", "hex", "icu_collator", + "icu_decimal", "icu_locale", + "icu_provider", "itertools 0.14.0", "libc", "md-5", diff --git a/Cargo.toml b/Cargo.toml index 5d9479bc8f4..0ec32fb39b9 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -314,7 +314,9 @@ glob = "0.3.1" half = "2.4.1" hostname = "0.4" icu_collator = "2.0.0" +icu_decimal = "2.0.0" icu_locale = "2.0.0" +icu_provider = "2.0.0" indicatif = "0.18.0" itertools = "0.14.0" jiff = { version = "0.2.10", default-features = false, features = [ diff --git a/src/uu/expr/Cargo.toml b/src/uu/expr/Cargo.toml index 00e3e3cab03..54b831bc487 100644 --- a/src/uu/expr/Cargo.toml +++ b/src/uu/expr/Cargo.toml @@ -22,7 +22,7 @@ clap = { workspace = true } num-bigint = { workspace = true } num-traits = { workspace = true } onig = { workspace = true 
} -uucore = { workspace = true } +uucore = { workspace = true, features = ["i18n-collator"] } thiserror = { workspace = true } [[bin]] diff --git a/src/uu/expr/locales/en-US.ftl b/src/uu/expr/locales/en-US.ftl index 8d26566e9a6..2c09eee7840 100644 --- a/src/uu/expr/locales/en-US.ftl +++ b/src/uu/expr/locales/en-US.ftl @@ -63,3 +63,4 @@ expr-error-unmatched-opening-brace = Unmatched {"\\{"} expr-error-invalid-bracket-content = Invalid content of {"\\{\\}"} expr-error-trailing-backslash = Trailing backslash expr-error-too-big-range-quantifier-index = Regular expression too big +expr-error-match-utf8 = match does not support invalid UTF-8 encoding in { $arg } diff --git a/src/uu/expr/locales/fr-FR.ftl b/src/uu/expr/locales/fr-FR.ftl index f496b270ce7..b529db9d523 100644 --- a/src/uu/expr/locales/fr-FR.ftl +++ b/src/uu/expr/locales/fr-FR.ftl @@ -63,3 +63,4 @@ expr-error-unmatched-opening-brace = Accolade ouvrante {"\\{"} non appariée expr-error-invalid-bracket-content = Contenu invalide de {"\\{\\}"} expr-error-trailing-backslash = Barre oblique inverse en fin expr-error-too-big-range-quantifier-index = Expression régulière trop grande +expr-error-match-utf8 = match ne supporte pas l'encodage UTF-8 invalide dans { $arg } diff --git a/src/uu/expr/src/expr.rs b/src/uu/expr/src/expr.rs index 7225f986cf9..4db419efaf3 100644 --- a/src/uu/expr/src/expr.rs +++ b/src/uu/expr/src/expr.rs @@ -5,15 +5,18 @@ use clap::{Arg, ArgAction, Command}; use std::collections::HashMap; +use std::io::Write; use syntax_tree::{AstNode, is_truthy}; use thiserror::Error; use uucore::locale::{get_message, get_message_with_args}; +use uucore::os_string_to_vec; use uucore::{ display::Quotable, error::{UError, UResult}, format_usage, }; +mod locale_aware; mod syntax_tree; mod options { @@ -54,6 +57,8 @@ pub enum ExprError { TrailingBackslash, #[error("{}", get_message("expr-error-too-big-range-quantifier-index"))] TooBigRangeQuantifierIndex, + #[error("{}", 
get_message_with_args("expr-error-match-utf8", HashMap::from([("arg".to_string(), _0.quote().to_string())])))] + UnsupportedNonUtf8Match(String), } impl UError for ExprError { @@ -98,25 +103,27 @@ pub fn uu_app() -> Command { pub fn uumain(args: impl uucore::Args) -> UResult<()> { // For expr utility we do not want getopts. // The following usage should work without escaping hyphens: `expr -15 = 1 + 2 \* \( 3 - -4 \)` - let args: Vec = args + let args = args .skip(1) // Skip binary name - .map(|a| a.to_string_lossy().to_string()) - .collect(); + .map(os_string_to_vec) + .collect::, _>>()?; - if args.len() == 1 && args[0] == "--help" { + if args.len() == 1 && args[0] == b"--help" { let _ = uu_app().print_help(); - } else if args.len() == 1 && args[0] == "--version" { + } else if args.len() == 1 && args[0] == b"--version" { println!("{} {}", uucore::util_name(), uucore::crate_version!()); } else { // The first argument may be "--" and should be be ignored. - let args = if !args.is_empty() && args[0] == "--" { + let args = if !args.is_empty() && args[0] == b"--" { &args[1..] } else { &args }; - let res: String = AstNode::parse(args)?.eval()?.eval_as_string(); - println!("{res}"); + let res = AstNode::parse(args)?.eval()?.eval_as_string(); + let _ = std::io::stdout().write_all(&res); + let _ = std::io::stdout().write_all(b"\n"); + if !is_truthy(&res.into()) { return Err(1.into()); } diff --git a/src/uu/expr/src/locale_aware.rs b/src/uu/expr/src/locale_aware.rs new file mode 100644 index 00000000000..c8a4e73e33e --- /dev/null +++ b/src/uu/expr/src/locale_aware.rs @@ -0,0 +1,111 @@ +// This file is part of the uutils coreutils package. +// +// For the full copyright and license information, please view the LICENSE +// file that was distributed with this source code. 
+ +use std::cmp::Ordering; + +use uucore::{ + CharByte, IntoCharByteIterator, + i18n::{ + UEncoding, + collator::{AlternateHandling, CollatorOptions, locale_cmp, try_init_collator}, + get_locale_encoding, + }, +}; + +use crate::syntax_tree::{MaybeNonUtf8Str, MaybeNonUtf8String}; + +/// Perform a locale-aware string comparison using the current locale's +/// collator. +pub(crate) fn locale_comparison(a: &MaybeNonUtf8Str, b: &MaybeNonUtf8Str) -> Ordering { + // Initialize the collator; the result is ignored because it may already be initialized + let mut opts = CollatorOptions::default(); + opts.alternate_handling = Some(AlternateHandling::Shifted); // NOTE(review): presumably makes whitespace/punctuation ignorable when collating — confirm + let _ = try_init_collator(opts); + + locale_cmp(a, b) +} + +/// Return the 1-based position of the first element of `left` that also occurs +/// in `right`, or 0 if there is none; elements are chars or bytes per `encoding`. +fn index_with_locale( + left: &MaybeNonUtf8Str, + right: &MaybeNonUtf8Str, + encoding: UEncoding, +) -> usize { + match encoding { + UEncoding::Utf8 => { + // In the UTF-8 case, we try to decode the strings on the fly. We + // compare UTF-8 characters as long as the stream is valid, and + // switch to byte comparison for bytes that are not valid UTF-8. + left.iter_char_bytes() + .position(|ch_h| right.iter_char_bytes().any(|ch_n| ch_n == ch_h)) + .map_or(0, |idx| idx + 1) + } + UEncoding::Ascii => { + // In the default case, we just perform byte-wise comparison on the + // arrays. + left.iter() + .position(|ch_h| right.iter().any(|ch_n| ch_n == ch_h)) + .map_or(0, |idx| idx + 1) + } + } +} + +/// Perform an index search (1-based result, 0 when nothing matches) using the +/// encoding of the current locale. +pub(crate) fn locale_aware_index(left: &MaybeNonUtf8Str, right: &MaybeNonUtf8Str) -> usize { + index_with_locale(left, right, get_locale_encoding()) +} + +/// Perform a string length calculation depending on the current locale. In +/// UTF-8 locale, it will count valid UTF-8 chars, and fallback to counting +/// bytes otherwise. In Non UTF-8 locale, directly return input byte length. 
+pub(crate) fn locale_aware_length(input: &MaybeNonUtf8Str) -> usize { + match get_locale_encoding() { + UEncoding::Utf8 => std::str::from_utf8(input).map_or(input.len(), |s| s.chars().count()), + UEncoding::Ascii => input.len(), + } +} + +fn substr_with_locale( + s: MaybeNonUtf8String, + pos: usize, + len: usize, + encoding: UEncoding, +) -> MaybeNonUtf8String { + match encoding { + UEncoding::Utf8 => { + // Pre-size the buffer assuming 1-byte chars, capped at the input size: + // `len` is user-controlled and the result is never longer than `s`. + let mut string = MaybeNonUtf8String::with_capacity(len.min(s.len())); + let mut buf = [0; 4]; + + // Skip `pos` char-bytes, then copy at most `len` of them: + // valid chars are re-encoded as UTF-8, invalid bytes are + // copied verbatim. + for cb in s.iter_char_bytes().skip(pos).take(len) { + match cb { + CharByte::Char(c) => { + let n = c.encode_utf8(&mut buf).len(); + string.extend(&buf[..n]); + } + CharByte::Byte(b) => string.push(b), + } + } + string + } + UEncoding::Ascii => s.into_iter().skip(pos).take(len).collect(), + } +} + +/// Given a byte sequence, a position and a length, return the corresponding +/// substring depending on the current locale. 
+pub(crate) fn locale_aware_substr( + s: MaybeNonUtf8String, + pos: usize, + len: usize, +) -> MaybeNonUtf8String { + substr_with_locale(s, pos, len, get_locale_encoding()) +} diff --git a/src/uu/expr/src/syntax_tree.rs b/src/uu/expr/src/syntax_tree.rs index b0ae0142f92..f2e56717368 100644 --- a/src/uu/expr/src/syntax_tree.rs +++ b/src/uu/expr/src/syntax_tree.rs @@ -7,11 +7,19 @@ use std::{cell::Cell, collections::BTreeMap}; -use num_bigint::{BigInt, ParseBigIntError}; +use num_bigint::BigInt; use num_traits::ToPrimitive; use onig::{Regex, RegexOptions, Syntax}; -use crate::{ExprError, ExprResult}; +use crate::{ + ExprError, ExprResult, + locale_aware::{ + locale_aware_index, locale_aware_length, locale_aware_substr, locale_comparison, + }, +}; + +pub(crate) type MaybeNonUtf8String = Vec; +pub(crate) type MaybeNonUtf8Str = [u8]; #[derive(Debug, Clone, Copy, PartialEq, Eq)] pub enum BinOp { @@ -63,29 +71,27 @@ impl BinOp { impl RelationOp { fn eval(&self, a: ExprResult, b: ExprResult) -> ExprResult { + // Make sure that the given comparison validates the relational operator. 
+ let check_cmp = |cmp| { + use RelationOp::{Eq, Geq, Gt, Leq, Lt, Neq}; + use std::cmp::Ordering::{Equal, Greater, Less}; + matches!( + (self, cmp), + (Lt | Leq | Neq, Less) | (Leq | Eq | Geq, Equal) | (Gt | Geq | Neq, Greater) + ) + }; + let a = a?; let b = b?; - let b = if let (Ok(a), Ok(b)) = (&a.to_bigint(), &b.to_bigint()) { - match self { - Self::Lt => a < b, - Self::Leq => a <= b, - Self::Eq => a == b, - Self::Neq => a != b, - Self::Gt => a > b, - Self::Geq => a >= b, - } + let b = if let (Some(a), Some(b)) = (&a.to_bigint(), &b.to_bigint()) { + check_cmp(a.cmp(b)) } else { // These comparisons should be using locale settings + let a = a.eval_as_string(); let b = b.eval_as_string(); - match self { - Self::Lt => a < b, - Self::Leq => a <= b, - Self::Eq => a == b, - Self::Neq => a != b, - Self::Gt => a > b, - Self::Geq => a >= b, - } + + check_cmp(locale_comparison(&a, &b)) }; if b { Ok(1.into()) } else { Ok(0.into()) } } @@ -147,8 +153,17 @@ impl StringOp { Ok(left) } Self::Match => { - let left = left?.eval_as_string(); - let right = right?.eval_as_string(); + let left = String::from_utf8(left?.eval_as_string()).map_err(|u| { + ExprError::UnsupportedNonUtf8Match( + String::from_utf8_lossy(u.as_bytes()).into_owned(), + ) + })?; + let right = String::from_utf8(right?.eval_as_string()).map_err(|u| { + ExprError::UnsupportedNonUtf8Match( + String::from_utf8_lossy(u.as_bytes()).into_owned(), + ) + })?; + check_posix_regex_errors(&right)?; // Transpile the input pattern from BRE syntax to `onig` crate's `Syntax::grep` @@ -237,14 +252,8 @@ impl StringOp { Self::Index => { let left = left?.eval_as_string(); let right = right?.eval_as_string(); - for (current_idx, ch_h) in left.chars().enumerate() { - for ch_n in right.to_string().chars() { - if ch_n == ch_h { - return Ok((current_idx + 1).into()); - } - } - } - Ok(0.into()) + + Ok(locale_aware_index(&left, &right).into()) } } } @@ -361,33 +370,33 @@ fn check_posix_regex_errors(pattern: &str) -> ExprResult<()> { } 
/// Precedence for infix binary operators -const PRECEDENCE: &[&[(&str, BinOp)]] = &[ - &[("|", BinOp::String(StringOp::Or))], - &[("&", BinOp::String(StringOp::And))], +const PRECEDENCE: &[&[(&MaybeNonUtf8Str, BinOp)]] = &[ + &[(b"|", BinOp::String(StringOp::Or))], + &[(b"&", BinOp::String(StringOp::And))], &[ - ("<", BinOp::Relation(RelationOp::Lt)), - ("<=", BinOp::Relation(RelationOp::Leq)), - ("=", BinOp::Relation(RelationOp::Eq)), - ("!=", BinOp::Relation(RelationOp::Neq)), - (">=", BinOp::Relation(RelationOp::Geq)), - (">", BinOp::Relation(RelationOp::Gt)), + (b"<", BinOp::Relation(RelationOp::Lt)), + (b"<=", BinOp::Relation(RelationOp::Leq)), + (b"=", BinOp::Relation(RelationOp::Eq)), + (b"!=", BinOp::Relation(RelationOp::Neq)), + (b">=", BinOp::Relation(RelationOp::Geq)), + (b">", BinOp::Relation(RelationOp::Gt)), ], &[ - ("+", BinOp::Numeric(NumericOp::Add)), - ("-", BinOp::Numeric(NumericOp::Sub)), + (b"+", BinOp::Numeric(NumericOp::Add)), + (b"-", BinOp::Numeric(NumericOp::Sub)), ], &[ - ("*", BinOp::Numeric(NumericOp::Mul)), - ("/", BinOp::Numeric(NumericOp::Div)), - ("%", BinOp::Numeric(NumericOp::Mod)), + (b"*", BinOp::Numeric(NumericOp::Mul)), + (b"/", BinOp::Numeric(NumericOp::Div)), + (b"%", BinOp::Numeric(NumericOp::Mod)), ], - &[(":", BinOp::String(StringOp::Match))], + &[(b":", BinOp::String(StringOp::Match))], ]; #[derive(Debug, Clone, PartialEq, Eq)] pub enum NumOrStr { Num(BigInt), - Str(String), + Str(MaybeNonUtf8String), } impl From for NumOrStr { @@ -404,30 +413,37 @@ impl From for NumOrStr { impl From for NumOrStr { fn from(str: String) -> Self { + Self::Str(str.into()) + } +} + +impl From for NumOrStr { + fn from(str: MaybeNonUtf8String) -> Self { Self::Str(str) } } impl NumOrStr { - pub fn to_bigint(&self) -> Result { + pub fn to_bigint(&self) -> Option { match self { - Self::Num(num) => Ok(num.clone()), - Self::Str(str) => str.parse::(), + Self::Num(num) => Some(num.clone()), + Self::Str(str) => 
std::str::from_utf8(str).ok()?.parse::().ok(), } } pub fn eval_as_bigint(self) -> ExprResult { match self { Self::Num(num) => Ok(num), - Self::Str(str) => str + Self::Str(str) => String::from_utf8(str) + .map_err(|_| ExprError::NonIntegerArgument)? .parse::() .map_err(|_| ExprError::NonIntegerArgument), } } - pub fn eval_as_string(self) -> String { + pub fn eval_as_string(self) -> MaybeNonUtf8String { match self { - Self::Num(num) => num.to_string(), + Self::Num(num) => num.to_string().into(), Self::Str(str) => str, } } @@ -447,7 +463,7 @@ pub enum AstNodeInner { value: NumOrStr, }, Leaf { - value: String, + value: MaybeNonUtf8String, }, BinOp { op_type: BinOp, @@ -465,7 +481,7 @@ pub enum AstNodeInner { } impl AstNode { - pub fn parse(input: &[impl AsRef]) -> ExprResult { + pub fn parse(input: &[impl AsRef]) -> ExprResult { Parser::new(input).parse() } @@ -492,7 +508,7 @@ impl AstNode { result_stack.insert(node.id, Ok(value.clone())); } AstNodeInner::Leaf { value, .. } => { - result_stack.insert(node.id, Ok(value.to_string().into())); + result_stack.insert(node.id, Ok(value.to_owned().into())); } AstNodeInner::BinOp { op_type, @@ -529,7 +545,7 @@ impl AstNode { continue; }; - let string: String = string?.eval_as_string(); + let string: MaybeNonUtf8String = string?.eval_as_string(); // The GNU docs say: // @@ -550,7 +566,7 @@ impl AstNode { .unwrap_or(0); if let (Some(pos), Some(_)) = (pos.checked_sub(1), length.checked_sub(1)) { - let result = string.chars().skip(pos).take(length).collect::(); + let result = locale_aware_substr(string, pos, length); result_stack.insert(node.id, Ok(result.into())); } else { result_stack.insert(node.id, Ok(String::new().into())); @@ -565,7 +581,7 @@ impl AstNode { continue; }; - let length = string?.eval_as_string().chars().count(); + let length = locale_aware_length(&string?.eval_as_string()); result_stack.insert(node.id, Ok(length.into())); } } @@ -591,17 +607,17 @@ fn get_next_id() -> u32 { }) } -struct Parser<'a, S: AsRef> { 
+struct Parser<'a, S: AsRef> { input: &'a [S], index: usize, } -impl<'a, S: AsRef> Parser<'a, S> { +impl<'a, S: AsRef> Parser<'a, S> { fn new(input: &'a [S]) -> Self { Self { input, index: 0 } } - fn next(&mut self) -> ExprResult<&'a str> { + fn next(&mut self) -> ExprResult<&'a MaybeNonUtf8Str> { let next = self.input.get(self.index); if let Some(next) = next { self.index += 1; @@ -610,12 +626,12 @@ impl<'a, S: AsRef> Parser<'a, S> { // The indexing won't panic, because we know that the input size // is greater than zero. Err(ExprError::MissingArgument( - self.input[self.index - 1].as_ref().into(), + String::from_utf8_lossy(self.input[self.index - 1].as_ref()).into_owned(), )) } } - fn accept(&mut self, f: impl Fn(&str) -> Option) -> Option { + fn accept(&mut self, f: impl Fn(&MaybeNonUtf8Str) -> Option) -> Option { let next = self.input.get(self.index)?; let tok = f(next.as_ref()); if let Some(tok) = tok { @@ -632,7 +648,9 @@ impl<'a, S: AsRef> Parser<'a, S> { } let res = self.parse_expression()?; if let Some(arg) = self.input.get(self.index) { - return Err(ExprError::UnexpectedArgument(arg.as_ref().into())); + return Err(ExprError::UnexpectedArgument( + String::from_utf8_lossy(arg.as_ref()).into_owned(), + )); } Ok(res) } @@ -675,7 +693,7 @@ impl<'a, S: AsRef> Parser<'a, S> { fn parse_simple_expression(&mut self) -> ExprResult { let first = self.next()?; let inner = match first { - "match" => { + b"match" => { let left = self.parse_simple_expression()?; let right = self.parse_simple_expression()?; AstNodeInner::BinOp { @@ -684,7 +702,7 @@ impl<'a, S: AsRef> Parser<'a, S> { right: Box::new(right), } } - "substr" => { + b"substr" => { let string = self.parse_simple_expression()?; let pos = self.parse_simple_expression()?; let length = self.parse_simple_expression()?; @@ -694,7 +712,7 @@ impl<'a, S: AsRef> Parser<'a, S> { length: Box::new(length), } } - "index" => { + b"index" => { let left = self.parse_simple_expression()?; let right = 
self.parse_simple_expression()?; AstNodeInner::BinOp { @@ -703,32 +721,32 @@ impl<'a, S: AsRef> Parser<'a, S> { right: Box::new(right), } } - "length" => { + b"length" => { let string = self.parse_simple_expression()?; AstNodeInner::Length { string: Box::new(string), } } - "+" => AstNodeInner::Leaf { + b"+" => AstNodeInner::Leaf { value: self.next()?.into(), }, - "(" => { + b"(" => { // Evaluate the node just after parsing to we detect arithmetic // errors before checking for the closing parenthesis. let s = self.parse_expression()?.evaluated()?; match self.next() { - Ok(")") => {} + Ok(b")") => {} // Since we have parsed at least a '(', there will be a token // at `self.index - 1`. So this indexing won't panic. Ok(_) => { return Err(ExprError::ExpectedClosingBraceInsteadOf( - self.input[self.index - 1].as_ref().into(), + String::from_utf8_lossy(self.input[self.index - 1].as_ref()).into(), )); } Err(ExprError::MissingArgument(_)) => { return Err(ExprError::ExpectedClosingBraceAfter( - self.input[self.index - 1].as_ref().into(), + String::from_utf8_lossy(self.input[self.index - 1].as_ref()).into(), )); } Err(e) => return Err(e), @@ -752,11 +770,11 @@ pub fn is_truthy(s: &NumOrStr) -> bool { NumOrStr::Num(num) => num != &BigInt::from(0), NumOrStr::Str(str) => { // Edge case: `-` followed by nothing is truthy - if str == "-" { + if str == b"-" { return true; } - let mut bytes = str.bytes(); + let mut bytes = str.iter().copied(); // Empty string is falsy let Some(first) = bytes.next() else { @@ -922,7 +940,7 @@ mod test { .unwrap() .eval() .unwrap(); - assert_eq!(result.eval_as_string(), ""); + assert_eq!(result.eval_as_string(), b""); } #[test] @@ -931,13 +949,13 @@ mod test { .unwrap() .eval() .unwrap(); - assert_eq!(result.eval_as_string(), "0"); + assert_eq!(result.eval_as_string(), b"0"); let result = AstNode::parse(&["*cats", ":", r"*cats"]) .unwrap() .eval() .unwrap(); - assert_eq!(result.eval_as_string(), "5"); + assert_eq!(result.eval_as_string(), b"5"); } 
#[test] @@ -946,7 +964,7 @@ mod test { .unwrap() .eval() .unwrap(); - assert_eq!(result.eval_as_string(), "0"); + assert_eq!(result.eval_as_string(), b"0"); } #[test] diff --git a/src/uucore/Cargo.toml b/src/uucore/Cargo.toml index 6ff74992c48..9a37f22faf8 100644 --- a/src/uucore/Cargo.toml +++ b/src/uucore/Cargo.toml @@ -27,10 +27,6 @@ dns-lookup = { workspace = true, optional = true } dunce = { version = "1.0.4", optional = true } wild = "2.2.1" glob = { workspace = true, optional = true } -icu_collator = { workspace = true, optional = true, features = [ - "compiled_data", -] } -icu_locale = { workspace = true, optional = true, features = ["compiled_data"] } itertools = { workspace = true, optional = true } time = { workspace = true, optional = true, features = [ "formatting", @@ -59,6 +55,16 @@ bigdecimal = { workspace = true, optional = true } num-traits = { workspace = true, optional = true } selinux = { workspace = true, optional = true } +# icu stuff +icu_collator = { workspace = true, optional = true, features = [ + "compiled_data", +] } +icu_decimal = { workspace = true, optional = true, features = [ + "compiled_data", +] } +icu_locale = { workspace = true, optional = true, features = ["compiled_data"] } +icu_provider = { workspace = true, optional = true } + # Fluent dependencies fluent = { workspace = true } fluent-syntax = { workspace = true } @@ -108,7 +114,10 @@ format = [ "num-traits", "quoting-style", ] -i18n = ["icu_collator", "icu_locale"] +i18n-all = ["i18n-collator", "i18n-decimal"] +i18n-common = ["icu_locale"] +i18n-collator = ["i18n-common", "icu_collator"] +i18n-decimal = ["i18n-common", "icu_decimal", "icu_provider"] mode = ["libc"] perms = ["entries", "libc", "walkdir"] buf-copy = [] @@ -116,7 +125,7 @@ parser = ["extendedbigdecimal", "glob", "num-traits"] pipes = [] process = ["libc"] proc-info = ["tty", "walkdir"] -quoting-style = ["i18n"] +quoting-style = ["i18n-common"] ranges = [] ringbuffer = [] selinux = ["dep:selinux"] diff --git 
a/src/uucore/src/lib/features.rs b/src/uucore/src/lib/features.rs index fcc97b0f00a..3a622cd6857 100644 --- a/src/uucore/src/lib/features.rs +++ b/src/uucore/src/lib/features.rs @@ -26,7 +26,7 @@ pub mod format; pub mod fs; #[cfg(feature = "fsext")] pub mod fsext; -#[cfg(feature = "i18n")] +#[cfg(feature = "i18n-common")] pub mod i18n; #[cfg(feature = "lines")] pub mod lines; diff --git a/src/uucore/src/lib/features/i18n/collator.rs b/src/uucore/src/lib/features/i18n/collator.rs new file mode 100644 index 00000000000..fda8cd6e093 --- /dev/null +++ b/src/uucore/src/lib/features/i18n/collator.rs @@ -0,0 +1,44 @@ +// This file is part of the uutils coreutils package. +// +// For the full copyright and license information, please view the LICENSE +// file that was distributed with this source code. + +use std::{cmp::Ordering, sync::OnceLock}; + +use icu_collator::{self, CollatorBorrowed}; + +use crate::i18n::{DEFAULT_LOCALE, get_collating_locale}; + +pub use icu_collator::options::{ + AlternateHandling, CaseLevel, CollatorOptions, MaxVariable, Strength, +}; + +static COLLATOR: OnceLock = OnceLock::new(); + +/// Will initialize the collator if not already initialized. +/// returns `true` if initialization happened +pub fn try_init_collator(opts: CollatorOptions) -> bool { + COLLATOR + .set(CollatorBorrowed::try_new(get_collating_locale().0.clone().into(), opts).unwrap()) + .is_ok() +} + +/// Will initialize the collator and panic if already initialized. +pub fn init_collator(opts: CollatorOptions) { + COLLATOR + .set(CollatorBorrowed::try_new(get_collating_locale().0.clone().into(), opts).unwrap()) + .expect("Collator already initialized"); +} + +/// Compare both strings with regard to the current locale. 
+pub fn locale_cmp(left: &[u8], right: &[u8]) -> Ordering { + // If the detected locale is 'C', just do byte-wise comparison + if get_collating_locale().0 == DEFAULT_LOCALE { + left.cmp(right) + } else { + COLLATOR + .get() + .expect("Collator was not initialized") + .compare_utf8(left, right) + } +} diff --git a/src/uucore/src/lib/features/i18n/decimal.rs b/src/uucore/src/lib/features/i18n/decimal.rs new file mode 100644 index 00000000000..9fa2d8d7bc7 --- /dev/null +++ b/src/uucore/src/lib/features/i18n/decimal.rs @@ -0,0 +1,51 @@ +// This file is part of the uutils coreutils package. +// +// For the full copyright and license information, please view the LICENSE +// file that was distributed with this source code. + +use std::sync::OnceLock; + +use icu_decimal::provider::DecimalSymbolsV1; +use icu_locale::Locale; +use icu_provider::prelude::*; + +use crate::i18n::get_numeric_locale; + +/// Return the decimal separator for the given locale +fn get_decimal_separator(loc: Locale) -> String { + let data_locale = DataLocale::from(loc); + + let request = DataRequest { + id: DataIdentifierBorrowed::for_locale(&data_locale), + metadata: DataRequestMetadata::default(), + }; + + let response: DataResponse = + icu_decimal::provider::Baked.load(request).unwrap(); + + response.payload.get().decimal_separator().to_string() +} + +/// Return the decimal separator from the language we're working with. +/// Example: +/// Say we need to format 1000.5 +/// en_US: 1,000.5 -> decimal separator is '.' 
+/// fr_FR: 1 000,5 -> decimal separator is ',' +pub fn locale_decimal_separator() -> &'static str { + static DECIMAL_SEP: OnceLock = OnceLock::new(); + + DECIMAL_SEP.get_or_init(|| get_decimal_separator(get_numeric_locale().0.clone())) +} + +#[cfg(test)] +mod tests { + use icu_locale::locale; + + use super::get_decimal_separator; + + #[test] + fn test_simple_separator() { + assert_eq!(get_decimal_separator(locale!("en")), "."); + assert_eq!(get_decimal_separator(locale!("fr")), ","); + } +} diff --git a/src/uucore/src/lib/features/i18n/mod.rs b/src/uucore/src/lib/features/i18n/mod.rs index 5a7cf8ea3f6..c42d41c7ea1 100644 --- a/src/uucore/src/lib/features/i18n/mod.rs +++ b/src/uucore/src/lib/features/i18n/mod.rs @@ -1,7 +1,17 @@ +// This file is part of the uutils coreutils package. +// +// For the full copyright and license information, please view the LICENSE +// file that was distributed with this source code. + use std::sync::OnceLock; use icu_locale::{Locale, locale}; +#[cfg(feature = "i18n-collator")] +pub mod collator; +#[cfg(feature = "i18n-decimal")] +pub mod decimal; + /// The encoding specified by the locale, if specified /// Currently only supports ASCII and UTF-8 for the sake of simplicity. #[derive(Debug, PartialEq, Eq, Clone, Copy)] @@ -12,48 +22,59 @@ pub enum UEncoding { const DEFAULT_LOCALE: Locale = locale!("en-US-posix"); -/// Deduce the locale from the current environment -fn get_collating_locale() -> &'static (Locale, UEncoding) { - static COLLATING_LOCALE: OnceLock<(Locale, UEncoding)> = OnceLock::new(); +/// Look at 3 environment variables in the following order +/// +/// 1. LC_ALL +/// 2. `locale_name` +/// 3. LANG +/// +/// Or fallback on Posix locale, with ASCII encoding. 
+fn get_locale_from_env(locale_name: &str) -> (Locale, UEncoding) { + let locale_var = ["LC_ALL", locale_name, "LANG"] + .iter() + .find_map(|&key| std::env::var(key).ok()); - COLLATING_LOCALE.get_or_init(|| { - // Look at 3 environment variables in the following order - // - // 1. LC_ALL - // 2. LC_COLLATE - // 3. LANG - // - // Or fallback on Posix locale, with ASCII encoding. - - let locale_var = std::env::var("LC_ALL") - .or_else(|_| std::env::var("LC_COLLATE")) - .or_else(|_| std::env::var("LANG")); - - if let Ok(locale_var_str) = locale_var { - let mut split = locale_var_str.split(&['.', '@']); - - if let Some(simple) = split.next() { - let bcp47 = simple.replace("_", "-"); - let locale = Locale::try_from_str(&bcp47).unwrap_or(DEFAULT_LOCALE); - - // If locale parsing failed, parse the encoding part of the - // locale. Treat the special case of the given locale being "C" - // which becomes the default locale. - let encoding = if (locale != DEFAULT_LOCALE || bcp47 == "C") - && split.next() == Some("UTF-8") - { - UEncoding::Utf8 - } else { - UEncoding::Ascii - }; - return (locale, encoding); + if let Some(locale_var_str) = locale_var { + let mut split = locale_var_str.split(&['.', '@']); + + if let Some(simple) = split.next() { + // Naively convert the locale name to BCP47 tag format. + // + // See https://en.wikipedia.org/wiki/IETF_language_tag + let bcp47 = simple.replace("_", "-"); + let locale = Locale::try_from_str(&bcp47).unwrap_or(DEFAULT_LOCALE); + + // If locale parsing failed, parse the encoding part of the + // locale. Treat the special case of the given locale being "C" + // which becomes the default locale. 
+ let encoding = if (locale != DEFAULT_LOCALE || bcp47 == "C") + && split + .next() + .is_some_and(|enc| enc.to_lowercase() == "utf-8") + { + UEncoding::Utf8 } else { - return (DEFAULT_LOCALE, UEncoding::Ascii); + UEncoding::Ascii }; + return (locale, encoding); } - // Default POSIX locale representing LC_ALL=C - (DEFAULT_LOCALE, UEncoding::Ascii) - }) + } + // Default POSIX locale representing LC_ALL=C + (DEFAULT_LOCALE, UEncoding::Ascii) +} + +/// Get the collating locale from the environment +fn get_collating_locale() -> &'static (Locale, UEncoding) { + static COLLATING_LOCALE: OnceLock<(Locale, UEncoding)> = OnceLock::new(); + + COLLATING_LOCALE.get_or_init(|| get_locale_from_env("LC_COLLATE")) +} + +/// Get the numeric locale from the environment +pub fn get_numeric_locale() -> &'static (Locale, UEncoding) { + static NUMERIC_LOCALE: OnceLock<(Locale, UEncoding)> = OnceLock::new(); + + NUMERIC_LOCALE.get_or_init(|| get_locale_from_env("LC_NUMERIC")) } /// Return the encoding deduced from the locale environment variable. diff --git a/src/uucore/src/lib/lib.rs b/src/uucore/src/lib/lib.rs index 6a137b78728..8059dac9355 100644 --- a/src/uucore/src/lib/lib.rs +++ b/src/uucore/src/lib/lib.rs @@ -51,7 +51,7 @@ pub use crate::features::fast_inc; pub use crate::features::format; #[cfg(feature = "fs")] pub use crate::features::fs; -#[cfg(feature = "i18n")] +#[cfg(feature = "i18n-common")] pub use crate::features::i18n; #[cfg(feature = "lines")] pub use crate::features::lines; @@ -124,6 +124,7 @@ use std::iter; #[cfg(unix)] use std::os::unix::ffi::{OsStrExt, OsStringExt}; use std::str; +use std::str::Utf8Chunk; use std::sync::{LazyLock, atomic::Ordering}; /// Disables the custom signal handlers installed by Rust for stack-overflow handling. With those custom signal handlers processes ignore the first SIGBUS and SIGSEGV signal they receive. 
@@ -377,6 +378,24 @@ pub fn os_string_from_vec(vec: Vec) -> mods::error::UResult { Ok(s) } +/// Converts an `OsString` into a `Vec`, parsing as UTF-8 on non-unix platforms. +/// +/// This always succeeds on unix platforms, +/// and fails on other platforms if the bytes can't be parsed as UTF-8. +pub fn os_string_to_vec(s: OsString) -> mods::error::UResult> { + #[cfg(unix)] + let v = s.into_vec(); + #[cfg(not(unix))] + let v = s + .into_string() + .map_err(|_| { + mods::error::UUsageError::new(1, "invalid UTF-8 was detected in one or more arguments") + })? + .into(); + + Ok(v) +} + /// Equivalent to `std::BufRead::lines` which outputs each line as a `Vec`, /// which avoids panicking on non UTF-8 input. pub fn read_byte_lines( @@ -443,6 +462,91 @@ macro_rules! prompt_yes( }) ); +/// Represent either a character or a byte. +/// Used to iterate on partially valid UTF-8 data +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum CharByte { + Char(char), + Byte(u8), +} + +impl From for CharByte { + fn from(value: char) -> Self { + CharByte::Char(value) + } +} + +impl From for CharByte { + fn from(value: u8) -> Self { + CharByte::Byte(value) + } +} + +impl From<&u8> for CharByte { + fn from(value: &u8) -> Self { + CharByte::Byte(*value) + } +} + +struct Utf8ChunkIterator<'a> { + iter: Box + 'a>, +} + +impl Iterator for Utf8ChunkIterator<'_> { + type Item = CharByte; + + fn next(&mut self) -> Option { + self.iter.next() + } +} + +impl<'a> From> for Utf8ChunkIterator<'a> { + fn from(chk: Utf8Chunk<'a>) -> Utf8ChunkIterator<'a> { + Self { + iter: Box::new( + chk.valid() + .chars() + .map(CharByte::from) + .chain(chk.invalid().iter().map(CharByte::from)), + ), + } + } +} + +/// Iterates on the valid and invalid parts of a byte sequence with regard to +/// the UTF-8 encoding. +pub struct CharByteIterator<'a> { + iter: Box + 'a>, +} + +impl<'a> CharByteIterator<'a> { + /// Make a `CharByteIterator` from a byte slice. 
+ /// [`CharByteIterator`] + pub fn new(input: &'a [u8]) -> CharByteIterator<'a> { + Self { + iter: Box::new(input.utf8_chunks().flat_map(Utf8ChunkIterator::from)), + } + } +} + +impl Iterator for CharByteIterator<'_> { + type Item = CharByte; + + fn next(&mut self) -> Option { + self.iter.next() + } +} + +pub trait IntoCharByteIterator<'a> { + fn iter_char_bytes(self) -> CharByteIterator<'a>; +} + +impl<'a> IntoCharByteIterator<'a> for &'a [u8] { + fn iter_char_bytes(self) -> CharByteIterator<'a> { + CharByteIterator::new(self) + } +} + #[cfg(test)] mod tests { use super::*; diff --git a/tests/by-util/test_expr.rs b/tests/by-util/test_expr.rs index 55294dedaef..729b9129019 100644 --- a/tests/by-util/test_expr.rs +++ b/tests/by-util/test_expr.rs @@ -5,7 +5,7 @@ // spell-checker:ignore αbcdef ; (people) kkos // spell-checker:ignore aabcccd aabcd aabd abbb abbbd abbcabc abbcac abbcbbbd abbcbd // spell-checker:ignore abbccd abcabc abcac acabc andand bigcmp bignum emptysub -// spell-checker:ignore orempty oror +// spell-checker:ignore orempty oror bcdef fedcb use uutests::new_ucmd; @@ -207,43 +207,6 @@ fn test_and() { new_ucmd!().args(&["", "&", ""]).fails().stdout_only("0\n"); } -#[test] -fn test_index() { - new_ucmd!() - .args(&["index", "αbcdef", "x"]) - .fails_with_code(1) - .stdout_only("0\n"); - new_ucmd!() - .args(&["index", "αbcdef", "α"]) - .succeeds() - .stdout_only("1\n"); - new_ucmd!() - .args(&["index", "αbc_δef", "δ"]) - .succeeds() - .stdout_only("5\n"); - new_ucmd!() - .args(&["index", "αbc_δef", "δf"]) - .succeeds() - .stdout_only("5\n"); - new_ucmd!() - .args(&["index", "αbcdef", "fb"]) - .succeeds() - .stdout_only("2\n"); - new_ucmd!() - .args(&["index", "αbcdef", "f"]) - .succeeds() - .stdout_only("6\n"); - new_ucmd!() - .args(&["index", "αbcdef_f", "f"]) - .succeeds() - .stdout_only("6\n"); - - new_ucmd!() - .args(&["αbcdef", "index", "α"]) - .fails_with_code(2) - .stderr_only("expr: syntax error: unexpected argument 'index'\n"); -} - #[test] fn 
test_length_fail() { new_ucmd!().args(&["length", "αbcdef", "1"]).fails(); @@ -262,14 +225,6 @@ fn test_length() { .stderr_only("expr: syntax error: unexpected argument 'length'\n"); } -#[test] -fn test_length_mb() { - new_ucmd!() - .args(&["length", "αbcdef"]) - .succeeds() - .stdout_only("6\n"); -} - #[test] fn test_regex_empty() { new_ucmd!().args(&["", ":", ""]).fails().stdout_only("0\n"); @@ -1504,3 +1459,471 @@ mod gnu_expr { .stderr_contains("syntax error: expecting ')' instead of 'a'"); } } + +/// Test that `expr` correctly detects and handles locales +mod locale_aware { + use uutests::new_ucmd; + + #[test] + fn test_expr_collating() { + for (loc, code, output) in [ + ("C", 0, "1\n"), + ("fr_FR.UTF-8", 1, "0\n"), + ("fr_FR.utf-8", 1, "0\n"), + ("en_US", 1, "0\n"), + ] { + new_ucmd!() + .args(&["50n", ">", "-51"]) + .env("LC_ALL", loc) + .run() + .code_is(code) + .stdout_only(output); + } + } +} + +/// This module reimplements the expr-multibyte.pl test +#[cfg(target_os = "linux")] +mod gnu_expr_multibyte { + use uutests::new_ucmd; + + use uucore::os_str_from_bytes; + + trait AsByteSlice<'a> { + fn into_bytes(self) -> &'a [u8]; + } + + impl<'a> AsByteSlice<'a> for &'a str { + fn into_bytes(self) -> &'a [u8] { + self.as_bytes() + } + } + + impl<'a> AsByteSlice<'a> for &'a [u8] { + fn into_bytes(self) -> &'a [u8] { + self + } + } + + impl<'a, const N: usize> AsByteSlice<'a> for &'a [u8; N] { + fn into_bytes(self) -> &'a [u8] { + self + } + } + + const EXPRESSION: &[u8] = + "\u{1F14}\u{03BA}\u{03C6}\u{03C1}\u{03B1}\u{03C3}\u{03B9}\u{03C2}".as_bytes(); + + #[derive(Debug, Default, Clone, Copy)] + struct TestCase { + pub locale: &'static str, + pub out: Option<&'static [u8]>, + pub code: i32, + } + + impl TestCase { + const FR: Self = Self::new("fr_FR.UTF-8"); + const C: Self = Self::new("C"); + + const fn new(locale: &'static str) -> Self { + Self { + locale, + out: None, + code: 0, + } + } + + fn out(mut self, out: impl AsByteSlice<'static>) -> Self { + 
self.out = Some(out.into_bytes()); + self + } + + fn code(mut self, code: i32) -> Self { + self.code = code; + self + } + } + + fn check_test_case(args: &[&[u8]], tc: &TestCase) { + let args = args + .iter() + .map(|arg: &&[u8]| os_str_from_bytes(arg).unwrap()) + .collect::>(); + + let res = new_ucmd!().env("LC_ALL", tc.locale).args(&args).run(); + + res.code_is(tc.code); + + if let Some(out) = tc.out { + let mut out = out.to_owned(); + out.push(b'\n'); + res.stdout_is_bytes(&out); + } else { + res.no_stdout(); + } + } + + // LENGTH EXPRESSIONS + + // sanity check + #[test] + fn test_l1() { + let args: &[&[u8]] = &[b"length", b"abcdef"]; + + let cases = &[TestCase::FR.out("6"), TestCase::C.out("6")]; + + for tc in cases { + check_test_case(args, tc); + } + } + + // A single multibyte character at the beginning of the string; \xCE\xB1 is + // UTF-8 for "U+03B1 GREEK SMALL LETTER ALPHA" + #[test] + fn test_l2() { + let args: &[&[u8]] = &[b"length", b"\xCE\xB1bcdef"]; + + let cases = &[TestCase::FR.out("6"), TestCase::C.out("7")]; + + for tc in cases { + check_test_case(args, tc); + } + } + + // A single multibyte character in the middle of the string; \xCE\xB4 is + // UTF-8 for "U+03B4 GREEK SMALL LETTER DELTA" + #[test] + fn test_l3() { + let args: &[&[u8]] = &[b"length", b"abc\xCE\xB4ef"]; + + let cases = &[TestCase::FR.out("6"), TestCase::C.out("7")]; + + for tc in cases { + check_test_case(args, tc); + } + } + + // A single multibyte character at the end of the string + #[test] + fn test_l4() { + let args: &[&[u8]] = &[b"length", b"fedcb\xCE\xB1"]; + + let cases = &[TestCase::FR.out("6"), TestCase::C.out("7")]; + + for tc in cases { + check_test_case(args, tc); + } + } + + // An invalid multibyte sequence + #[test] + fn test_l5() { + let args: &[&[u8]] = &[b"length", b"\xB1aaa"]; + + let cases = &[TestCase::FR.out("4"), TestCase::C.out("4")]; + + for tc in cases { + check_test_case(args, tc); + } + } + + // An incomplete multibyte sequence at the end of the string + 
#[test] + fn test_l6() { + let args: &[&[u8]] = &[b"length", b"aaa\xCE"]; + + let cases = &[TestCase::FR.out("4"), TestCase::C.out("4")]; + + for tc in cases { + check_test_case(args, tc); + } + } + + // A string made only of valid multibyte characters (see EXPRESSION) + #[test] + fn test_l7() { + let args: &[&[u8]] = &[b"length", EXPRESSION]; + + let cases = &[TestCase::FR.out("8"), TestCase::C.out("17")]; + + for tc in cases { + check_test_case(args, tc); + } + } + + // INDEX EXPRESSIONS + + // sanity check + #[test] + fn test_i1() { + let args: &[&[u8]] = &[b"index", b"abcdef", b"fb"]; + + let cases = &[TestCase::FR.out("2"), TestCase::C.out("2")]; + + for tc in cases { + check_test_case(args, tc); + } + } + + // Search for a single-octet character + #[test] + fn test_i2() { + let args: &[&[u8]] = &[b"index", b"\xCE\xB1bc\xCE\xB4ef", b"b"]; + + let cases = &[TestCase::FR.out("2"), TestCase::C.out("3")]; + + for tc in cases { + check_test_case(args, tc); + } + } + + #[test] + fn test_i3() { + let args: &[&[u8]] = &[b"index", b"\xCE\xB1bc\xCE\xB4ef", b"f"]; + + let cases = &[TestCase::FR.out("6"), TestCase::C.out("8")]; + + for tc in cases { + check_test_case(args, tc); + } + } + + // Search for multibyte character. + // In the C locale, the search string is treated as two octets. + // The first of them (\xCE) matches the first octet of the input string. + #[test] + fn test_i4() { + let args: &[&[u8]] = &[b"index", b"\xCE\xB1bc\xCE\xB4ef", b"\xCE\xB4"]; + + let cases = &[TestCase::FR.out("4"), TestCase::C.out("1")]; + + for tc in cases { + check_test_case(args, tc); + } + } + + // Invalid multibyte sequence in the input string, treated as a single + // octet. + #[test] + fn test_i5() { + let args: &[&[u8]] = &[b"index", b"\xCEbc\xCE\xB4ef", b"\xCE\xB4"]; + + let cases = &[TestCase::FR.out("4"), TestCase::C.out("1")]; + + for tc in cases { + check_test_case(args, tc); + } + } + + // Invalid multibyte sequence in the search string, treated as a single + // octet. 
In multibyte locale, there should be no match, expr returns and + // prints zero, and terminates with exit-code 1 (as per POSIX). + #[test] + fn test_i6() { + let args: &[&[u8]] = &[b"index", b"\xCE\xB1bc\xCE\xB4ef", b"\xB4"]; + + let cases = &[TestCase::FR.out("0").code(1), TestCase::C.out("6")]; + + for tc in cases { + check_test_case(args, tc); + } + } + + // Edge-case: invalid multibyte sequence BOTH in the input string and in + // the search string: expr should find a match. + #[test] + fn test_i7() { + let args: &[&[u8]] = &[b"index", b"\xCE\xB1bc\xB4ef", b"\xB4"]; + + let cases = &[TestCase::FR.out("4")]; + + for tc in cases { + check_test_case(args, tc); + } + } + + // SUBSTR EXPRESSIONS + + // sanity check + #[test] + fn test_s1() { + let args: &[&[u8]] = &[b"substr", b"abcdef", b"2", b"3"]; + + let cases = &[TestCase::FR.out("bcd"), TestCase::C.out("bcd")]; + + for tc in cases { + check_test_case(args, tc); + } + } + + #[test] + fn test_s2() { + let args: &[&[u8]] = &[b"substr", b"\xCE\xB1bc\xCE\xB4ef", b"1", b"1"]; + + let cases = &[TestCase::FR.out(b"\xCE\xB1"), TestCase::C.out(b"\xCE")]; + + for tc in cases { + check_test_case(args, tc); + } + } + + #[test] + fn test_s3() { + let args: &[&[u8]] = &[b"substr", b"\xCE\xB1bc\xCE\xB4ef", b"3", b"2"]; + + let cases = &[TestCase::FR.out(b"c\xCE\xB4"), TestCase::C.out("bc")]; + + for tc in cases { + check_test_case(args, tc); + } + } + + #[test] + fn test_s4() { + let args: &[&[u8]] = &[b"substr", b"\xCE\xB1bc\xCE\xB4ef", b"4", b"1"]; + + let cases = &[TestCase::FR.out(b"\xCE\xB4"), TestCase::C.out("c")]; + + for tc in cases { + check_test_case(args, tc); + } + } + + #[test] + fn test_s5() { + let args: &[&[u8]] = &[b"substr", b"\xCE\xB1bc\xCE\xB4ef", b"4", b"2"]; + + let cases = &[TestCase::FR.out(b"\xCE\xB4e"), TestCase::C.out(b"c\xCE")]; + + for tc in cases { + check_test_case(args, tc); + } + } + + #[test] + fn test_s6() { + let args: &[&[u8]] = &[b"substr", b"\xCE\xB1bc\xCE\xB4ef", b"6", b"1"]; + + let 
cases = &[TestCase::FR.out(b"f"), TestCase::C.out(b"\xB4")]; + + for tc in cases { + check_test_case(args, tc); + } + } + + #[test] + fn test_s7() { + let args: &[&[u8]] = &[b"substr", b"\xCE\xB1bc\xCE\xB4ef", b"7", b"1"]; + + let cases = &[TestCase::FR.out("").code(1), TestCase::C.out(b"e")]; + + for tc in cases { + check_test_case(args, tc); + } + } + + #[test] + fn test_s8() { + let args: &[&[u8]] = &[b"substr", b"\xCE\xB1bc\xB4ef", b"3", b"3"]; + + let cases = &[TestCase::FR.out(b"c\xB4e"), TestCase::C.out(b"bc\xB4")]; + + for tc in cases { + check_test_case(args, tc); + } + } + + // MATCH EXPRESSIONS + + // sanity check + #[test] + fn test_m1() { + let args: &[&[u8]] = &[b"match", b"abcdef", b"ab"]; + + let cases = &[TestCase::FR.out("2"), TestCase::C.out("2")]; + + for tc in cases { + check_test_case(args, tc); + } + } + #[test] + fn test_m2() { + let args: &[&[u8]] = &[b"match", b"abcdef", b"\\(ab\\)"]; + + let cases = &[TestCase::FR.out("ab"), TestCase::C.out("ab")]; + + for tc in cases { + check_test_case(args, tc); + } + } + + // The regex engine should match the '.' to the first multibyte character. + #[test] + #[ignore = "not implemented"] + fn test_m3() { + let args: &[&[u8]] = &[b"match", b"\xCE\xB1bc\xCE\xB4ef", b".bc"]; + + let cases = &[TestCase::FR.out("3"), TestCase::C.code(1)]; + + for tc in cases { + check_test_case(args, tc); + } + } + + // The opposite of the previous test: two dots should only match the two + // octets in single-byte locale. 
+ #[test] + #[ignore = "not implemented"] + fn test_m4() { + let args: &[&[u8]] = &[b"match", b"\xCE\xB1bc\xCE\xB4ef", b"..bc"]; + + let cases = &[TestCase::FR.out("0").code(1), TestCase::C.out("4")]; + + for tc in cases { + check_test_case(args, tc); + } + } + + // Match with grouping - a single dot should return the two octets + #[test] + #[ignore = "not implemented"] + fn test_m5() { + let args: &[&[u8]] = &[b"match", b"\xCE\xB1bc\xCE\xB4ef", b"\\(.b\\)c"]; + + let cases = &[TestCase::FR.out(b"\xCE\xB1b"), TestCase::C.code(1)]; + + for tc in cases { + check_test_case(args, tc); + } + } + + // Invalid multibyte sequences - regex should not match in multibyte locale + // (POSIX requirement) + #[test] + #[ignore = "not implemented"] + fn test_m6() { + let args: &[&[u8]] = &[b"match", b"\xCEbc\xCE\xB4ef", b"\\(.\\)"]; + + let cases = &[TestCase::FR.code(1), TestCase::C.out(b"\xCE")]; + + for tc in cases { + check_test_case(args, tc); + } + } + + // Character classes: in the multibyte case, the regex engine understands + // there is a single multibyte character in the brackets. + // In the single byte case, the regex engine sees two octets in the + // character class ('\xCE' and '\xB1') - and it matches the first one. + #[test] + #[ignore = "not implemented"] + fn test_m7() { + let args: &[&[u8]] = &[b"match", b"\xCE\xB1bc\xCE\xB4ef", b"\\(.\\)"]; + + let cases = &[TestCase::FR.out(b"\xCE\xB1"), TestCase::C.out(b"\xCE")]; + + for tc in cases { + check_test_case(args, tc); + } + } +}