uutils · cakebaker · Jul 9, 2025 · Jun 30, 2025 · Jun 30, 2025 · Jul 3, 2025
diff --git a/Cargo.lock b/Cargo.lock
diff --git a/Cargo.toml b/Cargo.toml
@@ -314,7 +314,9 @@ glob = "0.3.1"
 half = "2.4.1"
 hostname = "0.4"
 icu_collator = "2.0.0"
+icu_decimal = "2.0.0"
 icu_locale = "2.0.0"
+icu_provider = "2.0.0"
 indicatif = "0.18.0"
 itertools = "0.14.0"
 jiff = { version = "0.2.10", default-features = false, features = [

diff --git a/src/uu/expr/Cargo.toml b/src/uu/expr/Cargo.toml
@@ -22,7 +22,7 @@ clap = { workspace = true }
 num-bigint = { workspace = true }
 num-traits = { workspace = true }
 onig = { workspace = true }
-uucore = { workspace = true }
+uucore = { workspace = true, features = ["i18n-collator"] }
 thiserror = { workspace = true }
 
 [[bin]]

diff --git a/src/uu/expr/locales/en-US.ftl b/src/uu/expr/locales/en-US.ftl
@@ -63,3 +63,4 @@ expr-error-unmatched-opening-brace = Unmatched {"\\{"}
 expr-error-invalid-bracket-content = Invalid content of {"\\{\\}"}
 expr-error-trailing-backslash = Trailing backslash
 expr-error-too-big-range-quantifier-index = Regular expression too big
+expr-error-match-utf8 = match does not support invalid UTF-8 encoding in { $arg }
diff --git a/src/uu/expr/locales/fr-FR.ftl b/src/uu/expr/locales/fr-FR.ftl
@@ -63,3 +63,4 @@ expr-error-unmatched-opening-brace = Accolade ouvrante {"\\{"} non appariée
 expr-error-invalid-bracket-content = Contenu invalide de {"\\{\\}"}
 expr-error-trailing-backslash = Barre oblique inverse en fin
 expr-error-too-big-range-quantifier-index = Expression régulière trop grande
+expr-error-match-utf8 = match ne supporte pas l'encodage UTF-8 invalide dans { $arg }
diff --git a/src/uu/expr/src/expr.rs b/src/uu/expr/src/expr.rs
@@ -5,15 +5,18 @@
 
 use clap::{Arg, ArgAction, Command};
 use std::collections::HashMap;
+use std::io::Write;
 use syntax_tree::{AstNode, is_truthy};
 use thiserror::Error;
 use uucore::locale::{get_message, get_message_with_args};
+use uucore::os_string_to_vec;
 use uucore::{
     display::Quotable,
     error::{UError, UResult},
     format_usage,
 };
 
+mod locale_aware;
 mod syntax_tree;
 
 mod options {
@@ -54,6 +57,8 @@ pub enum ExprError {
     TrailingBackslash,
     #[error("{}", get_message("expr-error-too-big-range-quantifier-index"))]
     TooBigRangeQuantifierIndex,
+    #[error("{}", get_message_with_args("expr-error-match-utf8", HashMap::from([("arg".to_string(), _0.quote().to_string())])))]
+    UnsupportedNonUtf8Match(String),
 }
 
 impl UError for ExprError {
@@ -98,25 +103,27 @@ pub fn uu_app() -> Command {
 pub fn uumain(args: impl uucore::Args) -> UResult<()> {
     // For expr utility we do not want getopts.
     // The following usage should work without escaping hyphens: `expr -15 = 1 + 2 \* \( 3 - -4 \)`
-    let args: Vec<String> = args
+    let args = args
         .skip(1) // Skip binary name
-        .map(|a| a.to_string_lossy().to_string())
-        .collect();
+        .map(os_string_to_vec)
+        .collect::<Result<Vec<_>, _>>()?;
 
-    if args.len() == 1 && args[0] == "--help" {
+    if args.len() == 1 && args[0] == b"--help" {
         let _ = uu_app().print_help();
-    } else if args.len() == 1 && args[0] == "--version" {
+    } else if args.len() == 1 && args[0] == b"--version" {
         println!("{} {}", uucore::util_name(), uucore::crate_version!());
     } else {
         // The first argument may be "--" and should be be ignored.
-        let args = if !args.is_empty() && args[0] == "--" {
+        let args = if !args.is_empty() && args[0] == b"--" {
             &args[1..]
         } else {
             &args
         };
 
-        let res: String = AstNode::parse(args)?.eval()?.eval_as_string();
-        println!("{res}");
+        let res = AstNode::parse(args)?.eval()?.eval_as_string();
+        let _ = std::io::stdout().write_all(&res);
+        let _ = std::io::stdout().write_all(b"\n");
+
         if !is_truthy(&res.into()) {
             return Err(1.into());
         }

diff --git a/src/uu/expr/src/locale_aware.rs b/src/uu/expr/src/locale_aware.rs
@@ -0,0 +1,111 @@
+// This file is part of the uutils coreutils package.
+//
+// For the full copyright and license information, please view the LICENSE
+// file that was distributed with this source code.
+
+use std::cmp::Ordering;
+
+use uucore::{
+    CharByte, IntoCharByteIterator,
+    i18n::{
+        UEncoding,
+        collator::{AlternateHandling, CollatorOptions, locale_cmp, try_init_collator},
+        get_locale_encoding,
+    },
+};
+
+use crate::syntax_tree::{MaybeNonUtf8Str, MaybeNonUtf8String};
+
+/// Compare `a` and `b` with `locale_cmp`, after attempting to initialize the
+/// collator with "shifted" alternate handling.
+pub(crate) fn locale_comparison(a: &MaybeNonUtf8Str, b: &MaybeNonUtf8Str) -> Ordering {
+    let mut collator_options = CollatorOptions::default();
+    collator_options.alternate_handling = Some(AlternateHandling::Shifted);
+    // The outcome of the initialization attempt is deliberately discarded; the
+    // comparison below is performed regardless.
+    let _ = try_init_collator(collator_options);
+    locale_cmp(a, b)
+}
+
+/// Return the 1-based position of the first element of `left` that also
+/// occurs in `right`, or 0 if there is none; `encoding` picks the element unit.
+fn index_with_locale(
+    left: &MaybeNonUtf8Str,
+    right: &MaybeNonUtf8Str,
+    encoding: UEncoding,
+) -> usize {
+    let zero_based = match encoding {
+        // UTF-8: both operands are walked as char-bytes (`iter_char_bytes`), so
+        // the search works on `CharByte` items rather than on raw bytes.
+        UEncoding::Utf8 => left
+            .iter_char_bytes()
+            .enumerate()
+            .find(|(_, hay)| right.iter_char_bytes().any(|needle| needle == *hay))
+            .map(|(idx, _)| idx),
+        // Otherwise the operands are plain byte arrays, searched byte by byte.
+        UEncoding::Ascii => left
+            .iter()
+            .enumerate()
+            .find(|(_, hay)| right.iter().any(|needle| needle == *hay))
+            .map(|(idx, _)| idx),
+    };
+    // Positions are reported 1-based; 0 is reserved for "not found".
+    zero_based.map_or(0, |idx| idx + 1)
+}
+
+/// Run `index_with_locale` using the encoding from `get_locale_encoding`.
+pub(crate) fn locale_aware_index(left: &MaybeNonUtf8Str, right: &MaybeNonUtf8Str) -> usize {
+    let encoding = get_locale_encoding();
+    index_with_locale(left, right, encoding)
+}
+
+/// Length of `input` for the current locale encoding. In a UTF-8 locale the
+/// input is walked as char-bytes, so each decoded character and each stray
+/// byte counts as one unit (as in index/substr); otherwise bytes are counted.
+pub(crate) fn locale_aware_length(input: &MaybeNonUtf8Str) -> usize {
+    match get_locale_encoding() {
+        UEncoding::Utf8 => input.iter_char_bytes().count(),
+        UEncoding::Ascii => input.len(),
+    }
+}
+
+/// Extract up to `len` units of `s` after skipping the first `pos` units,
+/// where a unit is a char-byte or a plain byte depending on `encoding`.
+fn substr_with_locale(
+    s: MaybeNonUtf8String,
+    pos: usize,
+    len: usize,
+    encoding: UEncoding,
+) -> MaybeNonUtf8String {
+    match encoding {
+        UEncoding::Utf8 => {
+            // `len` comes from the caller and may be far larger than the input;
+            // reserving it verbatim can panic or abort on allocation. The byte
+            // length of `s` is a sufficient capacity hint for any window of it.
+            let mut string = MaybeNonUtf8String::with_capacity(len.min(s.len()));
+            let mut buf = [0; 4];
+
+            // Walk the char-bytes in the requested window, copying each
+            // decoded character (re-encoded as UTF-8) or raw byte to the output.
+            for cb in s.iter_char_bytes().skip(pos).take(len) {
+                match cb {
+                    CharByte::Char(c) => string.extend(c.encode_utf8(&mut buf).as_bytes()),
+                    CharByte::Byte(b) => string.push(b),
+                }
+            }
+            string
+        }
+        // Byte-oriented case: the window is taken directly over the bytes.
+        UEncoding::Ascii => s.into_iter().skip(pos).take(len).collect(),
+    }
+}
+
+/// Locale-aware substring: `substr_with_locale` with the current encoding.
+pub(crate) fn locale_aware_substr(
+    s: MaybeNonUtf8String,
+    pos: usize,
+    len: usize,
+) -> MaybeNonUtf8String {
+    let encoding = get_locale_encoding();
+    substr_with_locale(s, pos, len, encoding)
+}