Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
85 changes: 49 additions & 36 deletions src/uu/dd/src/conversion_tables.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,50 +2,63 @@
//
// For the full copyright and license information, please view the LICENSE
// file that was distributed with this source code.
// spell-checker:ignore tolower toupper CTYPE

// Note: Conversion tables are just lookup tables.
// e.g. the ASCII->EBCDIC table stores the EBCDIC code at the index
// obtained by treating the ASCII representation as a number.

/// A byte-to-byte lookup table: the entry at index `b` is the byte that `b`
/// is converted to.
pub type ConversionTable = [u8; 256];

/// Locale-independent table that maps ASCII `A`-`Z` to `a`-`z` and leaves
/// every other byte value unchanged.
pub const ASCII_UCASE_TO_LCASE: ConversionTable = {
    let mut table = [0u8; 256];
    let mut index = 0;
    // `for` loops are not available in const context, so walk the indices by hand.
    while index < table.len() {
        // `index` is always below 256, so the narrowing cast is lossless.
        table[index] = (index as u8).to_ascii_lowercase();
        index += 1;
    }
    table
};
/// Locale-independent table that maps ASCII `a`-`z` to `A`-`Z` and leaves
/// every other byte value unchanged.
pub const ASCII_LCASE_TO_UCASE: ConversionTable = [
    0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
    0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f,
    0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f,
    0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f,
    0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f,
    0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f,
    0x60, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f,
    0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, 0x58, 0x59, 0x5a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f,
    0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f,
    0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f,
    0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7, 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf,
    0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf,
    0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf,
    0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf,
    0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef,
    0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff,
];

/// Builds a lowercase conversion table using the locale-aware `libc::tolower`.
///
/// The `LC_CTYPE` locale is (re)initialized from the environment on every
/// call, then each of the 256 byte values is mapped through `tolower`.
/// The nix crate doesn't provide safe wrappers for locale functions, so libc
/// is used directly.
///
/// If `tolower` returns a value that does not fit in a byte, the original
/// byte is kept instead of a truncated result.
pub fn build_lcase_table() -> ConversionTable {
    // SAFETY: setlocale is called with a valid, NUL-terminated C string. The
    // return value is intentionally ignored: on failure the process simply
    // keeps its current locale.
    unsafe { libc::setlocale(libc::LC_CTYPE, c"".as_ptr()) };

    std::array::from_fn(|byte| {
        // SAFETY: `byte` is an array index in 0..=255, i.e. a value
        // representable as `unsigned char`, which tolower accepts.
        let mapped = unsafe { libc::tolower(byte as libc::c_int) };
        // A plain `as u8` would silently truncate an out-of-range result to
        // an unrelated byte; fall back to the identity mapping instead.
        u8::try_from(mapped).unwrap_or(byte as u8)
    })
}

/// Builds an uppercase conversion table using the locale-aware `libc::toupper`.
///
/// The `LC_CTYPE` locale is (re)initialized from the environment on every
/// call, then each of the 256 byte values is mapped through `toupper`.
/// The nix crate doesn't provide safe wrappers for locale functions, so libc
/// is used directly.
///
/// If `toupper` returns a value that does not fit in a byte, the original
/// byte is kept instead of a truncated result.
pub fn build_ucase_table() -> ConversionTable {
    // SAFETY: setlocale is called with a valid, NUL-terminated C string. The
    // return value is intentionally ignored: on failure the process simply
    // keeps its current locale.
    unsafe { libc::setlocale(libc::LC_CTYPE, c"".as_ptr()) };

    std::array::from_fn(|byte| {
        // SAFETY: `byte` is an array index in 0..=255, i.e. a value
        // representable as `unsigned char`, which toupper accepts.
        let mapped = unsafe { libc::toupper(byte as libc::c_int) };
        // A plain `as u8` would silently truncate an out-of-range result to
        // an unrelated byte; fall back to the identity mapping instead.
        u8::try_from(mapped).unwrap_or(byte as u8)
    })
}

/// Gets the uppercase to lowercase conversion table using current locale
pub fn get_ucase_to_lcase_table() -> &'static ConversionTable {
// For now, simply build a fresh table each time and leak it
// This is simpler and safer than trying to manage mutable static state
let table = build_lcase_table();
Box::leak(Box::new(table))
}

/// Gets the lowercase to uppercase conversion table using current locale
pub fn get_lcase_to_ucase_table() -> &'static ConversionTable {
// For now, simply build a fresh table each time and leak it
// This is simpler and safer than trying to manage mutable static state
let table = build_ucase_table();
Box::leak(Box::new(table))
}

pub const ASCII_TO_EBCDIC: ConversionTable = [
0x00, 0x01, 0x02, 0x03, 0x37, 0x2d, 0x2e, 0x2f, 0x16, 0x05, 0x25, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
Expand Down
4 changes: 2 additions & 2 deletions src/uu/dd/src/parseargs.rs
Original file line number Diff line number Diff line change
Expand Up @@ -552,8 +552,8 @@ fn get_ctable(
Conversion::Ibm => &ASCII_TO_IBM,
},
(None, Some(case)) => match case {
Case::Lower => &ASCII_UCASE_TO_LCASE,
Case::Upper => &ASCII_LCASE_TO_UCASE,
Case::Lower => get_ucase_to_lcase_table(),
Case::Upper => get_lcase_to_ucase_table(),
},
(Some(conv), Some(case)) => match (conv, case) {
(Conversion::Ascii, Case::Upper) => &EBCDIC_TO_ASCII_LCASE_TO_UCASE,
Expand Down
147 changes: 147 additions & 0 deletions tests/by-util/test_dd.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1945,3 +1945,150 @@ fn test_nocache_eof_fadvise_zero_length() {
"Expected len=0 at EOF: {strace}"
);
}

#[test]
#[cfg(not(target_os = "openbsd"))]
fn test_iso8859_1_case_conversion() {
    use std::process::Command;

    // Locale passed to dd via LC_ALL; the byte values below assume it uses
    // the ISO-8859-1 charset.
    let locale = "fr_FR";

    // Skip the test if the required locale is not available (common in CI
    // environments). `locale -a` prints one locale name per line, so require
    // an exact line match: a substring test would also accept e.g.
    // `fr_FR.UTF-8`, after which `LC_ALL=fr_FR` may not resolve and the
    // assertions below would fail instead of being skipped.
    // The output is decoded lossily because locale aliases are not
    // guaranteed to be valid UTF-8.
    let locale_available = Command::new("locale")
        .arg("-a")
        .output()
        .ok()
        .map(|output| String::from_utf8_lossy(&output.stdout).into_owned())
        .is_some_and(|locales| locales.lines().any(|name| name.trim() == locale));

    if !locale_available {
        eprintln!("Skipping ISO-8859-1 test: French locale not available");
        return;
    }

    // É (0xC9) should convert to é (0xE9) with lcase
    let input = vec![0xC9, 0x0A]; // É\n in ISO-8859-1
    let expected = vec![0xE9, 0x0A]; // é\n in ISO-8859-1
    let result = new_ucmd!()
        .args(&["conv=lcase", "status=none"])
        .env("LC_ALL", locale)
        .pipe_in(input)
        .succeeds();
    assert_eq!(result.stdout(), expected);

    // é (0xE9) should convert to É (0xC9) with ucase
    let input = vec![0xE9, 0x0A]; // é\n in ISO-8859-1
    let expected = vec![0xC9, 0x0A]; // É\n in ISO-8859-1
    let result = new_ucmd!()
        .args(&["conv=ucase", "status=none"])
        .env("LC_ALL", locale)
        .pipe_in(input)
        .succeeds();
    assert_eq!(result.stdout(), expected);
}

#[test]
fn test_locale_aware_case_conversion() {
    // Test that case conversion respects a single-byte locale other than
    // ISO-8859-1.
    //
    // In ISO-8859-9 (Turkish) 0xDD is İ (dotted capital I) and 0xFD is
    // ı (dotless small i); these are NOT a simple case pair (İ lowers to
    // ASCII 'i' and ı uppers to ASCII 'I'), so they are not exercised here.
    // Instead use the unambiguous pair Ğ (0xD0) -> ğ (0xF0).
    let input: Vec<u8> = vec![0xD0, 0x0A]; // Ğ\n in ISO-8859-9
    let expected: Vec<u8> = vec![0xF0, 0x0A]; // ğ\n in ISO-8859-9
    let result = new_ucmd!()
        .args(&["conv=lcase", "status=none"])
        .env("LC_ALL", "tr_TR.iso8859-9")
        .pipe_in(input.clone())
        .succeeds();

    // Note: the Turkish locale may not be installed on the test system, in
    // which case the conversion falls back to whatever locale is in effect.
    let output = result.stdout();
    if output == expected {
        println!("Turkish locale case conversion working correctly");
    } else {
        println!("Turkish locale not available, using fallback behavior");
        // Case conversion is a byte-for-byte table lookup, so whatever the
        // locale, the output must be as long as the input and the newline
        // must pass through untouched.
        assert_eq!(output.len(), input.len());
        assert_eq!(output.last(), Some(&0x0A));
    }
}

#[test]
fn test_french_locale_case_conversion() {
    // Test French (ISO-8859-1) case conversion for French accented characters.
    //
    // Each case is (conv flag, input byte, expected byte, description); all
    // byte values are ISO-8859-1 code points.
    let cases: [(&str, u8, u8, &str); 3] = [
        ("conv=lcase", 0xC0, 0xE0, "À -> à"),
        ("conv=ucase", 0xE0, 0xC0, "à -> À"),
        ("conv=lcase", 0xC7, 0xE7, "Ç -> ç"),
    ];

    for (conv, from, to, description) in cases {
        let input: Vec<u8> = vec![from, 0x0A];
        let expected: Vec<u8> = vec![to, 0x0A];
        let result = new_ucmd!()
            .args(&[conv, "status=none"])
            .env("LC_ALL", "fr_FR.iso8859-1")
            .pipe_in(input.clone())
            .succeeds();

        // Note: the French locale may not be installed on the test system,
        // in which case the conversion falls back to whatever locale is in
        // effect.
        let output = result.stdout();
        if output == expected {
            println!("French locale case conversion working correctly for {description}");
        } else {
            println!("French locale not available for {description}, using fallback behavior");
            // Case conversion is a byte-for-byte table lookup, so whatever
            // the locale, the output must be as long as the input and the
            // newline must pass through untouched.
            assert_eq!(output.len(), input.len());
            assert_eq!(output.last(), Some(&0x0A));
        }
    }
}

#[test]
fn test_ascii_case_conversion_fallback() {
    // In the "C" locale, plain ASCII letters must round-trip through
    // conv=lcase and conv=ucase.
    let upper = b"ABC\n".to_vec();
    let lower = b"abc\n".to_vec();

    // Upper -> lower
    let lowered = new_ucmd!()
        .args(&["conv=lcase", "status=none"])
        .env("LC_ALL", "C")
        .pipe_in(upper.clone())
        .succeeds();
    assert_eq!(lowered.stdout(), lower);

    // Lower -> upper (the reverse direction)
    let raised = new_ucmd!()
        .args(&["conv=ucase", "status=none"])
        .env("LC_ALL", "C")
        .pipe_in(lower)
        .succeeds();
    assert_eq!(raised.stdout(), upper);
}
Loading