diff --git a/newsfragments/5444.fixed.md b/newsfragments/5444.fixed.md
new file mode 100644
index 00000000000..86055f8f1be
--- /dev/null
+++ b/newsfragments/5444.fixed.md
@@ -0,0 +1 @@
+fix `OsStr` conversion for non-utf8 strings on windows
diff --git a/src/conversions/std/osstr.rs b/src/conversions/std/osstr.rs
index 3c480526d83..eb132a58f69 100644
--- a/src/conversions/std/osstr.rs
+++ b/src/conversions/std/osstr.rs
@@ -110,6 +110,12 @@ impl FromPyObject<'_, '_> for OsString {
                 unsafe { ffi::PyUnicode_AsWideChar(pystring.as_ptr(), std::ptr::null_mut(), 0) };
             crate::err::error_on_minusone(ob.py(), size)?;
 
+            debug_assert!(
+                size > 0,
+                "PyUnicode_AsWideChar should return at least 1 for null terminator"
+            );
+            let size = size - 1; // exclude null terminator
+
             let mut buffer = vec![0; size as usize];
             let bytes_read =
                 unsafe { ffi::PyUnicode_AsWideChar(pystring.as_ptr(), buffer.as_mut_ptr(), size) };
@@ -169,7 +175,7 @@ impl<'py> IntoPyObject<'py> for &OsString {
 
 #[cfg(test)]
 mod tests {
-    use crate::types::{PyString, PyStringMethods};
+    use crate::types::{PyAnyMethods, PyString, PyStringMethods};
     use crate::{BoundObject, IntoPyObject, Python};
     use std::fmt::Debug;
     use std::{
@@ -181,7 +187,6 @@ mod tests {
     #[cfg(not(windows))]
     fn test_non_utf8_conversion() {
         Python::attach(|py| {
-            use crate::types::PyAnyMethods;
             #[cfg(not(target_os = "wasi"))]
             use std::os::unix::ffi::OsStrExt;
             #[cfg(target_os = "wasi")]
@@ -219,4 +224,32 @@ mod tests {
             test_roundtrip::<OsString>(py, os_str.to_os_string());
         });
     }
+
+    #[test]
+    #[cfg(windows)]
+    fn test_windows_non_utf8_osstring_roundtrip() {
+        use std::os::windows::ffi::{OsStrExt, OsStringExt};
+
+        Python::attach(|py| {
+            // Example: Unpaired surrogate (0xD800) is not valid UTF-8, but valid in Windows OsString
+            let wide: &[u16] = &['A' as u16, 0xD800, 'B' as u16]; // 'A', unpaired surrogate, 'B'
+            let os_str = OsString::from_wide(wide);
+
+            assert_eq!(os_str.to_string_lossy(), "A�B");
+
+            // This cannot be represented as UTF-8, so .to_str() would return None
+            assert!(os_str.to_str().is_none());
+
+            // Convert to Python and back
+            let py_str = os_str.as_os_str().into_pyobject(py).unwrap();
+            let os_str_2 = py_str.extract::<OsString>().unwrap();
+
+            // The roundtrip should preserve the original wide data
+            assert_eq!(os_str, os_str_2);
+
+            // Show that encode_wide is necessary: direct UTF-8 conversion would lose information
+            let encoded: Vec<u16> = os_str.encode_wide().collect();
+            assert_eq!(encoded, wide);
+        });
+    }
 }