Share prefix parser with cygwin

Berrysoft · Berrysoft · commit d86e69528c78 · 2025-06-02T23:26:09.000+08:00
diff --git a/library/std/src/sys/path/cygwin.rs b/library/std/src/sys/path/cygwin.rs
@@ -1,6 +1,6 @@
-use crate::ffi::{OsStr, OsString};
+use crate::ffi::OsString;
 use crate::os::unix::ffi::OsStringExt;
-use crate::path::{Path, PathBuf, Prefix};
+use crate::path::{Path, PathBuf};
 use crate::sys::common::small_c_string::run_path_with_cstr;
 use crate::sys::cvt;
 use crate::{io, ptr};
@@ -15,10 +15,7 @@ pub fn is_verbatim_sep(b: u8) -> bool {
     b == b'/' || b == b'\\'
 }
 
-#[inline]
-pub fn parse_prefix(_: &OsStr) -> Option<Prefix<'_>> {
-    None
-}
+pub use super::prefix::parse_prefix;
 
 pub const MAIN_SEP_STR: &str = "/";
 pub const MAIN_SEP: char = '/';
diff --git a/library/std/src/sys/path/mod.rs b/library/std/src/sys/path/mod.rs
@@ -1,5 +1,6 @@
 cfg_if::cfg_if! {
     if #[cfg(target_os = "windows")] {
+        mod prefix;
         mod windows;
         pub use windows::*;
     } else if #[cfg(all(target_vendor = "fortanix", target_env = "sgx"))] {
@@ -13,6 +14,7 @@ cfg_if::cfg_if! {
         pub use uefi::*;
     } else if #[cfg(target_os = "cygwin")] {
         mod cygwin;
+        mod prefix;
         pub use cygwin::*;
     } else {
         mod unix;
diff --git a/library/std/src/sys/path/prefix.rs b/library/std/src/sys/path/prefix.rs
@@ -0,0 +1,175 @@
+use super::{is_sep_byte, is_verbatim_sep};
+use crate::ffi::OsStr;
+use crate::path::Prefix;
+
+struct PrefixParser<'a, const LEN: usize> {
+    path: &'a OsStr,
+    prefix: [u8; LEN],
+}
+
+impl<'a, const LEN: usize> PrefixParser<'a, LEN> {
+    #[inline]
+    fn get_prefix(path: &OsStr) -> [u8; LEN] {
+        let mut prefix = [0; LEN];
+        // SAFETY: Only ASCII characters are modified.
+        for (i, &ch) in path.as_encoded_bytes().iter().take(LEN).enumerate() {
+            prefix[i] = if ch == b'/' { b'\\' } else { ch };
+        }
+        prefix
+    }
+
+    fn new(path: &'a OsStr) -> Self {
+        Self { path, prefix: Self::get_prefix(path) }
+    }
+
+    fn as_slice(&self) -> PrefixParserSlice<'a, '_> {
+        PrefixParserSlice {
+            path: self.path,
+            prefix: &self.prefix[..LEN.min(self.path.len())],
+            index: 0,
+        }
+    }
+}
+
+struct PrefixParserSlice<'a, 'b> {
+    path: &'a OsStr,
+    prefix: &'b [u8],
+    index: usize,
+}
+
+impl<'a> PrefixParserSlice<'a, '_> {
+    fn strip_prefix(&self, prefix: &str) -> Option<Self> {
+        self.prefix[self.index..]
+            .starts_with(prefix.as_bytes())
+            .then_some(Self { index: self.index + prefix.len(), ..*self })
+    }
+
+    fn prefix_bytes(&self) -> &'a [u8] {
+        &self.path.as_encoded_bytes()[..self.index]
+    }
+
+    fn finish(self) -> &'a OsStr {
+        // SAFETY: The unsafety here stems from converting between &OsStr and
+        // &[u8] and back. This is safe to do because (1) we only look at ASCII
+        // contents of the encoding and (2) new &OsStr values are produced only
+        // from ASCII-bounded slices of existing &OsStr values.
+        unsafe { OsStr::from_encoded_bytes_unchecked(&self.path.as_encoded_bytes()[self.index..]) }
+    }
+}
+
+pub fn parse_prefix(path: &OsStr) -> Option<Prefix<'_>> {
+    use Prefix::{DeviceNS, Disk, UNC, Verbatim, VerbatimDisk, VerbatimUNC};
+
+    let parser = PrefixParser::<8>::new(path);
+    let parser = parser.as_slice();
+    if let Some(parser) = parser.strip_prefix(r"\\") {
+        // \\
+
+        // The meaning of verbatim paths can change when they use a different
+        // separator.
+        if let Some(parser) = parser.strip_prefix(r"?\")
+            // Cygwin allows `/` in verbatim paths.
+            && (cfg!(target_os = "cygwin") || !parser.prefix_bytes().iter().any(|&x| x == b'/'))
+        {
+            // \\?\
+            if let Some(parser) = parser.strip_prefix(r"UNC\") {
+                // \\?\UNC\server\share
+
+                let path = parser.finish();
+                let (server, path) = parse_next_component(path, true);
+                let (share, _) = parse_next_component(path, true);
+
+                Some(VerbatimUNC(server, share))
+            } else {
+                let path = parser.finish();
+
+                // in verbatim paths only recognize an exact drive prefix
+                if let Some(drive) = parse_drive_exact(path) {
+                    // \\?\C:
+                    Some(VerbatimDisk(drive))
+                } else {
+                    // \\?\prefix
+                    let (prefix, _) = parse_next_component(path, true);
+                    Some(Verbatim(prefix))
+                }
+            }
+        } else if let Some(parser) = parser.strip_prefix(r".\") {
+            // \\.\COM42
+            let path = parser.finish();
+            let (prefix, _) = parse_next_component(path, false);
+            Some(DeviceNS(prefix))
+        } else {
+            let path = parser.finish();
+            let (server, path) = parse_next_component(path, false);
+            let (share, _) = parse_next_component(path, false);
+
+            if !server.is_empty() && !share.is_empty() {
+                // \\server\share
+                Some(UNC(server, share))
+            } else {
+                // no valid prefix beginning with "\\" recognized
+                None
+            }
+        }
+    } else {
+        // If it has a drive like `C:` then it's a disk.
+        // Otherwise there is no prefix.
+        parse_drive(path).map(Disk)
+    }
+}
+
+// Parses a drive prefix, e.g. "C:" and "C:\whatever"
+fn parse_drive(path: &OsStr) -> Option<u8> {
+    // In most DOS systems, it is not possible to have more than 26 drive letters.
+    // See <https://en.wikipedia.org/wiki/Drive_letter_assignment#Common_assignments>.
+    fn is_valid_drive_letter(drive: &u8) -> bool {
+        drive.is_ascii_alphabetic()
+    }
+
+    match path.as_encoded_bytes() {
+        [drive, b':', ..] if is_valid_drive_letter(drive) => Some(drive.to_ascii_uppercase()),
+        _ => None,
+    }
+}
+
+// Parses a drive prefix exactly, e.g. "C:"
+fn parse_drive_exact(path: &OsStr) -> Option<u8> {
+    // only parse two bytes: the drive letter and the drive separator
+    if path.as_encoded_bytes().get(2).map(|&x| is_sep_byte(x)).unwrap_or(true) {
+        parse_drive(path)
+    } else {
+        None
+    }
+}
+
+// Parse the next path component.
+//
+// Returns the next component and the rest of the path excluding the component and separator.
+// Does not recognize `/` as a separator character on Windows if `verbatim` is true.
+fn parse_next_component(path: &OsStr, verbatim: bool) -> (&OsStr, &OsStr) {
+    let separator = if verbatim { is_verbatim_sep } else { is_sep_byte };
+
+    match path.as_encoded_bytes().iter().position(|&x| separator(x)) {
+        Some(separator_start) => {
+            let separator_end = separator_start + 1;
+
+            let component = &path.as_encoded_bytes()[..separator_start];
+
+            // Panic safe
+            // The max `separator_end` is `bytes.len()` and `bytes[bytes.len()..]` is a valid index.
+            let path = &path.as_encoded_bytes()[separator_end..];
+
+            // SAFETY: `path` is a valid wtf8 encoded slice and each of the separators ('/', '\')
+            // is encoded in a single byte, therefore `bytes[separator_start]` and
+            // `bytes[separator_end]` must be code point boundaries and thus
+            // `bytes[..separator_start]` and `bytes[separator_end..]` are valid wtf8 slices.
+            unsafe {
+                (
+                    OsStr::from_encoded_bytes_unchecked(component),
+                    OsStr::from_encoded_bytes_unchecked(path),
+                )
+            }
+        }
+        None => (path, OsStr::new("")),
+    }
+}
diff --git a/library/std/src/sys/path/windows.rs b/library/std/src/sys/path/windows.rs