Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 14 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -34,8 +34,22 @@ and this project adheres to [Semantic Versioning](http://semver.org/).
not a leak) but tools that search for file descriptor leaks (such as runc's
test suite) could incorrectly classify this as a leak. We now close this
`ProcfsBase` handle far more aggressively.
- RHEL 8 kernels have backports of the fd-based mount API (`fsopen(2)`,
`open_tree(2)`, et al.) but some `runc` testing found that they have very bad
(and very difficult to debug) performance issues. Thus, to avoid broken
backports libpathrs will now explicitly refuse to use the fd-based mount API
if the reported kernel version is pre-5.2 and will instead fallback to the
less-secure `open("/proc")`.
- libpathrs [0.2.0][] added some `fdinfo`-based hardening to the procfs
resolver when `openat2` is not available. Unfortunately, one aspect of this
hardening had a hard requirement on [a kernel feature only added in Linux
5.14][kcommit-3845f256a8b52] (namely the `ino` field in `fdinfo`) and thus
inadvertently increased our minimum kernel version requirement quite
significantly. This additional hardening is now only treated as mandatory if
the host kernel version is Linux 5.14 or newer.

[rust-issue20267]: https://github.com/rust-lang/rust/issues/20267
[kcommit-3845f256a8b52]: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=3845f256a8b527127bfbd4ced21e93d9e89aa6d7

## [0.2.3] - 2026-01-29 ##

Expand Down
7 changes: 7 additions & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

3 changes: 2 additions & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,7 @@ once_cell = "1"
# MSRV(1.65): Update to >=0.4.1 which uses let_else. 0.4.0 was broken.
open-enum = { version = "0.3", optional = true }
rand = { version = "0.9", optional = true }
rustix = { version = "1.1", features = ["fs", "process", "thread", "mount"] }
rustix = { version = "1.1", features = ["fs", "process", "system", "thread", "mount"] }
rustversion = "1"
thiserror = "2"
static_assertions = "1.1"
Expand All @@ -89,6 +89,7 @@ tempfile = "3"
paste = "1"
path-clean = "1"
pretty_assertions = { version = "1.4.1", features = ["unstable"] }
scopeguard = "1"

[build-dependencies]
tempfile = "3"
Expand Down
4 changes: 2 additions & 2 deletions src/error.rs
Original file line number Diff line number Diff line change
Expand Up @@ -92,7 +92,7 @@ pub(crate) enum ErrorImpl {
#[error("feature {feature} is not implemented")]
NotImplemented { feature: Cow<'static, str> },

#[error("feature {feature} not supported on this kernel")]
#[error("feature {feature} not supported by the system")]
NotSupported { feature: Cow<'static, str> },

#[error("invalid {name} argument: {description}")]
Expand Down Expand Up @@ -220,7 +220,7 @@ impl ErrorKind {
/// errno values where appropriate.
pub(crate) fn errno(&self) -> Option<i32> {
match self {
ErrorKind::NotImplemented => Some(libc::ENOSYS),
ErrorKind::NotImplemented | ErrorKind::NotSupported => Some(libc::ENOSYS),
ErrorKind::InvalidArgument => Some(libc::EINVAL),
#[cfg(feature = "capi")]
ErrorKind::UnsupportedStructureData => Some(libc::E2BIG),
Expand Down
31 changes: 26 additions & 5 deletions src/procfs.rs
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,7 @@
flags::{OpenFlags, ResolverFlags},
resolvers::procfs::ProcfsResolver,
syscalls,
utils::{self, FdExt, MaybeOwnedFd, RawProcfsRoot},
utils::{self, kernel_version, FdExt, MaybeOwnedFd, RawProcfsRoot},
};

use std::{
Expand All @@ -76,7 +76,7 @@
path::{Path, PathBuf},
};

use once_cell::sync::OnceCell as OnceLock;
use once_cell::sync::{Lazy, OnceCell as OnceLock};
use rustix::{
fs::{self as rustix_fs, Access, AtFlags},
mount::{FsMountFlags, FsOpenFlags, MountAttrFlags, OpenTreeFlags},
Expand Down Expand Up @@ -232,9 +232,6 @@
subset_pid: bool,
}

// MSRV(1.70): Use std::sync::OnceLock.
static CACHED_PROCFS_HANDLE: OnceLock<OwnedFd> = OnceLock::new();

impl Default for ProcfsHandleBuilder {
fn default() -> Self {
Self::new()
Expand Down Expand Up @@ -341,6 +338,9 @@
/// panic as this is not a state that should be possible to reach in regular
/// program execution.
pub fn build(self) -> Result<ProcfsHandle, Error> {
// MSRV(1.70): Use std::sync::OnceLock.
static CACHED_PROCFS_HANDLE: OnceLock<OwnedFd> = OnceLock::new();

// MSRV(1.85): Use let chain here (Rust 2024).
if self.is_cache_friendly() {
// If there is already a cached filesystem available, use that.
Expand Down Expand Up @@ -806,11 +806,26 @@
/// [lwn-procfs-overmounts]: https://lwn.net/Articles/934460/
pub type ProcfsHandle = ProcfsHandleRef<'static>;

/// Indicates whether this kernel is new enough that it should have the
/// upstream-merged version of the new mount API. This is necessary because
/// testing in runc found that RHEL 8 appears to have a broken backport of the
/// new mount API that causes serious performance regressions -- as such, we
/// should simply refuse to even try to use any of the new mount APIs on pre-5.2
/// kernels.
// MSRV(1.80): Use LazyLock.
static HAS_UNBROKEN_MOUNT_API: Lazy<bool> = Lazy::new(|| kernel_version::is_gte!(5, 2));

impl ProcfsHandle {
/// Create a new `fsopen(2)`-based [`ProcfsHandle`]. This handle is safe
/// against racing attackers changing the mount table and is guaranteed to
/// have no overmounts because it is a brand-new procfs.
pub(crate) fn new_fsopen(subset: bool) -> Result<Self, Error> {
if !*HAS_UNBROKEN_MOUNT_API {
Err(ErrorImpl::NotSupported {
feature: "fsopen".into(),
})?

Check warning on line 826 in src/procfs.rs

View check run for this annotation

Codecov / codecov/patch

src/procfs.rs#L824-L826

Added lines #L824 - L826 were not covered by tests
}

let sfd = syscalls::fsopen("proc", FsOpenFlags::FSOPEN_CLOEXEC).map_err(|err| {
ErrorImpl::RawOsError {
operation: "create procfs suberblock".into(),
Expand Down Expand Up @@ -852,6 +867,12 @@
/// guaranteed to be safe against racing attackers, and will not have
/// overmounts unless `flags` contains `OpenTreeFlags::AT_RECURSIVE`.
pub(crate) fn new_open_tree(flags: OpenTreeFlags) -> Result<Self, Error> {
if !*HAS_UNBROKEN_MOUNT_API {
Err(ErrorImpl::NotSupported {
feature: "open_tree".into(),
})?

Check warning on line 873 in src/procfs.rs

View check run for this annotation

Codecov / codecov/patch

src/procfs.rs#L871-L873

Added lines #L871 - L873 were not covered by tests
}

syscalls::open_tree(
syscalls::BADFD,
"/proc",
Expand Down
30 changes: 30 additions & 0 deletions src/syscalls.rs
Original file line number Diff line number Diff line change
Expand Up @@ -874,6 +874,36 @@ pub(crate) mod openat2 {

pub(crate) use openat2::{openat2, openat2_follow, OpenHow, ResolveFlags};

#[cfg(test)]
mod personality {
// musl doesn't expose UNAME26.
#[cfg(not(target_env = "musl"))]
pub(crate) const PER_UNAME26: u32 = libc::UNAME26 as _;
#[cfg(target_env = "musl")]
pub(crate) const PER_UNAME26: u32 = 0x0020000; /* <linux/personality.h> */

pub(crate) fn personality(persona: Option<u32>) -> u32 {
unsafe { libc::personality(persona.unwrap_or(0xFFFF_FFFF) as _) as _ }
}

/// Temporarily change the personality of the running thread.
///
/// The personality is reset to the original persona value (i.e., when
/// [`scoped_personality`] was first called) once the returned `impl Drop`
/// value is dropped. Note that any threads or subprocesses spawned with
/// the `scoped_personality` guard held will permanently inherit the
/// specified persona.
#[must_use]
pub(crate) fn scoped_personality(persona: u32) -> impl Drop {
scopeguard::guard(personality(Some(persona)), |old_persona| {
personality(Some(old_persona));
})
}
}

#[cfg(test)]
pub(crate) use personality::*;

#[cfg(test)]
pub(crate) fn getpid() -> rustix_process::RawPid {
rustix_process::Pid::as_raw(Some(rustix_process::getpid()))
Expand Down
2 changes: 2 additions & 0 deletions src/utils.rs
Original file line number Diff line number Diff line change
Expand Up @@ -52,3 +52,5 @@ pub(crate) use maybe_owned::*;

mod raw_procfs;
pub(crate) use raw_procfs::*;

pub(crate) mod kernel_version;
4 changes: 2 additions & 2 deletions src/utils/fd.rs
Original file line number Diff line number Diff line change
Expand Up @@ -317,9 +317,9 @@ impl<Fd: AsFd> FdExt for Fd {
{
let fd = self.as_fd();
let fdinfo_path = match fd.as_raw_fd() {
// MSRV(1.66): Use ..=0 (half_open_range_patterns).
// MSRV(1.66): Use ..=-1 (half_open_range_patterns).
// MSRV(1.80): Use ..0 (exclusive_range_pattern).
fd @ libc::AT_FDCWD | fd @ RawFd::MIN..=0 => Err(ErrorImpl::OsError {
fd @ libc::AT_FDCWD | fd @ RawFd::MIN..=-1 => Err(ErrorImpl::OsError {
operation: format!("get relative procfs fdinfo path for fd {fd}").into(),
source: IOError::from_raw_os_error(libc::EBADF),
})?,
Expand Down
131 changes: 95 additions & 36 deletions src/utils/fdinfo.rs
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@

use crate::{
error::{Error, ErrorExt, ErrorImpl, ErrorKind},
utils::FdExt,
utils::{kernel_version, FdExt},
};

use std::{
Expand Down Expand Up @@ -100,44 +100,60 @@
{
let fd = fd.as_fd();

// Verify that the "ino" field in fdinfo matches the real inode number
// of our file descriptor. This makes attacks harder (if not near
// impossible, outside of very constrained situations):
// Verify that the "ino" field in fdinfo matches the real inode number of
// our file descriptor. This makes attacks harder (if not near impossible,
// outside of very constrained situations):
//
// * An attacker would probably struggle to always accurately guess the inode
// number of files that the process is trying to operate on. Yes, if they
// know the victim process's access patterns of procfs they could probably
// make an educated guess, but most files do not have stable inode numbers in
// procfs.
// * An attacker would probably struggle to always accurately guess the inode
// number of files that the process is trying to operate on. Yes, if they know
// the victim process's access patterns of procfs they could probably make an
// educated guess, but most files do not have stable inode numbers in procfs.
//
// * An attacker can no longer bind-mount their own fdinfo directory with just
// a buch of handles to "/proc" open (assuming the attacker is trying to
// spoof "mnt_id"), because the inode numbers won't match.
// * An attacker can no longer bind-mount their own fdinfo directory with just a
// buch of handles to "/proc" open (assuming the attacker is trying to spoof
// "mnt_id"), because the inode numbers won't match.
//
// They also can't really fake inode numbers in real procfs fdinfo
// files, so they would need to create fake fdinfo files using
// individual file arbitrary-data gadgets (like /proc/self/environ).
// However, every program only has one environment so they would need
// to create a new child process for every fd they are trying to
// attack simultaneously (and accurately update their environment
// data to avoid detection).
// They also can't really fake inode numbers in real procfs fdinfo files,
// so they would need to create fake fdinfo files using individual file
// arbitrary-data gadgets (like /proc/self/environ). However, every
// program only has one environment so they would need to create a new
// child process for every fd they are trying to attack simultaneously
// (and accurately update their environment data to avoid detection).
//
// This isn't perfect protection by any means, but it's probably the
// best we can do for very old kernels (given the constraints). At the very
// least, it makes exploitation _much_ harder than if we didn't do anything
// at all.
// This isn't perfect protection by any means, but it's probably the best we
// can do for very old kernels (given the constraints). At the very least,
// it makes exploitation _much_ harder than if we didn't do anything at all.
let actual_ino: u64 = fd.metadata().wrap("get inode number of fd")?.ino();
let fdinfo_ino: u64 =
let fdinfo_ino: Option<u64> =
match parse_and_find_fdinfo_field(rdr, "ino").map_err(|err| (err.kind(), err)) {
Ok(Some(ino)) => Ok(ino),
// "ino" *must* exist as a field -- make sure we return a
// SafetyViolation here if it is missing or an invalid value
// (InternalError), otherwise an attacker could silence this check
// by creating a "ino"-less fdinfo.
Ok(Some(ino)) => Ok(Some(ino)),
Ok(None) => {
// Unfortunately, the "ino" field in fdinfo was only added in
// Linux 5.14 (see kcommit 3845f256a8b52 ("procfs/dmabuf: add
// inode number to /proc/*/fdinfo")) and so we cannot require
// this for such old kernels.
//
// However, *for post-5.14 kernels*, "ino" *must* exist as a
// field, so make sure we return a SafetyViolation here if it is
// missing. Otherwise an attacker could silence this check by
// creating a "ino"-less fdinfo.
//
// [kcommit-3845f256a8b52]: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=3845f256a8b527127bfbd4ced21e93d9e89aa6d7
if kernel_version::is_gte!(5, 14) {
Err(ErrorImpl::SafetyViolation {
description: format!(
r#"fd {:?} has a fake fdinfo: missing "ino" field"#,
fd.as_raw_fd(),
)
.into(),
})?;
}
Ok(None)
}
// TODO: Should we actually match for ErrorImpl::ParseIntError here?
Ok(None) | Err((ErrorKind::InternalError, _)) => Err(ErrorImpl::SafetyViolation {
Err((ErrorKind::InternalError, _)) => Err(ErrorImpl::SafetyViolation {
description: format!(
r#"fd {:?} has a fake fdinfo: invalid or missing "ino" field"#,
r#"fd {:?} has a fake fdinfo: invalid "ino" field"#,
fd.as_raw_fd(),
)
.into(),
Expand All @@ -146,14 +162,17 @@
// Pass through any other errors.
Err((_, err)) => Err(err),
}?;
if actual_ino != fdinfo_ino {
Err(ErrorImpl::SafetyViolation {
// MSRV(1.85): Use let chain here (Rust 2024).
if let Some(fdinfo_ino) = fdinfo_ino {
if actual_ino != fdinfo_ino {
Err(ErrorImpl::SafetyViolation {
description: format!(
"fd {:?} has a fake fdinfo: wrong inode number (ino is {fdinfo_ino:X} not {actual_ino:X})",
fd.as_raw_fd()
)
.into(),
})?;
}
}

// Reset the position in the fdinfo file, and re-parse it to look for
Expand All @@ -169,7 +188,7 @@
#[cfg(test)]
mod tests {
use super::*;
use crate::error::ErrorKind;
use crate::{error::ErrorKind, syscalls};

use std::{
fmt::Debug,
Expand Down Expand Up @@ -528,9 +547,13 @@
Ok(())
}

// Make sure that a missing "ino" entry also fails.
// Make sure that a missing "ino" entry also fails on new kernels.
#[test]
fn fd_get_verify_fdinfo_no_ino() -> Result<(), Error> {
fn fd_get_verify_fdinfo_no_ino_kernel514() -> Result<(), Error> {
if !kernel_version::is_gte!(5, 14) {
return Ok(());

Check warning on line 554 in src/utils/fdinfo.rs

View check run for this annotation

Codecov / codecov/patch

src/utils/fdinfo.rs#L554

Added line #L554 was not covered by tests
}

const FAKE_FDINFO: &[u8] = indoc! {b"
foo: abcdef
mnt_id: 12345
Expand Down Expand Up @@ -565,6 +588,42 @@
Ok(())
}

// Make sure that a missing "ino" entry succeeds on old kernels (emulated).
#[test]
fn fd_get_verify_fdinfo_no_ino_oldkernel() -> Result<(), Error> {
// The UNAME26 personality lets us fake a pre-5.14 kernel version to
// kernel_version::is_gte.
let _persona_guard = syscalls::scoped_personality(syscalls::PER_UNAME26);

const FAKE_FDINFO: &[u8] = indoc! {b"
foo: abcdef
mnt_id: 12345
"};

let file = File::open(".").context("open dummy file")?;

check_fd_get_verify_fdinfo::<u64>(
&mut Cursor::new(&FAKE_FDINFO),
&file,
"mnt_id",
Ok(Some(12345)),
)
.expect(r#"get "mnt_id" from fdinfo with missing ino (pre-5.14)"#);

check_fd_get_verify_fdinfo::<u64>(&mut Cursor::new(&FAKE_FDINFO), &file, "ino", Ok(None))
.expect(r#"get "ino" from fdinfo with missing ino (pre-5.14)"#);

check_fd_get_verify_fdinfo::<String>(
&mut Cursor::new(&FAKE_FDINFO),
&file,
"non_exist",
Ok(None),
)
.expect(r#"get "non_exist" from fdinfo with missing ino (pre-5.14)"#);

Ok(())
}

// Make sure that an "ino" entry with the wrong type results in a
// SafetyViolation error, not an integer parsing error.
#[test]
Expand Down
Loading
Loading