From 05c4dbcd5feeb78cd719079a32f685bbcbc50412 Mon Sep 17 00:00:00 2001 From: irabbani Date: Thu, 24 Jul 2025 20:24:04 +0000 Subject: [PATCH 01/20] [core] (cgroups 1/n) Adding a sys/fs filesystem driver to perform cgroup operations. Signed-off-by: irabbani --- src/ray/common/cgroup2/BUILD | 28 + .../common/cgroup2/cgroup_driver_interface.h | 196 +++++ src/ray/common/cgroup2/sysfs_cgroup_driver.cc | 420 ++++++++++ src/ray/common/cgroup2/sysfs_cgroup_driver.h | 190 +++++ src/ray/common/cgroup2/test/BUILD | 49 ++ .../common/cgroup2/test/cgroup_test_utils.cc | 231 ++++++ .../common/cgroup2/test/cgroup_test_utils.h | 123 +++ .../sysfs_cgroup_driver_integration_test.cc | 773 ++++++++++++++++++ .../cgroup2/test/sysfs_cgroup_driver_test.cc | 118 +++ src/ray/common/status.cc | 1 + src/ray/common/status.h | 10 + src/ray/common/status_or.h | 6 + 12 files changed, 2145 insertions(+) create mode 100644 src/ray/common/cgroup2/BUILD create mode 100644 src/ray/common/cgroup2/cgroup_driver_interface.h create mode 100644 src/ray/common/cgroup2/sysfs_cgroup_driver.cc create mode 100644 src/ray/common/cgroup2/sysfs_cgroup_driver.h create mode 100644 src/ray/common/cgroup2/test/BUILD create mode 100644 src/ray/common/cgroup2/test/cgroup_test_utils.cc create mode 100644 src/ray/common/cgroup2/test/cgroup_test_utils.h create mode 100644 src/ray/common/cgroup2/test/sysfs_cgroup_driver_integration_test.cc create mode 100644 src/ray/common/cgroup2/test/sysfs_cgroup_driver_test.cc diff --git a/src/ray/common/cgroup2/BUILD b/src/ray/common/cgroup2/BUILD new file mode 100644 index 000000000000..758d5579ace2 --- /dev/null +++ b/src/ray/common/cgroup2/BUILD @@ -0,0 +1,28 @@ +load("//bazel:ray.bzl", "ray_cc_library") + +ray_cc_library( + name = "cgroup_driver_interface", + hdrs = [ + "cgroup_driver_interface.h", + ], + deps = [ + "//src/ray/common:status", + "//src/ray/common:status_or", + ], +) + +ray_cc_library( + name = "sysfs_cgroup_driver", + srcs = ["sysfs_cgroup_driver.cc"], + hdrs = [ + "sysfs_cgroup_driver.h", + ], + deps = [ + ":cgroup_driver_interface", + "//src/ray/common:status", + "//src/ray/common:status_or", + "//src/ray/util:logging", + "//src/ray/util:process", + "@com_google_absl//absl/strings:str_format", + ], +) diff --git a/src/ray/common/cgroup2/cgroup_driver_interface.h b/src/ray/common/cgroup2/cgroup_driver_interface.h new file mode 100644 index 000000000000..3178a8c27008 --- /dev/null +++ b/src/ray/common/cgroup2/cgroup_driver_interface.h @@ -0,0 +1,196 @@ +// Copyright 2025 The Ray Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +#pragma once + +#include +#include + +#include "ray/common/status.h" +#include "ray/common/status_or.h" + +namespace ray { + +/** + A utility interface that allows the caller to perform cgroup operations. + TODO(irabbani): I need to figure out the following + 1. What is the appropriate level of documentation here? This header file is what's + important by consumers of the API and it needs to provide the correct docstrings + and API. + 2. Revisiting this API to make it sure it integrates with a possible systemd + driver implementation (meaning it cannot have any directory based info or any dbus + implementation). + */ +class CgroupDriverInterface { + public: + virtual ~CgroupDriverInterface() = default; + + /** + Checks to see if only cgroupv2 is enabled (known as unified mode) on the system. + If cgroupv2 is not enabled, or enabled along with cgroupv1, returns Invalid + with the appropriate error message. + + @see systemd's documentation for more information about unified mode: + https://github.com/systemd/systemd/blob/main/docs/CGROUP_DELEGATION.md#hierarchy-and-controller-support + + @see K8S documentation on how to enable cgroupv2 and check if it's enabled correctly: + https://kubernetes.io/docs/concepts/architecture/cgroups/#linux-distribution-cgroup-v2-support + + @return OK if successful, otherwise Invalid. + */ + virtual Status CheckCgroupv2Enabled() = 0; + + /** + Checks that the cgroup is valid. See return values for details of which + invariants are checked. + + @param cgroup the absolute path of the cgroup. + + @return OK if no errors are encounted. Otherwise, one of the following errors + NotFound if the cgroup does not exist. + PermissionDenied if current user doesn't have read, write, and execute permissions. + InvalidArgument if the cgroup is not using cgroupv2. + */ + virtual Status CheckCgroup(const std::string &cgroup) = 0; + + /** + Creates a new cgroup at the specified path. + Expects all cgroups on the path from root -> the new cgroup to already exist. + Expects the user to have read, write, and execute privileges to parent cgroup. + + @param cgroup is an absolute path to the cgroup + + @return OK if no errors are encounted. Otherwise, one of the following errors + NotFound if an ancestor cgroup does not exist. + PermissionDenied if current user doesn't have read, write, and execute permissions. + AlreadyExists if the cgroup already exists. + */ + virtual Status CreateCgroup(const std::string &cgroup) = 0; + + /** + Move all processes from one cgroup to another. The process must have read, write, and + execute permissions for both cgroups and their lowest common ancestor. + + @see The relevant section of the cgroup documentation for more details: + https://docs.kernel.org/admin-guide/cgroup-v2.html#delegation-containment + + @param from the absolute path of the cgroup to migrate processes out of. + @param to the absolute path of the cgroup to migrate processes into. + + @return OK if no errors are encounted. Otherwise, one of the following errors + NotFound if to or from don't exist. + PermissionDenied if current user doesn't have read, write, and execute permissions. + Invalid if any errors occur while reading from and writing to cgroups. + */ + virtual Status MoveProcesses(const std::string &from, const std::string &to) = 0; + + /** + Enables an available controller in a cgroup. A controller can be enabled if the + 1) controller is enabled in the parent of the cgroup. + 2) cgroup has no children i.e. it's a leaf node. + + @param cgroup is an absolute path to the cgroup. + @param controller is the name of the controller (e.g. "cpu" and not "+cpu") + + @see No Internal Process Constraint for more details: + https://docs.kernel.org/admin-guide/cgroup-v2.html#no-internal-process-constraint + + @return OK if successful, otherwise one of the following + NotFound if the cgroup does not exist. + PermissionDenied if current user doesn't have read, write, and execute permissions for + the cgroup. + InvalidArgument if the controller is not available or if cgroup is not a cgroupv2. + Invalid for all other failures. + */ + virtual Status EnableController(const std::string &cgroup, + const std::string &controller) = 0; + + /** + Disables an enabled controller in a cgroup. A controller can be disabled if the + controller is not enabled on a child cgroup. + + @param cgroup is an absolute path to the cgroup. + @param controller is the name of the controller (e.g. "cpu" and not "-cpu") + + @return OK if successful, otherwise one of the following + NotFound if the cgroup does not exist. + PermissionDenied if current user doesn't have read, write, and execute permissions for + the cgroup. + InvalidArgument if the controller is not enabled or if cgroup is not a cgroupv2. + Invalid for all other failures. + */ + virtual Status DisableController(const std::string &cgroup, + const std::string &controller) = 0; + /** + Adds a resource constraint to the cgroup. To add a constraint + 1) the cgroup must have the relevant controller enabled e.g. memory.min cannot be + enabled if the memory controller is not enabled. + 2) the constraint must be supported in Ray (@see supported_constraints_). + 3) the constraint value must be in the correct range (@see supported_constriants_). + + @param cgroup is an absolute path to the cgroup. + @param constraint the name of the constraint. + @param value the value of the constraint. + + @return OK if successful, otherwise one of the following + NotFound if the cgroup does not exist. + PermissionDenied if current user doesn't have read, write, and execute permissions for + the cgroup. + InvalidArgument if the cgroup is not valid or constraint is not supported or the value + not correct. + */ + virtual Status AddConstraint(const std::string &cgroup, + const std::string &constraint, + const std::string &value) = 0; + /** + Returns a list of controllers that can be enabled on the given cgroup based on + what is enabled on the parent cgroup. + + @param cgroup absolute path of the cgroup. + + @returns OK with a set of controllers if successful, otherwise one of following + NotFound if the cgroup does not exist. + PermissionDenied if current user doesn't have read, write, and execute + permissions. + InvalidArgument if the cgroup is not using cgroupv2 or malformed controllers file. + */ + virtual StatusOr> GetAvailableControllers( + const std::string &cgroup) = 0; + + /** + Returns a list of controllers enabled on the cgroup. + + @param cgroup absolute path of the cgroup. + + @returns OK with a set of controllers if successful, otherwise one of following + NotFound if the cgroup does not exist. + PermissionDenied if current user doesn't have read, write, and execute + permissions. + InvalidArgument if the cgroup is not using cgroupv2 or malformed controllers file. + */ + virtual StatusOr> GetEnabledControllers( + const std::string &cgroup) = 0; + + struct Constraint { + std::pair range; + std::string controller; + }; + + protected: + const std::unordered_map supported_constraints_ = { + {"cpu.weight", {{1, 10000}, "cpu"}}, + {"memory.min", {{0, std::numeric_limits::max()}, "memory"}}, + }; + const std::unordered_set supported_controllers_ = {"cpu", "memory"}; +}; +} // namespace ray diff --git a/src/ray/common/cgroup2/sysfs_cgroup_driver.cc b/src/ray/common/cgroup2/sysfs_cgroup_driver.cc new file mode 100644 index 000000000000..40b1912823d6 --- /dev/null +++ b/src/ray/common/cgroup2/sysfs_cgroup_driver.cc @@ -0,0 +1,420 @@ +#include "ray/common/cgroup2/sysfs_cgroup_driver.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "absl/strings/str_format.h" +#include "ray/common/status.h" +#include "ray/common/status_or.h" + +namespace ray { +Status SysFsCgroupDriver::CheckCgroupv2Enabled() { + FILE *fp = setmntent(mount_file_path_.c_str(), "r"); + + if (!fp) { + return Status::Invalid( + absl::StrFormat("Failed to open mount file at %s. Could not verify that " + "cgroupv2 was mounted correctly. \n%s", + mount_file_path_, + strerror(errno))); + } + + bool found_cgroupv1 = false; + bool found_cgroupv2 = false; + + struct mntent *mnt; + while ((mnt = getmntent(fp)) != nullptr) { + found_cgroupv1 = found_cgroupv1 || strcmp(mnt->mnt_fsname, "cgroup") == 0; + found_cgroupv2 = found_cgroupv2 || strcmp(mnt->mnt_fsname, "cgroup2") == 0; + } + + // After parsing the mount file, the file should be at the EOF position. + // If it's not, getmntent encountered an error. + if (!feof(fp) || !endmntent(fp)) { + return Status::Invalid( + absl::StrFormat("Failed to parse mount file at %s. Could not verify that " + "cgroupv2 was mounted correctly.", + mount_file_path_)); + } + + // TODO(irabbani): provide instructions for how to mount correctly and link to the + // correct docs. + if (found_cgroupv1 && found_cgroupv2) { + return Status::Invalid("Cgroupv1 and cgroupv2 are both mounted. Unmount cgroupv1."); + } else if (found_cgroupv1 && !found_cgroupv2) { + return Status::Invalid( + "Cgroupv1 is mounted and cgroupv2 is not mounted. " + "Unmount cgroupv1 and mount cgroupv2."); + } else if (!found_cgroupv2) { + return Status::Invalid("Cgroupv2 is not mounted. Mount cgroupv2."); + } + return Status::OK(); +} + +Status SysFsCgroupDriver::CheckCgroup(const std::string &cgroup_path) { + struct statfs fs_stats {}; + if (statfs(cgroup_path.c_str(), &fs_stats) != 0) { + if (errno == ENOENT) { + return Status::NotFound( + absl::StrFormat("Cgroup at %s does not exist.", cgroup_path)); + } + if (errno == EACCES) { + return Status::PermissionDenied( + absl::StrFormat("The current user does not have read, write, and execute " + "permissions for the directory at path %s.\n%s", + cgroup_path, + strerror(errno))); + } + return Status::InvalidArgument( + absl::StrFormat("Failed to stat cgroup directory at path %s. because of %s", + cgroup_path, + strerror(errno))); + } + if (fs_stats.f_type != CGROUP2_SUPER_MAGIC) { + return Status::InvalidArgument( + absl::StrFormat("Directory at path %s is not of type cgroupv2. " + "For instructions to mount cgroupv2 correctly, see:\n" + "https://kubernetes.io/docs/concepts/architecture/cgroups/" + "#linux-distribution-cgroup-v2-support.", + cgroup_path)); + } + + // Note: the process needs execute permissions for the cgroup directory + // to traverse the filesystem. + if (access(cgroup_path.c_str(), R_OK | W_OK | X_OK) == -1) { + return Status::PermissionDenied( + absl::StrFormat("The current user does not have read, write, and execute " + "permissions for the directory at path %s.\n%s", + cgroup_path, + strerror(errno))); + } + + return Status::OK(); +} + +Status SysFsCgroupDriver::CreateCgroup(const std::string &cgroup_path) { + if (mkdir(cgroup_path.c_str(), S_IRWXU) == -1) { + if (errno == ENOENT) { + return Status::NotFound( + absl::StrFormat("Failed to create cgroup at path %s with permissions %#o. " + "The parent cgroup does not exist.\n" + "Error: %s.", + cgroup_path, + S_IRWXU, + strerror(errno))); + } + if (errno == EACCES) { + return Status::PermissionDenied(absl::StrFormat( + "Failed to create cgroup at path %s with permissions %#o. " + "The current user does not have read, write, execute permissions " + "for the parent cgroup.\n" + "Error: %s.", + cgroup_path, + S_IRWXU, + strerror(errno))); + } + if (errno == EEXIST) { + return Status::AlreadyExists( + absl::StrFormat("Failed to create cgroup at path %s with permissions %#o. " + "The cgroup already exists.\n" + "Error: %s.", + cgroup_path, + S_IRWXU, + strerror(errno))); + } + return Status::InvalidArgument( + absl::StrFormat("Failed to create cgroup at path %s with permissions %#o.\n" + "Error: %s.", + cgroup_path, + S_IRWXU, + strerror(errno))); + } + return Status::OK(); +} + +StatusOr> SysFsCgroupDriver::GetAvailableControllers( + const std::string &cgroup_dir) { + RAY_RETURN_NOT_OK(CheckCgroup(cgroup_dir)); + + std::string controller_file_path = cgroup_dir + + std::filesystem::path::preferred_separator + + std::string(kCgroupControllersFilename); + return ReadControllerFile(controller_file_path); +} + +StatusOr> SysFsCgroupDriver::GetEnabledControllers( + const std::string &cgroup_dir) { + RAY_RETURN_NOT_OK(CheckCgroup(cgroup_dir)); + + std::string controller_file_path = cgroup_dir + + std::filesystem::path::preferred_separator + + std::string(kCgroupSubtreeControlFilename); + return ReadControllerFile(controller_file_path); +} + +Status SysFsCgroupDriver::MoveProcesses(const std::string &from, const std::string &to) { + RAY_RETURN_NOT_OK(CheckCgroup(from)); + RAY_RETURN_NOT_OK(CheckCgroup(to)); + std::filesystem::path from_procs_file_path = + from / std::filesystem::path(kCgroupProcsFilename); + std::filesystem::path to_procs_file_path = + to / std::filesystem::path(kCgroupProcsFilename); + std::ifstream in_file(from_procs_file_path); + std::ofstream out_file(to_procs_file_path, std::ios::ate); + if (!in_file.is_open()) { + return Status::Invalid(absl::StrFormat("Could not open cgroup procs file at path %s.", + from_procs_file_path)); + } + if (!out_file.is_open()) { + return Status::Invalid( + absl::StrFormat("Could not open cgroup procs file %s", to_procs_file_path)); + } + pid_t pid = 0; + while (in_file >> pid) { + if (in_file.fail()) { + return Status::Invalid(absl::StrFormat( + "Could not read pid from cgroup procs file %s", from_procs_file_path)); + } + out_file << pid; + out_file.flush(); + if (out_file.fail()) { + return Status::Invalid(absl::StrFormat( + "Could not write pid to cgroup procs file %s", to_procs_file_path)); + } + } + return Status::OK(); +} + +Status SysFsCgroupDriver::EnableController(const std::string &cgroup_path, + const std::string &controller) { + RAY_RETURN_NOT_OK(CheckCgroup(cgroup_path)); + + StatusOr> available_controllers_s = + GetAvailableControllers(cgroup_path); + + RAY_RETURN_NOT_OK(available_controllers_s.status()); + auto available_controllers = available_controllers_s.value(); + + if (available_controllers.find(controller) == available_controllers.end()) { + std::string available_controllers_str("["); + std::for_each(available_controllers.begin(), + available_controllers.end(), + [&available_controllers_str](const std::string &ctrl) { + available_controllers_str.append(ctrl); + }); + available_controllers_str.append("]"); + return Status::InvalidArgument(absl::StrFormat( + "Controller %s is not available for cgroup at path %s.\n" + "Current available controllers are %s. " + "To enable a controller in a cgroup X, all cgroups in the path from " + "the root cgroup to X must have the controller enabled.", + controller, + cgroup_path, + available_controllers_str)); + } + + std::filesystem::path enabled_ctrls_file = + std::filesystem::path(cgroup_path + std::filesystem::path::preferred_separator + + std::string(kCgroupSubtreeControlFilename)); + std::ofstream out_file(enabled_ctrls_file, std::ios::ate); + if (!out_file.is_open()) { + return Status::Invalid(absl::StrFormat("Could not open cgroup controllers file at %s", + enabled_ctrls_file)); + } + out_file << ("+" + controller); + out_file.flush(); + if (out_file.fail()) { + return Status::Invalid(absl::StrFormat( + "Could not open write to cgroup controllers file %s", enabled_ctrls_file)); + } + return Status::OK(); +} + +Status SysFsCgroupDriver::DisableController(const std::string &cgroup_path, + const std::string &controller) { + RAY_RETURN_NOT_OK(CheckCgroup(cgroup_path)); + std::string controller_file_path = cgroup_path + + std::filesystem::path::preferred_separator + + std::string(kCgroupSubtreeControlFilename); + + StatusOr> enabled_controllers_s = + ReadControllerFile(controller_file_path); + + RAY_RETURN_NOT_OK(enabled_controllers_s.status()); + + auto enabled_controllers = enabled_controllers_s.value(); + + if (enabled_controllers.find(controller) == enabled_controllers.end()) { + std::string enabled_controllers_str("["); + std::for_each(enabled_controllers.begin(), + enabled_controllers.end(), + [&enabled_controllers_str](const std::string &ctrl) { + enabled_controllers_str.append(ctrl); + }); + enabled_controllers_str.append("]"); + return Status::InvalidArgument( + absl::StrFormat("Controller %s is not enabled for cgroup at path %s.\n" + "Current enabled controllers are %s. ", + controller, + cgroup_path, + enabled_controllers_str)); + } + + std::ofstream out_file(controller_file_path, std::ios::ate); + if (!out_file.is_open()) { + return Status::Invalid(absl::StrFormat("Could not open cgroup controllers file at %s", + controller_file_path)); + } + out_file << ("-" + controller); + out_file.flush(); + if (!out_file.good()) { + return Status::Invalid(absl::StrFormat( + "Could not open write to cgroup controllers file %s", controller_file_path)); + } + return Status::OK(); +} + +Status SysFsCgroupDriver::AddConstraint(const std::string &cgroup, + const std::string &constraint, + const std::string &constraint_value) { + RAY_RETURN_NOT_OK(CheckCgroup(cgroup)); + auto constraint_it = supported_constraints_.find(constraint); + // Return InvalidArgument if constraint is not enabled. + if (constraint_it == supported_constraints_.end()) { + std::string supported_constraint_names("["); + for (auto it = supported_constraints_.begin(); it != supported_constraints_.end(); + ++it) { + supported_constraint_names.append(it->first); + if (std::next(it) != supported_constraints_.end()) { + supported_constraint_names.append(", "); + } + } + supported_constraint_names.append("]"); + return Status::InvalidArgument(absl::StrFormat( + "Failed to apply constraint %s to cgroup %s. Ray only supports %s", + constraint, + cgroup, + supported_constraint_names)); + } + + // Check if the constraint value is out of range and therefore invalid. + auto [low, high] = constraint_it->second.range; + int value = std::stoi(constraint_value); + if (value < low || value > high) { + return Status::InvalidArgument(absl::StrFormat( + "Failed to apply constraint %s=%s to cgroup %s. %s can only have values " + "in the range[%i, %i].", + constraint, + constraint_value, + cgroup, + constraint, + low, + high)); + } + + // Check if the required controller for the constraint is enabled. + const std::string &controller = constraint_it->second.controller; + StatusOr> available_controllers_s = + GetEnabledControllers(cgroup); + RAY_RETURN_NOT_OK(available_controllers_s.status()); + const auto &controllers = available_controllers_s.value(); + if (controllers.find(controller) == controllers.end()) { + return Status::InvalidArgument(absl::StrFormat( + "Failed to apply %s to cgroup %s. To use %s, enable the %s controller.", + constraint, + cgroup, + constraint, + controller)); + } + + // Try to apply the constraint and propagate the appropriate failure error. + std::string file_path = + cgroup + std::filesystem::path::preferred_separator + constraint; + + int fd = open(file_path.c_str(), O_RDWR); + + // TODO(irabbani): If meaningful, the errno can be used to disambiguate different + // errors if they're appropriate. + if (fd == -1) { + return Status::InvalidArgument( + absl::StrFormat("Failed to apply %s=%s to cgroup %s.\n" + "Error: %s", + constraint, + constraint_value, + cgroup, + strerror(errno))); + } + + ssize_t bytes_written = write(fd, constraint_value.c_str(), constraint_value.size()); + + if (bytes_written != static_cast(constraint_value.size())) { + return Status::InvalidArgument( + absl::StrFormat("Failed to apply %s=%s to cgroup %s.\n" + "Error: %s", + constraint, + constraint_value, + cgroup, + strerror(errno))); + } + close(fd); + return Status::OK(); +} + +StatusOr> SysFsCgroupDriver::ReadControllerFile( + const std::string &controller_file_path) { + std::ifstream controllers_file(controller_file_path); + + if (!controllers_file.is_open()) { + return Status::InvalidArgument(absl::StrFormat( + "Failed to open controllers file at path %s.", controller_file_path)); + } + + std::unordered_set controllers; + + if (controllers_file.peek() == EOF) { + return StatusOr>(controllers); + } + + std::string line; + std::getline(controllers_file, line); + + if (!controllers_file.good()) { + return Status::InvalidArgument( + absl::StrFormat("Failed to parse controllers file %s.", controller_file_path)); + } + + std::istringstream input_ss(line); + std::string controller; + + while (input_ss >> controller) { + controllers.emplace(std::move(controller)); + } + + std::getline(controllers_file, line); + + // A well-formed controllers file should have just one line. + if (!controllers_file.eof()) { + return Status::InvalidArgument( + absl::StrFormat("Failed to parse controllers file %s.", controller_file_path)); + } + + return StatusOr>(controllers); +} + +} // namespace ray diff --git a/src/ray/common/cgroup2/sysfs_cgroup_driver.h b/src/ray/common/cgroup2/sysfs_cgroup_driver.h new file mode 100644 index 000000000000..39e1d1e90702 --- /dev/null +++ b/src/ray/common/cgroup2/sysfs_cgroup_driver.h @@ -0,0 +1,190 @@ +#include +#include + +#include +#include + +#include "ray/common/cgroup2/cgroup_driver_interface.h" +#include "ray/common/status.h" +#include "ray/common/status_or.h" + +// Used to identify if a filesystem is mounted using cgroupv2. +// See: https://docs.kernel.org/admin-guide/cgroup-v2.html#mounting +#ifndef CGROUP2_SUPER_MAGIC +#define CGROUP2_SUPER_MAGIC = 0x63677270 +#endif + +namespace ray { + +/** + * Peforms cgroupv2 operations using the pseudo filesystem documented + * here https://docs.kernel.org/admin-guide/cgroup-v2.html#interface-files. + * + * Usage: + * std::unique_ptr driver = + * std::make_unique(); + * if (driver->CheckCgroupv2Enabled.ok()) { + * // perform operations + * } + */ +class SysFsCgroupDriver : public CgroupDriverInterface { + public: + /** + * MOUNTED is defined in mntent.h (and typically refers to /etc/mtab) + * @see https://www.gnu.org/software/libc/manual/2.24/html_node/Mount-Information.html + * + * @param mount_file_path only used for testing. + */ + explicit SysFsCgroupDriver(std::string mount_file_path = MOUNTED) + : mount_file_path_(std::move(mount_file_path)) {} + + ~SysFsCgroupDriver() override = default; + SysFsCgroupDriver(const SysFsCgroupDriver &other) = delete; + SysFsCgroupDriver(const SysFsCgroupDriver &&other) = delete; + SysFsCgroupDriver &operator=(const SysFsCgroupDriver &other) = delete; + SysFsCgroupDriver &operator=(const SysFsCgroupDriver &&other) = delete; + + // The recommended way to mount cgroupv2 only and disable cgroupv1. This will + // prevent controllers from being migrated between the two. This is what systemd + // does and recommends. See the following for more information: + // https://github.com/systemd/systemd/blob/main/docs/CGROUP_DELEGATION.md#hierarchy-and-controller-support + // https://kubernetes.io/docs/concepts/architecture/cgroups/#linux-distribution-cgroup-v2-support + + /** + The recommended way to mount cgroupv2 is with cgroupv1 disabled. This prevents + cgroup controllers from being migrated between the two modes. This follows + the recommendation from systemd and K8S. + + Parses the mount file at /etc/mstab and returns Ok if only cgroupv2 is + mounted. + + Example Mountfile that is correct: + /dev/root / ext4 rw,relatime,discard + /dev/nvme2n1 /home/ubuntu ext4 rw,noatime,discard + cgroup2 /sys/fs/cgroup cgroup2 rw,nosuid,nodev,noexec,relatime,nsdelegate + + Example Mountfile that is incorrect (both v2 and v1 are mounted): + /dev/root / ext4 rw,relatime,discard + /dev/nvme2n1 /home/ubuntu ext4 rw,noatime,discard + cgroup /sys/fs/cgroup cgroup rw,nosuid,nodev,noexec,relatime,nsdelegate + cgroup2 /sys/fs/cgroup/unified/ cgroup2 rw,nosuid,nodev,noexec,relatime,nsdelegate + + @return OK if no errors are encounted, otherwise Invalid. + */ + Status CheckCgroupv2Enabled() override; + + /** + Checks to see if the cgroup_path is mounted in the cgroupv2 filesystem + and that the current process has read, write, and execute permissions for + the directory. + + @param cgroup_path the path of a cgroup directory (e.g. /sys/fs/cgroup/ray) + + @return OK if no errors are encounted. Otherwise, one of the following errors + NotFound if the cgroup does not exist. + PermissionDenied if current user doesn't have read, write, and execute permissions. + InvalidArgument if the cgroup is not using cgroupv2. + + @see The kernel documentation for CGROUP2_SUPER_MAGIC + https://www.kernel.org/doc/html/v5.4/admin-guide/cgroup-v2.html#mounting + */ + Status CheckCgroup(const std::string &cgroup_path) override; + + /** + To create a cgroup using the cgroupv2 vfs, the current user needs to read, write, and + execute permissions for the parent cgroup. This can be achieved through cgroup + delegation. + + @see The relevant manpage section on delegation for more details + https://docs.kernel.org/admin-guide/cgroup-v2.html#delegation + + @param cgroup_path the absolute path of the cgroup directory to create. + + @return OK if no errors are encounted. Otherwise, one of the following errors + NotFound if an ancestor cgroup does not exist. + PermissionDenied if current user doesn't have read, write, and execute permissions. + AlreadyExists if the cgroup already exists. + */ + Status CreateCgroup(const std::string &cgroup_path) override; + + /** + + Checks to see if cgroup_dir is a valid cgroup, @see SysFsCgroupDriver::CheckCgroup, + and returns an appropriate error if not. + + Parses a cgroup.controllers file which has a space separated list of all controllers + available to the cgroup. + + @see For details of the cgroup.controllers file + https://docs.kernel.org/admin-guide/cgroup-v2.html#enabling-and-disabling. + + @param cgroup_path absolute path of the cgroup. + @returns OK with a set of controllers if successful, otherwise one of following + NotFound if the cgroup does not exist. + PermissionDenied if current user doesn't have read, write, and execute permissions. + InvalidArgument if the cgroup is not using cgroupv2 or malformed controllers file. + */ + StatusOr> GetAvailableControllers( + const std::string &cgroup_dir) override; + + // + StatusOr> GetEnabledControllers( + const std::string &cgroup_dir) override; + + // to - + // from - + // Fails if to doesn't exist and from doesn't exist + /** + Reads the cgroup.procs of from and writes them out to the given file. + The cgroup.procs file is newline seperated. The current user must have + read-write permissions to both cgroup.procs file as well as the common ancestor + of the source and destination cgroups. + + @see The cgroup.procs section for more information + https://docs.kernel.org/admin-guide/cgroup-v2.html#core-interface-files + @return InvalidArgument if process files could not be opened, read from, or written to + correctly, ok otherwise. + */ + Status MoveProcesses(const std::string &from, const std::string &to) override; + + /** + The cgroup.subtree_control file has the list of all currently enabled + controllers for the cgroup. There are two important caveats: + + 1. The no internal process constraint + 2. The controller needs to be enabled through the entiree path to thee cgroup + constraint. + + @param cgroup_path + @param controller + */ + Status EnableController(const std::string &cgroup_path, + const std::string &controller) override; + + Status DisableController(const std::string &cgroup_path, + const std::string &controller) override; + + /** + @returns + InvalidArgument if the cgroup is not valid, or if the constraint file doesn't + exist. + InvalidArgument if the constraint is not supported. + + */ + Status AddConstraint(const std::string &cgroup, + const std::string &constraint, + const std::string &constraint_value) override; + + private: + // cgroup.subtree_control and cgroup.controllers both have the same format. + // assumes caller checks the validity of the cgroup + StatusOr> ReadControllerFile( + const std::string &controller_file_path); + + std::string mount_file_path_; + static constexpr std::string_view kCgroupProcsFilename = "cgroup.procs"; + static constexpr std::string_view kCgroupSubtreeControlFilename = + "cgroup.subtree_control"; + static constexpr std::string_view kCgroupControllersFilename = "cgroup.controllers"; +}; +} // namespace ray diff --git a/src/ray/common/cgroup2/test/BUILD b/src/ray/common/cgroup2/test/BUILD new file mode 100644 index 000000000000..276816811613 --- /dev/null +++ b/src/ray/common/cgroup2/test/BUILD @@ -0,0 +1,49 @@ +load("//bazel:ray.bzl", "ray_cc_library", "ray_cc_test") + +ray_cc_library( + name = "cgroup_test_utils", + srcs = ["cgroup_test_utils.cc"], + hdrs = ["cgroup_test_utils.h"], + deps = [ + "//src/ray/common:status", + "//src/ray/common:status_or", + "@com_google_absl//absl/strings:str_format", + ], +) + +ray_cc_test( + name = "sysfs_cgroup_driver_integration_test", + srcs = ["sysfs_cgroup_driver_integration_test.cc"], + tags = [ + "cgroup", + "exclusive", + "no_windows", + "team:core", + ], + deps = [ + ":cgroup_test_utils", + "//src/ray/common:status", + "//src/ray/common:status_or", + "//src/ray/common/cgroup2:sysfs_cgroup_driver", + "//src/ray/common/test:testing", + "@com_google_absl//absl/strings:str_format", + "@com_google_googletest//:gtest_main", + ], +) + +ray_cc_test( + name = "sysfs_cgroup_driver_test", + srcs = ["sysfs_cgroup_driver_test.cc"], + tags = [ + "team:core", + ], + deps = [ + ":cgroup_test_utils", + "//src/ray/common:status", + "//src/ray/common:status_or", + "//src/ray/common/cgroup2:sysfs_cgroup_driver", + "//src/ray/common/test:testing", + "@com_google_absl//absl/strings:str_format", + "@com_google_googletest//:gtest_main", + ], +) diff --git a/src/ray/common/cgroup2/test/cgroup_test_utils.cc b/src/ray/common/cgroup2/test/cgroup_test_utils.cc new file mode 100644 index 000000000000..13b0345ce92d --- /dev/null +++ b/src/ray/common/cgroup2/test/cgroup_test_utils.cc @@ -0,0 +1,231 @@ +#include "ray/common/cgroup2/test/cgroup_test_utils.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "absl/strings/str_format.h" +#include "ray/common/status.h" +#include "ray/common/status_or.h" +#include "ray/util/logging.h" + +std::string GenerateRandomFilename(size_t len) { + std::string output; + std::random_device rd; + std::mt19937 mt(rd()); + std::uniform_int_distribution dist(0, 61); + output.reserve(len); + for (size_t i = 0; i < len; ++i) { + int randomChar = dist(mt); + if (randomChar < 26) { + output.push_back('a' + randomChar); + } else if (randomChar < 26 + 26) { + output.push_back('A' + randomChar - 26); + } else { + output.push_back('0' + randomChar - 52); + } + } + return output; +} + +ray::StatusOr> TempCgroupDirectory::Create( + const std::string &base_path, mode_t mode) { + std::string name = GenerateRandomFilename(kCgroupNameLength); + std::string path = base_path + std::filesystem::path::preferred_separator + name; + if (mkdir(path.c_str(), mode) == -1) { + return ray::Status::IOError( + absl::StrFormat("Failed to create cgroup directory at path %s.\n" + "Cgroup tests expect tmpfs and cgroupv2 to be mounted " + "and only run on Linux.\n" + "Error: %s", + path, + strerror(errno))); + } + auto output = std::make_unique(std::move(name), std::move(path)); + return ray::StatusOr>(std::move(output)); +} + +TempCgroupDirectory::~TempCgroupDirectory() noexcept(false) { + if (rmdir(path_.c_str()) != 0) { + throw std::runtime_error( + absl::StrFormat("Failed to delete a cgroup directory at %s. Please manually " + "delete it with rmdir. \n%s", + path_, + strerror(errno))); + } +} + +ray::StatusOr> TempDirectory::Create() { + std::string path = "/tmp/XXXXXX"; + char *ret = mkdtemp(path.data()); + if (ret == nullptr) { + return ray::Status::UnknownError( + absl::StrFormat("Failed to create a temp directory. " + "Cgroup tests expect tmpfs to be mounted and only run on Linux.\n" + "Error: %s", + strerror(errno))); + } + std::unique_ptr temp_dir = + std::make_unique(std::move(path)); + return ray::StatusOr>(std::move(temp_dir)); +} +TempDirectory::~TempDirectory() { std::filesystem::remove_all(path_); } + +ray::Status TerminateChildProcessAndWaitForTimeout(pid_t pid, int fd, int timeout_ms) { + if (kill(pid, SIGTERM) == -1) { + return ray::Status::Invalid( + absl::StrFormat("Failed to send SIGKILL to pid: %i.\n" + "Error: %s", + pid, + strerror(errno))); + } + struct pollfd poll_fd = { + .fd = fd, + .events = POLLIN, + }; + + int poll_status = poll(&poll_fd, 1, timeout_ms); + if (poll_status == -1) { + return ray::Status::Invalid(absl::StrFormat( + "Failed to poll process pid: %i, fd: %i. Process was not killed. Please " + "kill it manually to prevent a leak.\n" + "Error: %s", + pid, + fd, + strerror(errno))); + } + if (poll_status == 0) { + return ray::Status::Invalid(absl::StrFormat( + "Process pid: %i, fd:%i was not killed within the timeout of %ims.", + pid, + fd, + timeout_ms)); + } + siginfo_t dummy = {0}; + int wait_id_status = waitid(P_PID, static_cast(fd), &dummy, WEXITED); + if (wait_id_status == -1) { + if (errno != ECHILD) + return ray::Status::Invalid(absl::StrFormat( + "Failed to wait for process pid: %i, fd: %i. Process was not reaped, but " + "it will be reaped by init after program exits.\n" + "Error: %s", + pid, + fd, + strerror(errno))); + }; + return ray::Status::OK(); +} +ray::StatusOr> StartChildProcessInCgroup( + const std::string &cgroup_path) { + int cgroup_fd = open(cgroup_path.c_str(), O_RDONLY); + if (cgroup_fd == -1) { + return ray::Status::Invalid( + absl::StrFormat("Unable to open fd for cgroup at %s.\n" + "Error: %s", + cgroup_path, + strerror(errno))); + } + + // Will be set by clone3 if a child process is successfully created. + pid_t child_pidfd = -1; + + clone_args cl_args = {}; + cl_args.flags = CLONE_PIDFD | CLONE_INTO_CGROUP; + cl_args.cgroup = cgroup_fd; + + // Can be used both as a pid and as a fd. + cl_args.pidfd = ((__u64)((uintptr_t)(&child_pidfd))); + + int child_pid = -1; + + if ((child_pid = syscall(__NR_clone3, &cl_args, sizeof(struct clone_args))) == -1) { + RAY_LOG(ERROR) << "clone3 failed with error with error: " << strerror(errno); + close(cgroup_fd); + return ray::Status::Invalid( + absl::StrFormat("Unable to clone process.\n" + "Error: %s", + strerror(errno))); + } + + // Child process will execute this. + if (child_pid == 0) { + RAY_LOG(ERROR) << "Spawned child in cgroup " << cgroup_path << " with PID " + << getpid(); + pause(); + RAY_LOG(ERROR) << "Unpaused child in cgroup " << cgroup_path << " with PID " + << getpid(); + _exit(0); + } + + // Parent process will continue here. + close(cgroup_fd); + RAY_LOG(ERROR) << "Successfully cloned with parent_pid=" << getpid() + << ", child_pid=" << child_pid << ", child_pidfd=" << child_pidfd << "."; + return ray::StatusOr>({child_pid, static_cast(child_pidfd)}); +} + +TempFile::TempFile(std::string path) { + path_ = path; + fd_ = open(path_.c_str(), O_RDWR | O_CREAT, S_IRUSR | S_IWUSR); // NOLINT + if (fd_ == -1) { + throw std::runtime_error( + absl::StrFormat("Failed to create a temp file. Cgroup tests expect " + "tmpfs to be mounted " + "and only run on Linux. Error: %s", + strerror(errno))); + } + file_output_stream_ = std::ofstream(path_, std::ios::trunc); + if (!file_output_stream_.is_open()) { + throw std::runtime_error("Could not open file on tmpfs."); + } +} + +TempFile::TempFile() { + fd_ = mkstemp(path_.data()); // NOLINT + if (fd_ == -1) { + throw std::runtime_error( + "Failed to create a temp file. Cgroup tests expect tmpfs to be " + "mounted " + "and only run on Linux"); + } + if (unlink(path_.c_str()) == -1) { + close(fd_); + throw std::runtime_error("Failed to unlink temporary file."); + } + file_output_stream_ = std::ofstream(path_, std::ios::trunc); + if (!file_output_stream_.is_open()) { + throw std::runtime_error("Could not open mount file on tmpfs."); + } +} + +TempFile::~TempFile() { + close(fd_); + file_output_stream_.close(); +} + +void TempFile::AppendLine(const std::string &line) { + file_output_stream_ << line; + file_output_stream_.flush(); + if (file_output_stream_.fail()) { + throw std::runtime_error("Could not write to mount file on tmpfs"); + } +} diff --git a/src/ray/common/cgroup2/test/cgroup_test_utils.h b/src/ray/common/cgroup2/test/cgroup_test_utils.h new file mode 100644 index 000000000000..edb10279ef87 --- /dev/null +++ b/src/ray/common/cgroup2/test/cgroup_test_utils.h @@ -0,0 +1,123 @@ +#include + +#include +#include +#include +#include + +#include "ray/common/status.h" +#include "ray/common/status_or.h" + +static constexpr size_t kCgroupNameLength = 6; + +// Generates an ASCII string of the given length. +std::string GenerateRandomFilename(size_t len); + +/** + Creates an RAII style cgroup directory with a random name at a given + path with the given permissions. + */ +class TempCgroupDirectory { + public: + static ray::StatusOr> Create( + const std::string &base_path, mode_t mode = 0777); + + TempCgroupDirectory() = default; + explicit TempCgroupDirectory(std::string &&name, std::string &&path) + : name_(name), path_(path) {} + + TempCgroupDirectory(const TempCgroupDirectory &) = delete; + TempCgroupDirectory(TempCgroupDirectory &&) = delete; + TempCgroupDirectory &operator=(const TempCgroupDirectory &) = delete; + TempCgroupDirectory &operator=(TempCgroupDirectory &&) = delete; + + const std::string &GetPath() const { return path_; } + const std::string &GetName() const { return name_; } + + ~TempCgroupDirectory() noexcept(false); + + private: + std::string name_; + std::string path_; +}; + +/** + RAII style class for creating and destroying temporary directory for testing. + TODO(irabbani): add full documentation once complete. + */ +class TempDirectory { + public: + static ray::StatusOr> Create(); + TempDirectory(std::string &&path) : path_(path) {} + + TempDirectory(const TempDirectory &) = delete; + TempDirectory(TempDirectory &&) = delete; + TempDirectory &operator=(const TempDirectory &) = delete; + TempDirectory &operator=(TempDirectory &&) = delete; + + const std::string &GetPath() const { return path_; } + + ~TempDirectory(); + + private: + const std::string path_; +}; + +/** + RAII wrapper that creates a file that can be written to. + TODO(irabbani): Add full documentation once the API is complete. +*/ +class TempFile { + public: + explicit TempFile(std::string path); + TempFile(); + + TempFile(TempFile &other) = delete; + TempFile(TempFile &&other) = delete; + TempFile operator=(TempFile &other) = delete; + TempFile &operator=(TempFile &&other) = delete; + + ~TempFile(); + void AppendLine(const std::string &line); + + const std::string &GetPath() const { return path_; } + + private: + std::string path_ = "/tmp/XXXXXX"; + std::ofstream file_output_stream_; + int fd_; +}; + +//// Starts a process in the cgroup and returns the pid of the started process. +//// Returns -1 if there's an error. +//// Note: clone3 supports creating a process inside a cgroup instead of creating +//// and then moving. However, clone3 does not have a glibc wrapper and +//// must be called directly using syscall syscall (see man 2 syscall). +//// This function needs linux kernel >= 5.7 to use the CLONE_INTO_CGROUP flag. +//// It is the caller's responsibility to terminate the child process and +//// wait for the pid. Use TerminateChildProcessAndWaitForTimeout. +//// Returns a process_fd. See the CLONE_PIDFD section in "man 2 clone" +ray::StatusOr> StartChildProcessInCgroup( + const std::string &cgroup_path); + +/** + @param process_fd can be used as a fd and as a pid. It can be created using + clone or pidfd_open. + @param timeout_ms time used to poll for the process_fd to detect the process + has been killed. + @return ok if successfully terminated the process and reaped it. Invalid otherwise + with the correct error message. + */ +ray::Status TerminateChildProcessAndWaitForTimeout(pid_t pid, int fd, int timeout_ms); + +std::ostream &operator<<(std::ostream &os, const TempCgroupDirectory &temp_cgroup_dir) { + return os << temp_cgroup_dir.GetPath(); +} + +std::ostream &operator<<(std::ostream &os, + const std::unique_ptr &ptr) { + if (ptr == nullptr) { + return os << ""; + } + return os << *ptr; +} diff --git a/src/ray/common/cgroup2/test/sysfs_cgroup_driver_integration_test.cc b/src/ray/common/cgroup2/test/sysfs_cgroup_driver_integration_test.cc new file mode 100644 index 000000000000..418bcde4914f --- /dev/null +++ b/src/ray/common/cgroup2/test/sysfs_cgroup_driver_integration_test.cc @@ -0,0 +1,773 @@ +#include +#include +#include + +#include +#include +#include +#include +#include + +#include "ray/common/cgroup2/sysfs_cgroup_driver.h" +#include "ray/common/cgroup2/test/cgroup_test_utils.h" +#include "ray/common/status.h" +#include "ray/common/status_or.h" + +/** + NOTE: Read these instructions before running these tests locally. + These tests are only supported on linux with cgroupv2 enabled in unified mode. + + See the following documentation for how to enable cgroupv2 properly: + https://kubernetes.io/docs/concepts/architecture/cgroups/#linux-distribution-cgroup-v2-support + + When running these tests, the process that starts the tests (usually your terminal) + needs to be inside a cgroup hierarchy that it has ownership of. Run the following + bash commands to make that happen: + + # Create the cgroup heirarchy (these need to match base_cgroup_path_ and + # processes_cgroup_path_ which are defined below) + sudo mkdir /sys/fs/cgroup/testing + sudo mkdir /sys/fs/cgroup/active + + # Give ownership and permissions to the current user. + USER=`whoami` + sudo chown -R USER:USER /sys/fs/cgroup/testing + sudo chmod -R u+rwx /sys/fs/cgroup/testing + + # Move the current process into a leaf cgroup within the hierarchy. + echo $$ | sudo tee -a /sys/fs/cgroup/testing/active/cgroup.procs + + # Enable cpu and memory controllers on the parent cgroup + echo "+cpu" > /sys/fs/cgroup/testing/cgroup.subtree_control + echo "+memory" > /sys/fs/cgroup/testing/cgroup.subtree_control +*/ +// Uncomment the following line to run the tests locally. +#define RUN_LOCALLY + +// This the root of the cgroup subtree that has been delegated to the testing user. +std::unique_ptr testing_cgroup_; + +// This a sibling to the testing_cgroup_ that will store all processes that were in +// the base_cgroup. This will allow the testing_cgroup_ to have subcgroups with +// controllers enabled to satisfy the no internal process constraint. +// See the no internal process constraint section for more information: +// https://docs.kernel.org/admin-guide/cgroup-v2.html#no-internal-process-constraint +std::unique_ptr leaf_cgroup_; + +// base_cgroup_path_ is the root of the cgroup heirarchy that will be used for testing. +// The tests will expect that this cgroup has only the cpu and memory controllers +// enabled because they will be used for testing. +// processes_cgroup_path_ is used when running tests locally to start the test process +// in a leaf node inside the cgroup hierarchy. It's not needed in CI because the user +// inside the container has root access and can manage the entire hierarchy. This implies +// that the CI test process is always started within a cgroup hierarchy that it owns. +#ifdef RUN_LOCALLY +static inline std::string base_cgroup_path_ = "/sys/fs/cgroup/testing"; +static inline std::string processes_cgroup_path_ = "/sys/fs/cgroup/testing/active"; +#else +static inline std::string base_cgroup_path_ = "/sys/fs/cgroup"; +static inline std::string processes_cgroup_path_ = "/sys/fs/cgroup"; +#endif + +/** + This test suite will create the following cgroup hierarchy for the tests + base_cgroup + / \ + testing_cgroup leaf_cgroup + + It expects that root_cgroup has cpu, memory controllers enabled. + It enables the cpu, memory controllers on the testing_cgroup. + Tests should not access anything above the testing_cgroup. + The leaf_cgroup provides a leaf node clone processes into while not + violating the no internal processes constraint. + + Note: testing_cgroup and leaf_cgroup have randomly generating names + to isolate test runs from each other. + */ +class SysFsCgroupDriverIntegrationTest : public ::testing::Test { + protected: + /** + This can fail an assertion without running fully. In that case, best-effort + cleanup will happen inside the TearDownTestSuite method. + */ + static void SetUpTestSuite() { + // 1) Create the testing_cgroup_ under base_cgroup. This is the root node of the + // cgroup subtree used by the tests in this file. + auto testing_cgroup_or_status = + TempCgroupDirectory::Create(base_cgroup_path_, S_IRWXU); + ASSERT_TRUE(testing_cgroup_or_status.ok()) << testing_cgroup_or_status.ToString(); + testing_cgroup_ = std::move(testing_cgroup_or_status.value()); + + // 2) Create the leaf_cgroup_ under the base_cgroup. This will be used to store + // all the processes in the base_cgroup. + auto leaf_cgroup_or_status = TempCgroupDirectory::Create(base_cgroup_path_, S_IRWXU); + ASSERT_TRUE(leaf_cgroup_or_status.ok()) << leaf_cgroup_or_status.ToString(); + leaf_cgroup_ = std::move(leaf_cgroup_or_status.value()); + + // 3) Move all processes from the /cgroup.procs file into the + ///cgroup.procs file. This will allow the base_cgroup to have children. + std::string processes_cgroup_procs_file_path = + processes_cgroup_path_ + "/cgroup.procs"; + std::string leaf_cgroup_procs_file_path = leaf_cgroup_->GetPath() + "/cgroup.procs"; + FILE *processes_cgroup_procs = fopen(processes_cgroup_procs_file_path.c_str(), "r+"); + ASSERT_NE(processes_cgroup_procs, nullptr) + << "Failed to open processes cgroup's proc file at path " + << processes_cgroup_procs_file_path << " with error " << strerror(errno); + int leaf_cgroup_procs_fd = open(leaf_cgroup_procs_file_path.c_str(), O_RDWR); + ASSERT_NE(leaf_cgroup_procs_fd, -1) + << "Failed to open leaf cgroup's proc file at path " + << leaf_cgroup_procs_file_path << " with error " << strerror(errno); + + // According to man 5 proc, the maximum value of a pid on a 64-bit system is 2^22 + // i.e./4194304. fgets adds a null terminating character. + char read_buffer[32]; + + while (fgets(read_buffer, sizeof(read_buffer), processes_cgroup_procs)) { + ssize_t bytes_written = + write(leaf_cgroup_procs_fd, read_buffer, strlen(read_buffer)); + + // If the process exited between the call to fgets and write, write will + // return a NoSuchProcess error. + if (bytes_written == -1 && errno == ESRCH) { + continue; + } + ASSERT_EQ(bytes_written, strlen(read_buffer)) + << "Failed to write to leaf cgroups proc file at path " + << leaf_cgroup_->GetPath() << " with error " << strerror(errno); + } + ASSERT_EQ(ferror(processes_cgroup_procs), 0) + << "Failed to read pid from processes cgroups proc file at path " + << processes_cgroup_procs_file_path << " with error " << strerror(errno); + + fclose(processes_cgroup_procs); + close(leaf_cgroup_procs_fd); + + // 4) Enable cpu and memory controllers for the testing_cgroup to allow tests to + // use those controllers as necessary. + std::string test_cgroup_subtree_control_path = + testing_cgroup_->GetPath() + "/cgroup.subtree_control"; + std::unordered_set subtree_control_ops = {"+cpu", "+memory"}; + for (const auto &op : subtree_control_ops) { + int test_cgroup_subtree_control_fd = + open(test_cgroup_subtree_control_path.c_str(), O_RDWR); + ASSERT_NE(test_cgroup_subtree_control_fd, -1) + << "Failed to open test cgroup's subtree_control file at path " + << test_cgroup_subtree_control_path << " with error " << strerror(errno); + ssize_t num_written = write(test_cgroup_subtree_control_fd, op.c_str(), op.size()); + ASSERT_EQ(num_written, op.size()) + << "Failed to write to test cgroup's subtree_control file at path " + << test_cgroup_subtree_control_path << " with error " << strerror(errno); + close(test_cgroup_subtree_control_fd); + } + } + + /** + This will attempt to do best effort cleanup by reversing the effects of the + SetUpTestSuite method. Cleanup doesn't need to delete cgroup directories because of + TempCgroupDirectory's dtor will take care of it. Cleanup doesn't need to disable + controllers because a cgroup can be deleted even if it has controllers enabled. + + If cleanup fails, cgroup directories may not be cleaned up properly. However, despite + cleanup failure, tests can be rerun on the same machine/container without interference + from the previous failed attempt. + + There are a few types of leaks possible + 1) In CI, the tests run inside the container and all processes inside the container + will have been moved into the /leaf/cgroup.procs file. They may stay + there. This has no impact on subsequent runs because this cgroup should have no + constriants. + 2) In all environments, cgroups may be leaked and will have to be manually cleaned + up. Typically, a cgroup cannot be removed iff it has a running process in the + cgroup.procs file or child cgroups. + + NOTE: The teardown method is called even if assertions fail in the setup or in + tests in the suite. + */ + static void TearDownTestSuite() { + // Move all processes from the /cgroup.procs file into the + ///cgroup.procs file. This will allow the leaf_cgroup to be deleted. + std::string processes_cgroup_procs_file_path = + processes_cgroup_path_ + "/cgroup.procs"; + std::string leaf_cgroup_procs_file_path = leaf_cgroup_->GetPath() + "/cgroup.procs"; + FILE *leaf_cgroup_procs = fopen(leaf_cgroup_procs_file_path.c_str(), "r+"); + ASSERT_NE(leaf_cgroup_procs, nullptr) + << "Failed to open leaf cgroup's proc file at path " << leaf_cgroup_ + << " with error " << strerror(errno); + int processes_cgroup_procs_fd = + open(processes_cgroup_procs_file_path.c_str(), O_RDWR); + ASSERT_NE(processes_cgroup_procs_fd, -1) + << "Failed to open processes cgroup's proc file at path " + << processes_cgroup_path_ << " with error " << strerror(errno); + + // According to man 5 proc, the maximum value of a pid on a 64-bit system is 2^22 + // i.e./4194304. fgets adds a null terminating character. + char read_buffer[16]; + + while (fgets(read_buffer, sizeof(read_buffer), leaf_cgroup_procs)) { + ssize_t bytes_written = + write(processes_cgroup_procs_fd, read_buffer, strlen(read_buffer)); + ASSERT_EQ(bytes_written, strlen(read_buffer)) + << "Failed to write to processes cgroups proc file at path " + << processes_cgroup_path_ << " with error " << strerror(errno); + } + ASSERT_EQ(ferror(leaf_cgroup_procs), 0) + << "Failed to read pid from leaf cgroups proc file at path " << leaf_cgroup_ + << " with error " << strerror(errno); + + fclose(leaf_cgroup_procs); + close(processes_cgroup_procs_fd); + } +}; + +TEST_F(SysFsCgroupDriverIntegrationTest, + FixtureCreatesCgroupsAndMigratesProcessToLeafCgroup) { + ASSERT_TRUE(std::filesystem::exists(testing_cgroup_->GetPath())) + << "Test fixture did not initialize the base cgroup at " << testing_cgroup_; + ASSERT_TRUE(std::filesystem::exists(leaf_cgroup_->GetPath())) + << "Test fixture did not initialize the leaf cgroup at " << leaf_cgroup_; + std::ifstream curr_proc_group_file("/proc/" + std::to_string(getpid()) + "/cgroup"); + std::string cgroup_path; + ASSERT_NE(curr_proc_group_file.peek(), EOF) + << "The current process is not running in any cgroup. This should never happen."; + std::getline(curr_proc_group_file, cgroup_path); + ASSERT_TRUE(curr_proc_group_file.good()); + // Extract the cgroup from the path. + std::string cgroup = cgroup_path.substr(cgroup_path.find_last_of('/') + 1); + ASSERT_EQ(cgroup, leaf_cgroup_->GetName()); +} + +namespace ray { + +TEST_F(SysFsCgroupDriverIntegrationTest, + CheckCgroupv2EnabledSucceedsIfOnlyCgroupv2Mounted) { + SysFsCgroupDriver driver; + Status s = driver.CheckCgroupv2Enabled(); + ASSERT_TRUE(s.ok()) << s.ToString(); +} + +TEST_F(SysFsCgroupDriverIntegrationTest, + CheckCgroupFailsIfCgroupv2PathButNoReadPermissions) { + auto cgroup_dir_or_status = + TempCgroupDirectory::Create(testing_cgroup_->GetPath(), 0000); + ASSERT_TRUE(cgroup_dir_or_status.ok()) << cgroup_dir_or_status.ToString(); + auto cgroup_dir = std::move(cgroup_dir_or_status.value()); + SysFsCgroupDriver driver; + Status s = driver.CheckCgroup(cgroup_dir->GetPath()); + EXPECT_TRUE(s.IsPermissionDenied()) << s.ToString(); +} + +TEST_F(SysFsCgroupDriverIntegrationTest, + CheckCgroupFailsIfCgroupv2PathButNoWritePermissions) { + auto cgroup_dir_or_status = + TempCgroupDirectory::Create(testing_cgroup_->GetPath(), S_IRUSR); + ASSERT_TRUE(cgroup_dir_or_status.ok()) << cgroup_dir_or_status.ToString(); + auto cgroup_dir = std::move(cgroup_dir_or_status.value()); + SysFsCgroupDriver driver; + Status s = driver.CheckCgroup(cgroup_dir->GetPath()); + EXPECT_TRUE(s.IsPermissionDenied()) << s.ToString(); +} + +TEST_F(SysFsCgroupDriverIntegrationTest, + CheckCgroupFailsIfCgroupv2PathButNoExecPermissions) { + auto cgroup_dir_or_status = + TempCgroupDirectory::Create(testing_cgroup_->GetPath(), S_IRUSR | S_IWUSR); + ASSERT_TRUE(cgroup_dir_or_status.ok()) << cgroup_dir_or_status.ToString(); + auto cgroup_dir = std::move(cgroup_dir_or_status.value()); + SysFsCgroupDriver driver; + Status s = driver.CheckCgroup(cgroup_dir->GetPath()); + EXPECT_TRUE(s.IsPermissionDenied()) << s.ToString(); +} + +TEST_F(SysFsCgroupDriverIntegrationTest, + CheckCgroupSucceedsIfCgroupv2PathAndReadWriteExecPermissions) { + auto cgroup_dir_or_status = + TempCgroupDirectory::Create(testing_cgroup_->GetPath(), S_IRWXU); + ASSERT_TRUE(cgroup_dir_or_status.ok()) << cgroup_dir_or_status.ToString(); + auto cgroup_dir = std::move(cgroup_dir_or_status.value()); + SysFsCgroupDriver driver; + Status s = driver.CheckCgroup(cgroup_dir->GetPath()); + EXPECT_TRUE(s.ok()) << s.ToString(); +} + +TEST_F(SysFsCgroupDriverIntegrationTest, CreateCgroupFailsIfAlreadyExists) { + auto cgroup_dir_or_status = + TempCgroupDirectory::Create(testing_cgroup_->GetPath(), S_IRWXU); + ASSERT_TRUE(cgroup_dir_or_status.ok()) << cgroup_dir_or_status.ToString(); + auto cgroup_dir = std::move(cgroup_dir_or_status.value()); + SysFsCgroupDriver driver; + Status s = driver.CreateCgroup(cgroup_dir->GetPath()); + ASSERT_TRUE(s.IsAlreadyExists()) << s.ToString(); +} + +TEST_F(SysFsCgroupDriverIntegrationTest, CreateCgroupFailsIfAncestorCgroupDoesNotExist) { + auto cgroup_dir_or_status = + TempCgroupDirectory::Create(testing_cgroup_->GetPath(), S_IRWXU); + ASSERT_TRUE(cgroup_dir_or_status.ok()) << cgroup_dir_or_status.ToString(); + auto cgroup_dir = std::move(cgroup_dir_or_status.value()); + SysFsCgroupDriver driver; + std::string non_existent_path = cgroup_dir->GetPath() + + std::filesystem::path::preferred_separator + "no" + + std::filesystem::path::preferred_separator + "bueno"; + Status s = driver.CreateCgroup(non_existent_path); + EXPECT_TRUE(s.IsNotFound()) << s.ToString(); +} + +TEST_F(SysFsCgroupDriverIntegrationTest, CreateCgroupFailsIfOnlyReadPermissions) { + auto cgroup_dir_or_status = + TempCgroupDirectory::Create(testing_cgroup_->GetPath(), S_IRUSR); + ASSERT_TRUE(cgroup_dir_or_status.ok()) << cgroup_dir_or_status.ToString(); + auto cgroup_dir = std::move(cgroup_dir_or_status.value()); + SysFsCgroupDriver driver; + std::string child_cgroup_path = + cgroup_dir->GetPath() + std::filesystem::path::preferred_separator + "child"; + Status s = driver.CreateCgroup(child_cgroup_path); + EXPECT_TRUE(s.IsPermissionDenied()) << s.ToString(); +} + +TEST_F(SysFsCgroupDriverIntegrationTest, CreateCgroupFailsIfOnlyReadWritePermissions) { + auto cgroup_dir_or_status = + TempCgroupDirectory::Create(testing_cgroup_->GetPath(), S_IRUSR | S_IWUSR); + ASSERT_TRUE(cgroup_dir_or_status.ok()) << cgroup_dir_or_status.ToString(); + auto cgroup_dir = std::move(cgroup_dir_or_status.value()); + SysFsCgroupDriver driver; + std::string child_cgroup_path = + cgroup_dir->GetPath() + std::filesystem::path::preferred_separator + "child"; + Status s = driver.CreateCgroup(child_cgroup_path); + EXPECT_TRUE(s.IsPermissionDenied()) << s.ToString(); +} + +TEST_F(SysFsCgroupDriverIntegrationTest, + CreateCgroupSucceedsIfParentExistsAndReadWriteExecPermissions) { + auto cgroup_dir_or_status = + TempCgroupDirectory::Create(testing_cgroup_->GetPath(), S_IRWXU); + ASSERT_TRUE(cgroup_dir_or_status.ok()) << cgroup_dir_or_status.ToString(); + auto cgroup_dir = std::move(cgroup_dir_or_status.value()); + SysFsCgroupDriver driver; + std::string child_cgroup_path = + cgroup_dir->GetPath() + std::filesystem::path::preferred_separator + "child"; + Status s = driver.CreateCgroup(child_cgroup_path); + EXPECT_TRUE(s.ok()) << s.ToString(); + Status check_status = driver.CheckCgroup(child_cgroup_path); + EXPECT_TRUE(check_status.ok()) << check_status.ToString(); + ASSERT_EQ(rmdir(child_cgroup_path.c_str()), 0) + << "Failed to cleanup test cgroup at path " << child_cgroup_path << ".\n" + << "Error: " << strerror(errno); +} + +TEST_F(SysFsCgroupDriverIntegrationTest, + GetAvailableControllersFailsIfCgroupDoesNotExist) { + std::string non_existent_path = testing_cgroup_->GetPath() + + std::filesystem::path::preferred_separator + "no" + + std::filesystem::path::preferred_separator + "bueno"; + SysFsCgroupDriver driver; + StatusOr> s = + driver.GetAvailableControllers(non_existent_path); + EXPECT_TRUE(s.IsNotFound()) << s.ToString(); +} + +TEST_F(SysFsCgroupDriverIntegrationTest, + GetAvailableControllersFailsIfReadWriteButNotExecutePermissions) { + auto cgroup_dir_or_status = + TempCgroupDirectory::Create(testing_cgroup_->GetPath(), S_IRUSR | S_IWUSR); + ASSERT_TRUE(cgroup_dir_or_status.ok()) << cgroup_dir_or_status.ToString(); + std::unique_ptr cgroup_dir = + std::move(cgroup_dir_or_status.value()); + SysFsCgroupDriver driver; + StatusOr> s = + driver.GetAvailableControllers(cgroup_dir->GetPath()); + EXPECT_TRUE(s.IsPermissionDenied()) << s.ToString(); +} + +TEST_F(SysFsCgroupDriverIntegrationTest, + GetAvailableControllersSucceedsWithCPUAndMemoryControllersOnBaseCgroup) { + SysFsCgroupDriver driver; + StatusOr> s = + driver.GetAvailableControllers(testing_cgroup_->GetPath()); + EXPECT_TRUE(s.ok()) << s.ToString(); + std::unordered_set controllers = std::move(s.value()); + EXPECT_TRUE(controllers.find("cpu") != controllers.end()) + << "Cgroup integration tests expect the base cgroup at " << testing_cgroup_ + << " has the cpu controller available"; +} + +TEST_F(SysFsCgroupDriverIntegrationTest, + GetAvailableControllersSucceedsWithNoAvailableControllers) { + auto parent_cgroup_dir_or_status = + TempCgroupDirectory::Create(testing_cgroup_->GetPath(), S_IRWXU); + ASSERT_TRUE(parent_cgroup_dir_or_status.ok()) << parent_cgroup_dir_or_status.ToString(); + std::unique_ptr parent_cgroup = + std::move(parent_cgroup_dir_or_status.value()); + auto child_cgroup_dir_or_status = + TempCgroupDirectory::Create(parent_cgroup->GetPath(), S_IRWXU); + ASSERT_TRUE(child_cgroup_dir_or_status.ok()) << child_cgroup_dir_or_status.ToString(); + std::unique_ptr child_cgroup = + std::move(child_cgroup_dir_or_status.value()); + SysFsCgroupDriver driver; + StatusOr> s = + driver.GetAvailableControllers(child_cgroup->GetPath()); + EXPECT_TRUE(s.ok()) << s.ToString(); + std::unordered_set controllers = std::move(s.value()); + EXPECT_EQ(controllers.size(), 0); +} + +TEST_F(SysFsCgroupDriverIntegrationTest, MoveProcessesFailsIfSourceDoesntExist) { + auto ancestor_cgroup_or_status = + TempCgroupDirectory::Create(testing_cgroup_->GetPath(), S_IRWXU); + ASSERT_TRUE(ancestor_cgroup_or_status.ok()) << ancestor_cgroup_or_status.ToString(); + auto ancestor_cgroup = std::move(ancestor_cgroup_or_status.value()); + auto dest_cgroup_or_status = + TempCgroupDirectory::Create(ancestor_cgroup->GetPath(), S_IRWXU); + ASSERT_TRUE(dest_cgroup_or_status.ok()) << dest_cgroup_or_status.ToString(); + auto dest_cgroup = std::move(dest_cgroup_or_status.value()); + // Do not create the source cgroup + std::string non_existent_path = + ancestor_cgroup->GetPath() + std::filesystem::path::preferred_separator + "nope"; + SysFsCgroupDriver driver; + Status s = driver.MoveProcesses(non_existent_path, dest_cgroup->GetPath()); + EXPECT_TRUE(s.IsNotFound()) << s.ToString(); +} + +TEST_F(SysFsCgroupDriverIntegrationTest, MoveProcessesFailsIfDestDoesntExist) { + auto ancestor_cgroup_or_status = + TempCgroupDirectory::Create(testing_cgroup_->GetPath(), S_IRWXU); + ASSERT_TRUE(ancestor_cgroup_or_status.ok()) << ancestor_cgroup_or_status.ToString(); + auto ancestor_cgroup = std::move(ancestor_cgroup_or_status.value()); + auto source_cgroup_or_status = + TempCgroupDirectory::Create(ancestor_cgroup->GetPath(), S_IRWXU); + ASSERT_TRUE(source_cgroup_or_status.ok()) << source_cgroup_or_status.ToString(); + auto source_cgroup = std::move(source_cgroup_or_status.value()); + // Do not create the dest cgroup. + std::string non_existent_path = + ancestor_cgroup->GetPath() + std::filesystem::path::preferred_separator + "nope"; + SysFsCgroupDriver driver; + Status s = driver.MoveProcesses(source_cgroup->GetPath(), non_existent_path); + EXPECT_TRUE(s.IsNotFound()) << s.ToString(); +} + +TEST_F(SysFsCgroupDriverIntegrationTest, + MoveProcessesFailsIfNotReadWriteExecPermissionsForSource) { + auto ancestor_cgroup_or_status = + TempCgroupDirectory::Create(testing_cgroup_->GetPath(), S_IRWXU); + ASSERT_TRUE(ancestor_cgroup_or_status.ok()) << ancestor_cgroup_or_status.ToString(); + auto ancestor_cgroup = std::move(ancestor_cgroup_or_status.value()); + auto source_cgroup_or_status = + TempCgroupDirectory::Create(ancestor_cgroup->GetPath(), S_IRUSR | S_IWUSR); + ASSERT_TRUE(source_cgroup_or_status.ok()) << source_cgroup_or_status.ToString(); + auto source_cgroup = std::move(source_cgroup_or_status.value()); + auto dest_cgroup_or_status = + TempCgroupDirectory::Create(ancestor_cgroup->GetPath(), S_IRWXU); + ASSERT_TRUE(dest_cgroup_or_status.ok()) << dest_cgroup_or_status.ToString(); + auto dest_cgroup = std::move(dest_cgroup_or_status.value()); + SysFsCgroupDriver driver; + Status s = driver.MoveProcesses(source_cgroup->GetPath(), dest_cgroup->GetPath()); + EXPECT_TRUE(s.IsPermissionDenied()) << s.ToString(); +} + +TEST_F(SysFsCgroupDriverIntegrationTest, + MoveProcessesFailsIfNotReadWriteExecPermissionsForDest) { + auto ancestor_cgroup_or_status = + TempCgroupDirectory::Create(testing_cgroup_->GetPath(), S_IRWXU); + ASSERT_TRUE(ancestor_cgroup_or_status.ok()) << ancestor_cgroup_or_status.ToString(); + auto ancestor_cgroup = std::move(ancestor_cgroup_or_status.value()); + auto source_cgroup_or_status = + TempCgroupDirectory::Create(ancestor_cgroup->GetPath(), S_IRWXU); + ASSERT_TRUE(source_cgroup_or_status.ok()) << source_cgroup_or_status.ToString(); + auto source_cgroup = std::move(source_cgroup_or_status.value()); + auto dest_cgroup_or_status = + TempCgroupDirectory::Create(ancestor_cgroup->GetPath(), S_IRUSR | S_IWUSR); + ASSERT_TRUE(dest_cgroup_or_status.ok()) << dest_cgroup_or_status.ToString(); + auto dest_cgroup = std::move(dest_cgroup_or_status.value()); + SysFsCgroupDriver driver; + Status s = driver.MoveProcesses(source_cgroup->GetPath(), dest_cgroup->GetPath()); + EXPECT_TRUE(s.IsPermissionDenied()) << s.ToString(); +} + +TEST_F(SysFsCgroupDriverIntegrationTest, + MoveProcessesFailsIfNotReadWriteExecPermissionsForAncestor) { + auto ancestor_cgroup_or_status = + TempCgroupDirectory::Create(testing_cgroup_->GetPath(), S_IRWXU); + ASSERT_TRUE(ancestor_cgroup_or_status.ok()) << ancestor_cgroup_or_status.ToString(); + auto ancestor_cgroup = std::move(ancestor_cgroup_or_status.value()); + auto source_cgroup_or_status = + TempCgroupDirectory::Create(ancestor_cgroup->GetPath(), S_IRWXU); + ASSERT_TRUE(source_cgroup_or_status.ok()) << source_cgroup_or_status.ToString(); + auto source_cgroup = std::move(source_cgroup_or_status.value()); + auto dest_cgroup_or_status = + TempCgroupDirectory::Create(ancestor_cgroup->GetPath(), S_IRWXU); + ASSERT_TRUE(dest_cgroup_or_status.ok()) << dest_cgroup_or_status.ToString(); + auto dest_cgroup = std::move(dest_cgroup_or_status.value()); + ASSERT_EQ(chmod(ancestor_cgroup->GetPath().c_str(), S_IRUSR), 0) + << "Failed to chmod cgroup directory " << ancestor_cgroup->GetPath() + << "\n Error: " << strerror(errno); + SysFsCgroupDriver driver; + Status s = driver.MoveProcesses(source_cgroup->GetPath(), dest_cgroup->GetPath()); + EXPECT_TRUE(s.IsPermissionDenied()) << s.ToString(); + // Change the permissions back read, write, and execute so cgroup can be deleted. + ASSERT_EQ(chmod(ancestor_cgroup->GetPath().c_str(), S_IRWXU), 0) + << "Failed to chmod cgroup directory " << ancestor_cgroup->GetPath() + << "\n Error: " << strerror(errno); +} + +TEST_F(SysFsCgroupDriverIntegrationTest, + MoveProcessesSucceedsWithCorrectPermissionsAndValidCgroups) { + auto source_cgroup_or_status = + TempCgroupDirectory::Create(testing_cgroup_->GetPath(), S_IRWXU); + ASSERT_TRUE(source_cgroup_or_status.ok()) << source_cgroup_or_status.ToString(); + auto source_cgroup = std::move(source_cgroup_or_status.value()); + auto dest_cgroup_or_status = + TempCgroupDirectory::Create(testing_cgroup_->GetPath(), S_IRWXU); + ASSERT_TRUE(dest_cgroup_or_status.ok()) << dest_cgroup_or_status.ToString(); + auto dest_cgroup = std::move(dest_cgroup_or_status.value()); + StatusOr> child_process_s = + StartChildProcessInCgroup(source_cgroup->GetPath()); + ASSERT_TRUE(child_process_s.ok()) << child_process_s.ToString(); + auto [child_pid, child_pidfd] = child_process_s.value(); + SysFsCgroupDriver driver; + Status s = driver.MoveProcesses(source_cgroup->GetPath(), dest_cgroup->GetPath()); + ASSERT_TRUE(s.ok()) << s.ToString(); + // Assert that the child's pid is actually in the new file. + std::string dest_cgroup_procs_file_path = dest_cgroup->GetPath() + + std::filesystem::path::preferred_separator + + "cgroup.procs"; + std::ifstream dest_cgroup_procs_file(dest_cgroup_procs_file_path); + ASSERT_TRUE(dest_cgroup_procs_file.is_open()) + << "Could not open file " << dest_cgroup_procs_file_path << "."; + std::unordered_set dest_cgroup_pids; + int pid = -1; + while (dest_cgroup_procs_file >> pid) { + ASSERT_FALSE(dest_cgroup_procs_file.fail()) + << "Unable to read pid from file " << dest_cgroup_procs_file_path; + dest_cgroup_pids.emplace(pid); + } + EXPECT_EQ(dest_cgroup_pids.size(), 1); + EXPECT_TRUE(dest_cgroup_pids.find(child_pid) != dest_cgroup_pids.end()); + Status terminate_s = + TerminateChildProcessAndWaitForTimeout(child_pid, child_pidfd, 5000); + ASSERT_TRUE(terminate_s.ok()) << terminate_s.ToString(); +} + +TEST_F(SysFsCgroupDriverIntegrationTest, + EnableControllerFailsIfReadOnlyPermissionsForCgroup) { + auto cgroup_dir_or_status = + TempCgroupDirectory::Create(testing_cgroup_->GetPath(), S_IRUSR); + ASSERT_TRUE(cgroup_dir_or_status.ok()) << cgroup_dir_or_status.ToString(); + auto cgroup_dir = std::move(cgroup_dir_or_status.value()); + SysFsCgroupDriver driver; + Status s = driver.EnableController(cgroup_dir->GetPath(), "memory"); + ASSERT_TRUE(s.IsPermissionDenied()) << s.ToString(); +} + +TEST_F(SysFsCgroupDriverIntegrationTest, + EnableControllerFailsIfReadWriteOnlyPermissionsForCgroup) { + auto cgroup_dir_or_status = + TempCgroupDirectory::Create(testing_cgroup_->GetPath(), S_IRUSR | S_IWUSR); + ASSERT_TRUE(cgroup_dir_or_status.ok()) << cgroup_dir_or_status.ToString(); + auto cgroup_dir = std::move(cgroup_dir_or_status.value()); + SysFsCgroupDriver driver; + Status s = driver.EnableController(cgroup_dir->GetPath(), "memory"); + ASSERT_TRUE(s.IsPermissionDenied()) << s.ToString(); +} + +TEST_F(SysFsCgroupDriverIntegrationTest, EnableControllerFailsIfCgroupDoesNotExist) { + std::string non_existent_path = + testing_cgroup_->GetPath() + std::filesystem::path::preferred_separator + "nope"; + SysFsCgroupDriver driver; + Status s = driver.EnableController(non_existent_path, "memory"); + ASSERT_TRUE(s.IsNotFound()) << s.ToString(); +} + +TEST_F(SysFsCgroupDriverIntegrationTest, + EnableControllerFailsIfControllerNotAvailableForCgroup) { + // This will inherit controllers available because testing_cgroup_ has + // CPU and Memory controllers available. + auto cgroup_dir_or_status = + TempCgroupDirectory::Create(testing_cgroup_->GetPath(), S_IRWXU); + ASSERT_TRUE(cgroup_dir_or_status.ok()) << cgroup_dir_or_status.ToString(); + auto cgroup_dir = std::move(cgroup_dir_or_status.value()); + auto nested_cgroup_dir_or_status = + TempCgroupDirectory::Create(cgroup_dir->GetPath(), S_IRWXU); + ASSERT_TRUE(nested_cgroup_dir_or_status.ok()) << nested_cgroup_dir_or_status.ToString(); + auto nested_cgroup_dir = std::move(nested_cgroup_dir_or_status.value()); + // Make sure that the cgroup has 0 available controllers. + // TODO(irabbani): I think it's okay to call the GetAvailableControllers function + // from here. + SysFsCgroupDriver driver; + auto available_controllers_s = + driver.GetAvailableControllers(nested_cgroup_dir->GetPath()); + ASSERT_TRUE(available_controllers_s.ok()) << available_controllers_s.ToString(); + auto available_controllers = std::move(available_controllers_s.value()); + ASSERT_EQ(available_controllers.size(), 0); + Status s = driver.EnableController(nested_cgroup_dir->GetPath(), "memory"); + ASSERT_TRUE(s.IsInvalidArgument()) << s.ToString(); +} + +TEST_F(SysFsCgroupDriverIntegrationTest, DisableControllerFailsIfControllerNotEnabled) { + auto cgroup_dir_or_status = + TempCgroupDirectory::Create(testing_cgroup_->GetPath(), S_IRWXU); + ASSERT_TRUE(cgroup_dir_or_status.ok()) << cgroup_dir_or_status.ToString(); + auto cgroup_dir = std::move(cgroup_dir_or_status.value()); + SysFsCgroupDriver driver; + auto enabled_controllers_s = driver.GetEnabledControllers(cgroup_dir->GetPath()); + ASSERT_TRUE(enabled_controllers_s.ok()) << enabled_controllers_s.ToString(); + auto enabled_controllers = std::move(enabled_controllers_s.value()); + ASSERT_EQ(enabled_controllers.size(), 0); + Status s = driver.DisableController(cgroup_dir->GetPath(), "memory"); + ASSERT_TRUE(s.IsInvalidArgument()) << s.ToString(); +} + +TEST_F(SysFsCgroupDriverIntegrationTest, + DisableControllerFailsIfReadOnlyPermissionsForCgroup) { + auto cgroup_dir_or_status = + TempCgroupDirectory::Create(testing_cgroup_->GetPath(), S_IRUSR); + ASSERT_TRUE(cgroup_dir_or_status.ok()) << cgroup_dir_or_status.ToString(); + auto cgroup_dir = std::move(cgroup_dir_or_status.value()); + SysFsCgroupDriver driver; + Status s = driver.DisableController(cgroup_dir->GetPath(), "memory"); + ASSERT_TRUE(s.IsPermissionDenied()) << s.ToString(); +} + +TEST_F(SysFsCgroupDriverIntegrationTest, + DisableControllerFailsIfReadWriteOnlyPermissionsForCgroup) { + auto cgroup_dir_or_status = + TempCgroupDirectory::Create(testing_cgroup_->GetPath(), S_IRUSR | S_IWUSR); + ASSERT_TRUE(cgroup_dir_or_status.ok()) << cgroup_dir_or_status.ToString(); + auto cgroup_dir = std::move(cgroup_dir_or_status.value()); + SysFsCgroupDriver driver; + Status s = driver.DisableController(cgroup_dir->GetPath(), "memory"); + ASSERT_TRUE(s.IsPermissionDenied()) << s.ToString(); +} + +TEST_F(SysFsCgroupDriverIntegrationTest, DisableControllerFailsIfCgroupDoesNotExist) { + std::string non_existent_path = + testing_cgroup_->GetPath() + std::filesystem::path::preferred_separator + "nope"; + SysFsCgroupDriver driver; + Status s = driver.DisableController(non_existent_path, "memory"); + ASSERT_TRUE(s.IsNotFound()) << s.ToString(); +} + +//// /sys/fs/cgroup/testing (2 available, enable cpu controller) +//// enable one in /sys/fs/cgroup/testing/inner (enable cpu controller fails, then +/// succeeds) / enable one in /sys/fs/cgroup/testing/inner / disable one in +////sys/fs/cgroup/testing +TEST_F(SysFsCgroupDriverIntegrationTest, + EnableAndDisableControllerSucceedWithCorrectInputAndPermissions) { + auto parent_cgroup_dir_or_status = + TempCgroupDirectory::Create(testing_cgroup_->GetPath(), S_IRWXU); + ASSERT_TRUE(parent_cgroup_dir_or_status.ok()) << parent_cgroup_dir_or_status.ToString(); + auto parent_cgroup_dir = std::move(parent_cgroup_dir_or_status.value()); + auto child_cgroup_dir_or_status = + TempCgroupDirectory::Create(parent_cgroup_dir->GetPath(), S_IRWXU); + ASSERT_TRUE(child_cgroup_dir_or_status.ok()) << child_cgroup_dir_or_status.ToString(); + auto child_cgroup_dir = std::move(child_cgroup_dir_or_status.value()); + SysFsCgroupDriver driver; + + // There should be no enabled controllers on the parent cgroup so enabling the memory + // controller should fail. + Status invalid_argument_s = driver.EnableController(child_cgroup_dir->GetPath(), "cpu"); + ASSERT_TRUE(invalid_argument_s.IsInvalidArgument()) << invalid_argument_s.ToString(); + + // Enable the controller on the parent cgroup to make it available on the child cgroup. + Status enable_parent_s = driver.EnableController(parent_cgroup_dir->GetPath(), "cpu"); + ASSERT_TRUE(enable_parent_s.ok()) << enable_parent_s.ToString(); + + // Enable the controller on the child cgroup. + Status enable_child_s = driver.EnableController(child_cgroup_dir->GetPath(), "cpu"); + ASSERT_TRUE(enable_child_s.ok()) << enable_child_s.ToString(); + + // Cannot disable the controller on the parent cgroup while the child cgroup + // still has it enabled. + Status disable_parent_failure_s = + driver.DisableController(parent_cgroup_dir->GetPath(), "cpu"); + ASSERT_FALSE(disable_parent_failure_s.ok()) << enable_parent_s.ToString(); + // Disable the controller on the child cgroup. + Status disable_child_s = driver.DisableController(child_cgroup_dir->GetPath(), "cpu"); + ASSERT_TRUE(disable_child_s.ok()) << disable_child_s.ToString(); + // Can now disable the controller on the parent cgroup. + Status disable_parent_success_s = + driver.DisableController(parent_cgroup_dir->GetPath(), "cpu"); + ASSERT_TRUE(disable_parent_success_s.ok()) << disable_parent_success_s.ToString(); +} + +TEST_F(SysFsCgroupDriverIntegrationTest, AddResourceConstraintFailsIfCgroupDoesntExist) { + std::string non_existent_path = + testing_cgroup_->GetPath() + std::filesystem::path::preferred_separator + "nope"; + SysFsCgroupDriver driver; + Status s = driver.AddConstraint(non_existent_path, "memory.min", "1"); + ASSERT_TRUE(s.IsNotFound()) << s.ToString(); +} + +TEST_F(SysFsCgroupDriverIntegrationTest, + AddResourceConstraintFailsIfReadOnlyPermissions) { + auto cgroup_or_status = + TempCgroupDirectory::Create(testing_cgroup_->GetPath(), S_IRUSR); + ASSERT_TRUE(cgroup_or_status.ok()) << cgroup_or_status.ToString(); + auto cgroup = std::move(cgroup_or_status.value()); + SysFsCgroupDriver driver; + Status s = driver.AddConstraint(cgroup->GetPath(), "memory.min", "1"); + ASSERT_TRUE(s.IsPermissionDenied()) << s.ToString(); +} + +TEST_F(SysFsCgroupDriverIntegrationTest, + AddResourceConstraintFailsIfReadWriteOnlyPermissions) { + auto cgroup_or_status = + TempCgroupDirectory::Create(testing_cgroup_->GetPath(), S_IRUSR | S_IWUSR); + ASSERT_TRUE(cgroup_or_status.ok()) << cgroup_or_status.ToString(); + auto cgroup = std::move(cgroup_or_status.value()); + SysFsCgroupDriver driver; + Status s = driver.AddConstraint(cgroup->GetPath(), "memory.min", "1"); + ASSERT_TRUE(s.IsPermissionDenied()) << s.ToString(); +} + +TEST_F(SysFsCgroupDriverIntegrationTest, + AddResourceConstraintFailsIfConstraintNotSupported) { + auto cgroup_or_status = + TempCgroupDirectory::Create(testing_cgroup_->GetPath(), S_IRWXU); + ASSERT_TRUE(cgroup_or_status.ok()) << cgroup_or_status.ToString(); + auto cgroup = std::move(cgroup_or_status.value()); + SysFsCgroupDriver driver; + // "memory.max" is not supported. + Status s = driver.AddConstraint(cgroup->GetPath(), "memory.max", "1"); + ASSERT_TRUE(s.IsInvalidArgument()) << s.ToString(); +} +TEST_F(SysFsCgroupDriverIntegrationTest, + AddResourceConstraintFailsIfControllerNotEnabled) { + auto cgroup_or_status = + TempCgroupDirectory::Create(testing_cgroup_->GetPath(), S_IRWXU); + ASSERT_TRUE(cgroup_or_status.ok()) << cgroup_or_status.ToString(); + auto cgroup = std::move(cgroup_or_status.value()); + SysFsCgroupDriver driver; + // Memory controller is not enabled. + Status s = driver.AddConstraint(cgroup->GetPath(), "memory.min", "1"); + ASSERT_TRUE(s.IsInvalidArgument()) << s.ToString(); +} +TEST_F(SysFsCgroupDriverIntegrationTest, + AddResourceConstraintFailsIfInvalidConstraintValue) { + auto cgroup_or_status = + TempCgroupDirectory::Create(testing_cgroup_->GetPath(), S_IRWXU); + ASSERT_TRUE(cgroup_or_status.ok()) << cgroup_or_status.ToString(); + auto cgroup = std::move(cgroup_or_status.value()); + SysFsCgroupDriver driver; + // Enable the cpu controller first. + Status enable_controller_s = driver.EnableController(cgroup->GetPath(), "cpu"); + ASSERT_TRUE(enable_controller_s.ok()) << enable_controller_s.ToString(); + // cpu.weight must be between [1,10000] + Status s_too_low = driver.AddConstraint(cgroup->GetPath(), "cpu.weight", "0"); + ASSERT_TRUE(s_too_low.IsInvalidArgument()) << s_too_low.ToString(); + Status s_too_high = driver.AddConstraint(cgroup->GetPath(), "cpu.weight", "10001"); + ASSERT_TRUE(s_too_high.IsInvalidArgument()) << s_too_high.ToString(); +} + +TEST_F(SysFsCgroupDriverIntegrationTest, AddResourceConstraintSucceeds) { + auto cgroup_or_status = + TempCgroupDirectory::Create(testing_cgroup_->GetPath(), S_IRWXU); + ASSERT_TRUE(cgroup_or_status.ok()) << cgroup_or_status.ToString(); + auto cgroup = std::move(cgroup_or_status.value()); + SysFsCgroupDriver driver; + // Enable the cpu controller first. + Status enable_controller_s = driver.EnableController(cgroup->GetPath(), "cpu"); + ASSERT_TRUE(enable_controller_s.ok()) << enable_controller_s.ToString(); + // cpu.weight must be between [1,10000] + Status s = driver.AddConstraint(cgroup->GetPath(), "cpu.weight", "500"); + ASSERT_TRUE(s.ok()) << s.ToString(); +} +}; // namespace ray diff --git a/src/ray/common/cgroup2/test/sysfs_cgroup_driver_test.cc b/src/ray/common/cgroup2/test/sysfs_cgroup_driver_test.cc new file mode 100644 index 000000000000..608ce953c5f9 --- /dev/null +++ b/src/ray/common/cgroup2/test/sysfs_cgroup_driver_test.cc @@ -0,0 +1,118 @@ +#include "ray/common/cgroup2/sysfs_cgroup_driver.h" + +#include +#include +#include +#include + +#include "gtest/gtest.h" +#include "ray/common/cgroup2/test/cgroup_test_utils.h" +#include "ray/common/status.h" +#include "ray/common/status_or.h" + +namespace ray { + +TEST(SysFsCgroupDriverTest, CheckCgroupv2EnabledFailsIfEmptyMountFile) { + TempFile temp_mount_file; + SysFsCgroupDriver driver(temp_mount_file.GetPath()); + Status s = driver.CheckCgroupv2Enabled(); + EXPECT_TRUE(s.IsInvalid()) << s.ToString(); +} + +TEST(SysFsCgroupDriverTest, CheckCgroupv2EnabledFailsIfMalformedMountFile) { + TempFile temp_mount_file; + temp_mount_file.AppendLine("cgroup /sys/fs/cgroup rw 0 0\n"); + temp_mount_file.AppendLine("cgroup2 /sys/fs/cgroup/unified/ rw 0 0\n"); + temp_mount_file.AppendLine("oopsie"); + SysFsCgroupDriver driver(temp_mount_file.GetPath()); + Status s = driver.CheckCgroupv2Enabled(); + EXPECT_TRUE(s.IsInvalid()) << s.ToString(); +} + +TEST(SysFsCgroupDriverTest, + CheckCgroupv2EnabledFailsIfCgroupv1MountedAndCgroupv2NotMounted) { + TempFile temp_mount_file; + temp_mount_file.AppendLine("cgroup /sys/fs/cgroup rw 0 0\n"); + SysFsCgroupDriver driver(temp_mount_file.GetPath()); + Status s = driver.CheckCgroupv2Enabled(); + ASSERT_TRUE(s.IsInvalid()) << s.ToString(); +} + +TEST(SysFsCgroupDriverTest, + CheckCgroupv2EnabledFailsIfCgroupv1MountedAndCgroupv2Mounted) { + TempFile temp_mount_file; + temp_mount_file.AppendLine("cgroup /sys/fs/cgroup rw 0 0\n"); + temp_mount_file.AppendLine("cgroup2 /sys/fs/cgroup/unified/ rw 0 0\n"); + SysFsCgroupDriver driver(temp_mount_file.GetPath()); + Status s = driver.CheckCgroupv2Enabled(); + ASSERT_TRUE(s.IsInvalid()) << s.ToString(); +} + +TEST(SysFsCgroupDriverTest, CheckCgroupv2EnabledSucceedsIfOnlyCgroupv2Mounted) { + TempFile temp_mount_file; + temp_mount_file.AppendLine("cgroup2 /sys/fs/cgroup rw 0 0\n"); + SysFsCgroupDriver driver(temp_mount_file.GetPath()); + Status s = driver.CheckCgroupv2Enabled(); + EXPECT_TRUE(s.ok()) << s.ToString(); +} + +TEST(SysFsCgroupDriver, CheckCgroupFailsIfNotCgroupv2Path) { + // This is not a directory on the cgroupv2 vfs. + auto temp_dir_or_status = TempDirectory::Create(); + ASSERT_TRUE(temp_dir_or_status.ok()) << temp_dir_or_status.ToString(); + std::unique_ptr temp_dir = std::move(temp_dir_or_status.value()); + SysFsCgroupDriver driver; + Status s = driver.CheckCgroup(temp_dir->GetPath()); + EXPECT_TRUE(s.IsInvalidArgument()) << s.ToString(); +} + +TEST(SysFsCgroupDriver, CheckCgroupFailsIfCgroupDoesNotExist) { + // This is not a directory on the cgroupv2 vfs. + SysFsCgroupDriver driver; + Status s = driver.CheckCgroup("/some/path/that/doesnt/exist"); + EXPECT_TRUE(s.IsNotFound()) << s.ToString(); +} + +TEST(SysFsCgroupDriver, GetAvailableControllersFailsIfNotCgroup2Path) { + auto temp_dir_or_status = TempDirectory::Create(); + ASSERT_TRUE(temp_dir_or_status.ok()) << temp_dir_or_status.ToString(); + std::unique_ptr temp_dir = std::move(temp_dir_or_status.value()); + std::filesystem::path controller_file_path = + std::filesystem::path(temp_dir->GetPath()) / + std::filesystem::path("cgroup.controllers"); + TempFile controller_file(controller_file_path); + controller_file.AppendLine("cpuset cpu io memory hugetlb pids rdma misc"); + SysFsCgroupDriver driver; + StatusOr> s = + driver.GetAvailableControllers(temp_dir->GetPath()); + EXPECT_TRUE(s.IsInvalidArgument()) << s.ToString(); +} + +TEST(SysFsCgroupDriver, EnableControllerFailsIfNotCgroupv2Path) { + auto temp_dir_or_status = TempDirectory::Create(); + ASSERT_TRUE(temp_dir_or_status.ok()) << temp_dir_or_status.ToString(); + std::unique_ptr temp_dir = std::move(temp_dir_or_status.value()); + SysFsCgroupDriver driver; + Status s = driver.EnableController(temp_dir->GetPath(), "cpu"); + ASSERT_TRUE(s.IsInvalidArgument()) << s.ToString(); +} + +TEST(SysFsCgroupDriver, DisableControllerFailsIfNotCgroupv2Path) { + auto temp_dir_or_status = TempDirectory::Create(); + ASSERT_TRUE(temp_dir_or_status.ok()) << temp_dir_or_status.ToString(); + std::unique_ptr temp_dir = std::move(temp_dir_or_status.value()); + SysFsCgroupDriver driver; + Status s = driver.DisableController(temp_dir->GetPath(), "cpu"); + ASSERT_TRUE(s.IsInvalidArgument()) << s.ToString(); +} + +TEST(SysFsCgroupDriver, AddConstraintFailsIfNotCgroupv2Path) { + auto temp_dir_or_status = TempDirectory::Create(); + ASSERT_TRUE(temp_dir_or_status.ok()) << temp_dir_or_status.ToString(); + std::unique_ptr temp_dir = std::move(temp_dir_or_status.value()); + SysFsCgroupDriver driver; + Status s = driver.AddConstraint(temp_dir->GetPath(), "memory.min", "1"); + ASSERT_TRUE(s.IsInvalidArgument()) << s.ToString(); +} + +}; // namespace ray diff --git a/src/ray/common/status.cc b/src/ray/common/status.cc index 3e8f51261585..3500ddaf3b80 100644 --- a/src/ray/common/status.cc +++ b/src/ray/common/status.cc @@ -78,6 +78,7 @@ const absl::flat_hash_map kCodeToStr = { {StatusCode::InvalidArgument, "InvalidArgument"}, {StatusCode::ChannelError, "ChannelError"}, {StatusCode::ChannelTimeoutError, "ChannelTimeoutError"}, + {StatusCode::PermissionDenied, "PermissionDenied"}, }; const absl::flat_hash_map kStrToCode = []() { diff --git a/src/ray/common/status.h b/src/ray/common/status.h index cb91a4267693..e7224badb513 100644 --- a/src/ray/common/status.h +++ b/src/ray/common/status.h @@ -76,6 +76,8 @@ enum class StatusCode : char { IntentionalSystemExit = 14, UnexpectedSystemExit = 15, CreationTaskError = 16, + // Indicates that the caller request a resource that could not be found. A common + // example is that a request file does not exist. NotFound = 17, Disconnected = 18, SchedulingCancelled = 19, @@ -101,6 +103,9 @@ enum class StatusCode : char { ChannelError = 35, // Indicates that a read or write on a channel (a mutable plasma object) timed out. ChannelTimeoutError = 36, + // Indicates that the executing user does not have permissions to perform the + // requested operation. A common example is filesystem permissions. + PermissionDenied = 37, // If you add to this list, please also update kCodeToStr in status.cc. }; @@ -254,6 +259,10 @@ class RAY_EXPORT Status { return Status(StatusCode::ChannelTimeoutError, msg); } + static Status PermissionDenied(const std::string &msg) { + return Status(StatusCode::PermissionDenied, msg); + } + static StatusCode StringToCode(const std::string &str); // Returns true iff the status indicates success. @@ -308,6 +317,7 @@ class RAY_EXPORT Status { bool IsChannelError() const { return code() == StatusCode::ChannelError; } bool IsChannelTimeoutError() const { return code() == StatusCode::ChannelTimeoutError; } + bool IsPermissionDenied() const { return code() == StatusCode::PermissionDenied; } // Return a string representation of this status suitable for printing. // Returns the string "OK" for success. diff --git a/src/ray/common/status_or.h b/src/ray/common/status_or.h index 88eb99a7a386..991b03961df0 100644 --- a/src/ray/common/status_or.h +++ b/src/ray/common/status_or.h @@ -153,6 +153,12 @@ class StatusOr { std::string StatusString() const { return status_.StatusString(); } + std::string ToString() const { return status_.ToString(); } + + bool IsNotFound() const { return code() == StatusCode::NotFound; } + bool IsInvalidArgument() const { return code() == StatusCode::InvalidArgument; } + bool IsPermissionDenied() const { return code() == StatusCode::PermissionDenied; } + // Returns a reference to the current `ray::Status` contained within the // `ray::StatusOr`. If `ray::StatusOr` contains a `T`, then this // function returns `ray::Ok()`. From 645f9a06dd0d1e8e6224f90ce7f142d266da99e9 Mon Sep 17 00:00:00 2001 From: irabbani Date: Thu, 24 Jul 2025 21:28:38 +0000 Subject: [PATCH 02/20] adding the copyright Signed-off-by: irabbani --- src/ray/common/cgroup2/sysfs_cgroup_driver.cc | 14 ++++++++++++++ src/ray/common/cgroup2/sysfs_cgroup_driver.h | 14 ++++++++++++++ src/ray/common/cgroup2/test/cgroup_test_utils.cc | 14 ++++++++++++++ src/ray/common/cgroup2/test/cgroup_test_utils.h | 14 ++++++++++++++ .../test/sysfs_cgroup_driver_integration_test.cc | 14 ++++++++++++++ .../cgroup2/test/sysfs_cgroup_driver_test.cc | 14 ++++++++++++++ 6 files changed, 84 insertions(+) diff --git a/src/ray/common/cgroup2/sysfs_cgroup_driver.cc b/src/ray/common/cgroup2/sysfs_cgroup_driver.cc index 40b1912823d6..7bafa95c763b 100644 --- a/src/ray/common/cgroup2/sysfs_cgroup_driver.cc +++ b/src/ray/common/cgroup2/sysfs_cgroup_driver.cc @@ -1,3 +1,17 @@ +// Copyright 2025 The Ray Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + #include "ray/common/cgroup2/sysfs_cgroup_driver.h" #include diff --git a/src/ray/common/cgroup2/sysfs_cgroup_driver.h b/src/ray/common/cgroup2/sysfs_cgroup_driver.h index 39e1d1e90702..fb3354517a5c 100644 --- a/src/ray/common/cgroup2/sysfs_cgroup_driver.h +++ b/src/ray/common/cgroup2/sysfs_cgroup_driver.h @@ -1,3 +1,17 @@ +// Copyright 2025 The Ray Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + #include #include diff --git a/src/ray/common/cgroup2/test/cgroup_test_utils.cc b/src/ray/common/cgroup2/test/cgroup_test_utils.cc index 13b0345ce92d..08281b6a493b 100644 --- a/src/ray/common/cgroup2/test/cgroup_test_utils.cc +++ b/src/ray/common/cgroup2/test/cgroup_test_utils.cc @@ -1,3 +1,17 @@ +// Copyright 2025 The Ray Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + #include "ray/common/cgroup2/test/cgroup_test_utils.h" #include diff --git a/src/ray/common/cgroup2/test/cgroup_test_utils.h b/src/ray/common/cgroup2/test/cgroup_test_utils.h index edb10279ef87..ef79dd183741 100644 --- a/src/ray/common/cgroup2/test/cgroup_test_utils.h +++ b/src/ray/common/cgroup2/test/cgroup_test_utils.h @@ -1,3 +1,17 @@ +// Copyright 2025 The Ray Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + #include #include diff --git a/src/ray/common/cgroup2/test/sysfs_cgroup_driver_integration_test.cc b/src/ray/common/cgroup2/test/sysfs_cgroup_driver_integration_test.cc index 418bcde4914f..377fc5adb7b4 100644 --- a/src/ray/common/cgroup2/test/sysfs_cgroup_driver_integration_test.cc +++ b/src/ray/common/cgroup2/test/sysfs_cgroup_driver_integration_test.cc @@ -1,3 +1,17 @@ +// Copyright 2025 The Ray Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + #include #include #include diff --git a/src/ray/common/cgroup2/test/sysfs_cgroup_driver_test.cc b/src/ray/common/cgroup2/test/sysfs_cgroup_driver_test.cc index 608ce953c5f9..275a122e808f 100644 --- a/src/ray/common/cgroup2/test/sysfs_cgroup_driver_test.cc +++ b/src/ray/common/cgroup2/test/sysfs_cgroup_driver_test.cc @@ -1,3 +1,17 @@ +// Copyright 2025 The Ray Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + #include "ray/common/cgroup2/sysfs_cgroup_driver.h" #include From 2bb2c5bc384ae588ba87da5b0f6e31b066a63edd Mon Sep 17 00:00:00 2001 From: irabbani Date: Thu, 24 Jul 2025 22:52:33 +0000 Subject: [PATCH 03/20] Adding a fallback for creating processes inside cgroups with fork/exec instead of clone for older kernel headers < 5.7 (which is what we have in CI) Signed-off-by: irabbani --- .../common/cgroup2/cgroup_driver_interface.h | 1 + .../common/cgroup2/test/cgroup_test_utils.cc | 68 +++++++++++++++++++ 2 files changed, 69 insertions(+) diff --git a/src/ray/common/cgroup2/cgroup_driver_interface.h b/src/ray/common/cgroup2/cgroup_driver_interface.h index 3178a8c27008..93fedc5f8ee0 100644 --- a/src/ray/common/cgroup2/cgroup_driver_interface.h +++ b/src/ray/common/cgroup2/cgroup_driver_interface.h @@ -14,6 +14,7 @@ #pragma once #include +#include #include #include "ray/common/status.h" diff --git a/src/ray/common/cgroup2/test/cgroup_test_utils.cc b/src/ray/common/cgroup2/test/cgroup_test_utils.cc index 08281b6a493b..0aa91ad50c73 100644 --- a/src/ray/common/cgroup2/test/cgroup_test_utils.cc +++ b/src/ray/common/cgroup2/test/cgroup_test_utils.cc @@ -102,6 +102,7 @@ ray::StatusOr> TempDirectory::Create() { std::make_unique(std::move(path)); return ray::StatusOr>(std::move(temp_dir)); } + TempDirectory::~TempDirectory() { std::filesystem::remove_all(path_); } ray::Status TerminateChildProcessAndWaitForTimeout(pid_t pid, int fd, int timeout_ms) { @@ -148,6 +149,9 @@ ray::Status TerminateChildProcessAndWaitForTimeout(pid_t pid, int fd, int timeou }; return ray::Status::OK(); } + +#ifdef FALSE + ray::StatusOr> StartChildProcessInCgroup( const std::string &cgroup_path) { int cgroup_fd = open(cgroup_path.c_str(), O_RDONLY); @@ -197,6 +201,70 @@ ray::StatusOr> StartChildProcessInCgroup( return ray::StatusOr>({child_pid, static_cast(child_pidfd)}); } +#else +// Fallback for older kernels. Uses fork/exec instead. +ray::StatusOr> StartChildProcessInCgroup( + const std::string &cgroup_path) { + int new_pid = fork(); + + if (new_pid == -1) { + return ray::Status::Invalid( + absl::StrFormat("Failed to fork process with error %s", strerror(errno))); + } + + if (new_pid == 0) { + // Child process will pause and wait for parent to terminate and reap it. + RAY_LOG(ERROR) << "Spawned child in cgroup " << cgroup_path << " with PID " + << getpid(); + pause(); + RAY_LOG(ERROR) << "Unpaused child in cgroup " << cgroup_path << " with PID " + << getpid(); + _exit(0); + } + + std::string cgroup_proc_file_path = cgroup_path + "/cgroup.procs"; + + // Parent process has to move the process into a cgroup. + int cgroup_fd = open(cgroup_proc_file_path.c_str(), O_RDWR); + + if (cgroup_fd == -1) { + return ray::Status::Invalid( + absl::StrFormat("Failed to open cgroup procs file %s with error %s", + cgroup_proc_file_path, + strerror(errno))); + } + + std::string pid_to_write = std::to_string(new_pid); + + if (write(cgroup_fd, pid_to_write.c_str(), pid_to_write.size()) == -1) { + // Best effort killing of the child process. + kill(SIGKILL, new_pid); + close(cgroup_fd); + return ray::Status::Invalid( + absl::StrFormat("Failed to write pid %i to cgroup procs file %s with error %s", + new_pid, + cgroup_proc_file_path, + strerror(errno))); + } + + close(cgroup_fd); + + int child_pidfd = (int)syscall(SYS_pidfd_open, new_pid, 0); + if (child_pidfd == -1) { + // Best effort killing of the child process. + kill(SIGKILL, new_pid); + close(cgroup_fd); + return ray::Status::Invalid( + absl::StrFormat("Failed to create process fd for pid %i with error %s", + new_pid, + strerror(errno))); + } + + return ray::StatusOr>({new_pid, child_pidfd}); +} + +#endif + TempFile::TempFile(std::string path) { path_ = path; fd_ = open(path_.c_str(), O_RDWR | O_CREAT, S_IRUSR | S_IWUSR); // NOLINT From 47930941353f640bebb221dffa732306adaaf0b7 Mon Sep 17 00:00:00 2001 From: irabbani Date: Fri, 25 Jul 2025 16:57:52 +0000 Subject: [PATCH 04/20] adding a pause in the tests to see what's up with the container Signed-off-by: irabbani --- src/ray/common/cgroup2/cgroup_driver_interface.h | 6 +++--- src/ray/common/cgroup2/sysfs_cgroup_driver.h | 2 +- src/ray/common/cgroup2/test/cgroup_test_utils.cc | 2 +- .../cgroup2/test/sysfs_cgroup_driver_integration_test.cc | 3 ++- 4 files changed, 7 insertions(+), 6 deletions(-) diff --git a/src/ray/common/cgroup2/cgroup_driver_interface.h b/src/ray/common/cgroup2/cgroup_driver_interface.h index 93fedc5f8ee0..54f1048c65cc 100644 --- a/src/ray/common/cgroup2/cgroup_driver_interface.h +++ b/src/ray/common/cgroup2/cgroup_driver_interface.h @@ -137,7 +137,7 @@ class CgroupDriverInterface { 1) the cgroup must have the relevant controller enabled e.g. memory.min cannot be enabled if the memory controller is not enabled. 2) the constraint must be supported in Ray (@see supported_constraints_). - 3) the constraint value must be in the correct range (@see supported_constriants_). + 3) the constraint value must be in the correct range (@see supported_constraints_). @param cgroup is an absolute path to the cgroup. @param constraint the name of the constraint. @@ -183,14 +183,14 @@ class CgroupDriverInterface { const std::string &cgroup) = 0; struct Constraint { - std::pair range; + std::pair range; std::string controller; }; protected: const std::unordered_map supported_constraints_ = { {"cpu.weight", {{1, 10000}, "cpu"}}, - {"memory.min", {{0, std::numeric_limits::max()}, "memory"}}, + {"memory.min", {{0, std::numeric_limits::max()}, "memory"}}, }; const std::unordered_set supported_controllers_ = {"cpu", "memory"}; }; diff --git a/src/ray/common/cgroup2/sysfs_cgroup_driver.h b/src/ray/common/cgroup2/sysfs_cgroup_driver.h index fb3354517a5c..c68c0710e935 100644 --- a/src/ray/common/cgroup2/sysfs_cgroup_driver.h +++ b/src/ray/common/cgroup2/sysfs_cgroup_driver.h @@ -25,7 +25,7 @@ // Used to identify if a filesystem is mounted using cgroupv2. // See: https://docs.kernel.org/admin-guide/cgroup-v2.html#mounting #ifndef CGROUP2_SUPER_MAGIC -#define CGROUP2_SUPER_MAGIC = 0x63677270 +#define CGROUP2_SUPER_MAGIC 0x63677270 #endif namespace ray { diff --git a/src/ray/common/cgroup2/test/cgroup_test_utils.cc b/src/ray/common/cgroup2/test/cgroup_test_utils.cc index 0aa91ad50c73..05c5dae76d91 100644 --- a/src/ray/common/cgroup2/test/cgroup_test_utils.cc +++ b/src/ray/common/cgroup2/test/cgroup_test_utils.cc @@ -108,7 +108,7 @@ TempDirectory::~TempDirectory() { std::filesystem::remove_all(path_); } ray::Status TerminateChildProcessAndWaitForTimeout(pid_t pid, int fd, int timeout_ms) { if (kill(pid, SIGTERM) == -1) { return ray::Status::Invalid( - absl::StrFormat("Failed to send SIGKILL to pid: %i.\n" + absl::StrFormat("Failed to send SIGTERM to pid: %i.\n" "Error: %s", pid, strerror(errno))); diff --git a/src/ray/common/cgroup2/test/sysfs_cgroup_driver_integration_test.cc b/src/ray/common/cgroup2/test/sysfs_cgroup_driver_integration_test.cc index 377fc5adb7b4..57a7fe5300a4 100644 --- a/src/ray/common/cgroup2/test/sysfs_cgroup_driver_integration_test.cc +++ b/src/ray/common/cgroup2/test/sysfs_cgroup_driver_integration_test.cc @@ -105,6 +105,7 @@ class SysFsCgroupDriverIntegrationTest : public ::testing::Test { cleanup will happen inside the TearDownTestSuite method. */ static void SetUpTestSuite() { + pause(); // 1) Create the testing_cgroup_ under base_cgroup. This is the root node of the // cgroup subtree used by the tests in this file. auto testing_cgroup_or_status = @@ -784,4 +785,4 @@ TEST_F(SysFsCgroupDriverIntegrationTest, AddResourceConstraintSucceeds) { Status s = driver.AddConstraint(cgroup->GetPath(), "cpu.weight", "500"); ASSERT_TRUE(s.ok()) << s.ToString(); } -}; // namespace ray +} // namespace ray From 85d0ebffb3a9de39b44629e1ef9880d050b4942e Mon Sep 17 00:00:00 2001 From: Ibrahim Rabbani Date: Fri, 25 Jul 2025 10:00:21 -0700 Subject: [PATCH 05/20] Update src/ray/common/cgroup2/cgroup_driver_interface.h Co-authored-by: Edward Oakes Signed-off-by: Ibrahim Rabbani --- src/ray/common/cgroup2/cgroup_driver_interface.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/ray/common/cgroup2/cgroup_driver_interface.h b/src/ray/common/cgroup2/cgroup_driver_interface.h index 54f1048c65cc..8ddcb15802a5 100644 --- a/src/ray/common/cgroup2/cgroup_driver_interface.h +++ b/src/ray/common/cgroup2/cgroup_driver_interface.h @@ -96,7 +96,7 @@ class CgroupDriverInterface { virtual Status MoveProcesses(const std::string &from, const std::string &to) = 0; /** - Enables an available controller in a cgroup. A controller can be enabled if the + Enables an available controller on a cgroup. A controller can be enabled if the 1) controller is enabled in the parent of the cgroup. 2) cgroup has no children i.e. it's a leaf node. From 3a5a0203ee8db83000e100d435ef4b12a3e217e0 Mon Sep 17 00:00:00 2001 From: irabbani Date: Fri, 25 Jul 2025 18:21:08 +0000 Subject: [PATCH 06/20] Comments Signed-off-by: irabbani --- .../common/cgroup2/cgroup_driver_interface.h | 2 +- src/ray/common/cgroup2/sysfs_cgroup_driver.cc | 5 ++-- src/ray/common/cgroup2/sysfs_cgroup_driver.h | 2 +- .../sysfs_cgroup_driver_integration_test.cc | 24 +++++++++---------- 4 files changed, 17 insertions(+), 16 deletions(-) diff --git a/src/ray/common/cgroup2/cgroup_driver_interface.h b/src/ray/common/cgroup2/cgroup_driver_interface.h index 54f1048c65cc..d20cc11d130b 100644 --- a/src/ray/common/cgroup2/cgroup_driver_interface.h +++ b/src/ray/common/cgroup2/cgroup_driver_interface.h @@ -93,7 +93,7 @@ class CgroupDriverInterface { PermissionDenied if current user doesn't have read, write, and execute permissions. Invalid if any errors occur while reading from and writing to cgroups. */ - virtual Status MoveProcesses(const std::string &from, const std::string &to) = 0; + virtual Status MoveAllProcesses(const std::string &from, const std::string &to) = 0; /** Enables an available controller in a cgroup. A controller can be enabled if the diff --git a/src/ray/common/cgroup2/sysfs_cgroup_driver.cc b/src/ray/common/cgroup2/sysfs_cgroup_driver.cc index 7bafa95c763b..74ab92676331 100644 --- a/src/ray/common/cgroup2/sysfs_cgroup_driver.cc +++ b/src/ray/common/cgroup2/sysfs_cgroup_driver.cc @@ -182,7 +182,8 @@ StatusOr> SysFsCgroupDriver::GetEnabledControlle return ReadControllerFile(controller_file_path); } -Status SysFsCgroupDriver::MoveProcesses(const std::string &from, const std::string &to) { +Status SysFsCgroupDriver::MoveAllProcesses(const std::string &from, + const std::string &to) { RAY_RETURN_NOT_OK(CheckCgroup(from)); RAY_RETURN_NOT_OK(CheckCgroup(to)); std::filesystem::path from_procs_file_path = @@ -329,7 +330,7 @@ Status SysFsCgroupDriver::AddConstraint(const std::string &cgroup, // Check if the constraint value is out of range and therefore invalid. auto [low, high] = constraint_it->second.range; - int value = std::stoi(constraint_value); + size_t value = static_cast(std::stoi(constraint_value)); if (value < low || value > high) { return Status::InvalidArgument(absl::StrFormat( "Failed to apply constraint %s=%s to cgroup %s. %s can only have values " diff --git a/src/ray/common/cgroup2/sysfs_cgroup_driver.h b/src/ray/common/cgroup2/sysfs_cgroup_driver.h index c68c0710e935..353220b7469e 100644 --- a/src/ray/common/cgroup2/sysfs_cgroup_driver.h +++ b/src/ray/common/cgroup2/sysfs_cgroup_driver.h @@ -159,7 +159,7 @@ class SysFsCgroupDriver : public CgroupDriverInterface { @return InvalidArgument if process files could not be opened, read from, or written to correctly, ok otherwise. */ - Status MoveProcesses(const std::string &from, const std::string &to) override; + Status MoveAllProcesses(const std::string &from, const std::string &to) override; /** The cgroup.subtree_control file has the list of all currently enabled diff --git a/src/ray/common/cgroup2/test/sysfs_cgroup_driver_integration_test.cc b/src/ray/common/cgroup2/test/sysfs_cgroup_driver_integration_test.cc index 57a7fe5300a4..4874a72bcdd5 100644 --- a/src/ray/common/cgroup2/test/sysfs_cgroup_driver_integration_test.cc +++ b/src/ray/common/cgroup2/test/sysfs_cgroup_driver_integration_test.cc @@ -425,7 +425,7 @@ TEST_F(SysFsCgroupDriverIntegrationTest, EXPECT_EQ(controllers.size(), 0); } -TEST_F(SysFsCgroupDriverIntegrationTest, MoveProcessesFailsIfSourceDoesntExist) { +TEST_F(SysFsCgroupDriverIntegrationTest, MoveAllProcessesFailsIfSourceDoesntExist) { auto ancestor_cgroup_or_status = TempCgroupDirectory::Create(testing_cgroup_->GetPath(), S_IRWXU); ASSERT_TRUE(ancestor_cgroup_or_status.ok()) << ancestor_cgroup_or_status.ToString(); @@ -438,11 +438,11 @@ TEST_F(SysFsCgroupDriverIntegrationTest, MoveProcessesFailsIfSourceDoesntExist) std::string non_existent_path = ancestor_cgroup->GetPath() + std::filesystem::path::preferred_separator + "nope"; SysFsCgroupDriver driver; - Status s = driver.MoveProcesses(non_existent_path, dest_cgroup->GetPath()); + Status s = driver.MoveAllProcesses(non_existent_path, dest_cgroup->GetPath()); EXPECT_TRUE(s.IsNotFound()) << s.ToString(); } -TEST_F(SysFsCgroupDriverIntegrationTest, MoveProcessesFailsIfDestDoesntExist) { +TEST_F(SysFsCgroupDriverIntegrationTest, MoveAllProcessesFailsIfDestDoesntExist) { auto ancestor_cgroup_or_status = TempCgroupDirectory::Create(testing_cgroup_->GetPath(), S_IRWXU); ASSERT_TRUE(ancestor_cgroup_or_status.ok()) << ancestor_cgroup_or_status.ToString(); @@ -455,12 +455,12 @@ TEST_F(SysFsCgroupDriverIntegrationTest, MoveProcessesFailsIfDestDoesntExist) { std::string non_existent_path = ancestor_cgroup->GetPath() + std::filesystem::path::preferred_separator + "nope"; SysFsCgroupDriver driver; - Status s = driver.MoveProcesses(source_cgroup->GetPath(), non_existent_path); + Status s = driver.MoveAllProcesses(source_cgroup->GetPath(), non_existent_path); EXPECT_TRUE(s.IsNotFound()) << s.ToString(); } TEST_F(SysFsCgroupDriverIntegrationTest, - MoveProcessesFailsIfNotReadWriteExecPermissionsForSource) { + MoveAllProcessesFailsIfNotReadWriteExecPermissionsForSource) { auto ancestor_cgroup_or_status = TempCgroupDirectory::Create(testing_cgroup_->GetPath(), S_IRWXU); ASSERT_TRUE(ancestor_cgroup_or_status.ok()) << ancestor_cgroup_or_status.ToString(); @@ -474,12 +474,12 @@ TEST_F(SysFsCgroupDriverIntegrationTest, ASSERT_TRUE(dest_cgroup_or_status.ok()) << dest_cgroup_or_status.ToString(); auto dest_cgroup = std::move(dest_cgroup_or_status.value()); SysFsCgroupDriver driver; - Status s = driver.MoveProcesses(source_cgroup->GetPath(), dest_cgroup->GetPath()); + Status s = driver.MoveAllProcesses(source_cgroup->GetPath(), dest_cgroup->GetPath()); EXPECT_TRUE(s.IsPermissionDenied()) << s.ToString(); } TEST_F(SysFsCgroupDriverIntegrationTest, - MoveProcessesFailsIfNotReadWriteExecPermissionsForDest) { + MoveAllProcessesFailsIfNotReadWriteExecPermissionsForDest) { auto ancestor_cgroup_or_status = TempCgroupDirectory::Create(testing_cgroup_->GetPath(), S_IRWXU); ASSERT_TRUE(ancestor_cgroup_or_status.ok()) << ancestor_cgroup_or_status.ToString(); @@ -493,12 +493,12 @@ TEST_F(SysFsCgroupDriverIntegrationTest, ASSERT_TRUE(dest_cgroup_or_status.ok()) << dest_cgroup_or_status.ToString(); auto dest_cgroup = std::move(dest_cgroup_or_status.value()); SysFsCgroupDriver driver; - Status s = driver.MoveProcesses(source_cgroup->GetPath(), dest_cgroup->GetPath()); + Status s = driver.MoveAllProcesses(source_cgroup->GetPath(), dest_cgroup->GetPath()); EXPECT_TRUE(s.IsPermissionDenied()) << s.ToString(); } TEST_F(SysFsCgroupDriverIntegrationTest, - MoveProcessesFailsIfNotReadWriteExecPermissionsForAncestor) { + MoveAllProcessesFailsIfNotReadWriteExecPermissionsForAncestor) { auto ancestor_cgroup_or_status = TempCgroupDirectory::Create(testing_cgroup_->GetPath(), S_IRWXU); ASSERT_TRUE(ancestor_cgroup_or_status.ok()) << ancestor_cgroup_or_status.ToString(); @@ -515,7 +515,7 @@ TEST_F(SysFsCgroupDriverIntegrationTest, << "Failed to chmod cgroup directory " << ancestor_cgroup->GetPath() << "\n Error: " << strerror(errno); SysFsCgroupDriver driver; - Status s = driver.MoveProcesses(source_cgroup->GetPath(), dest_cgroup->GetPath()); + Status s = driver.MoveAllProcesses(source_cgroup->GetPath(), dest_cgroup->GetPath()); EXPECT_TRUE(s.IsPermissionDenied()) << s.ToString(); // Change the permissions back read, write, and execute so cgroup can be deleted. ASSERT_EQ(chmod(ancestor_cgroup->GetPath().c_str(), S_IRWXU), 0) @@ -524,7 +524,7 @@ TEST_F(SysFsCgroupDriverIntegrationTest, } TEST_F(SysFsCgroupDriverIntegrationTest, - MoveProcessesSucceedsWithCorrectPermissionsAndValidCgroups) { + MoveAllProcessesSucceedsWithCorrectPermissionsAndValidCgroups) { auto source_cgroup_or_status = TempCgroupDirectory::Create(testing_cgroup_->GetPath(), S_IRWXU); ASSERT_TRUE(source_cgroup_or_status.ok()) << source_cgroup_or_status.ToString(); @@ -538,7 +538,7 @@ TEST_F(SysFsCgroupDriverIntegrationTest, ASSERT_TRUE(child_process_s.ok()) << child_process_s.ToString(); auto [child_pid, child_pidfd] = child_process_s.value(); SysFsCgroupDriver driver; - Status s = driver.MoveProcesses(source_cgroup->GetPath(), dest_cgroup->GetPath()); + Status s = driver.MoveAllProcesses(source_cgroup->GetPath(), dest_cgroup->GetPath()); ASSERT_TRUE(s.ok()) << s.ToString(); // Assert that the child's pid is actually in the new file. std::string dest_cgroup_procs_file_path = dest_cgroup->GetPath() + From f52354bc109ad04778809878d1deab893f6ad641 Mon Sep 17 00:00:00 2001 From: irabbani Date: Tue, 29 Jul 2025 16:32:57 +0000 Subject: [PATCH 07/20] Putting the cgroupv2 tests into a separate target Signed-off-by: irabbani --- .buildkite/core.rayci.yml | 10 ++++++++++ src/ray/common/cgroup2/test/BUILD | 2 +- 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/.buildkite/core.rayci.yml b/.buildkite/core.rayci.yml index c820567a46ec..ece626c6f328 100644 --- a/.buildkite/core.rayci.yml +++ b/.buildkite/core.rayci.yml @@ -305,6 +305,16 @@ steps: - "3.13" # cpp tests + - label: ":ray: core: cgroupv2 tests" + tags: core_cpp + instance_type: medium + commands: + - bazel run //ci/ray_ci:test_in_docker -- //:all //src/... core --only-tags=cgroupv2 + --cache-test-results + --build-name oss-ci-base_test + --build-type cgroup + --privileged + - label: ":ray: core: cgroup tests" tags: core_cpp instance_type: medium diff --git a/src/ray/common/cgroup2/test/BUILD b/src/ray/common/cgroup2/test/BUILD index 276816811613..a5ce4a09c855 100644 --- a/src/ray/common/cgroup2/test/BUILD +++ b/src/ray/common/cgroup2/test/BUILD @@ -15,7 +15,7 @@ ray_cc_test( name = "sysfs_cgroup_driver_integration_test", srcs = ["sysfs_cgroup_driver_integration_test.cc"], tags = [ - "cgroup", + "cgroupv2", "exclusive", "no_windows", "team:core", From 148d04d18c3e0c50e222d7b895007b2fc15163ac Mon Sep 17 00:00:00 2001 From: irabbani Date: Tue, 29 Jul 2025 16:52:40 +0000 Subject: [PATCH 08/20] removing test sleep Signed-off-by: irabbani --- .../common/cgroup2/test/sysfs_cgroup_driver_integration_test.cc | 1 - 1 file changed, 1 deletion(-) diff --git a/src/ray/common/cgroup2/test/sysfs_cgroup_driver_integration_test.cc b/src/ray/common/cgroup2/test/sysfs_cgroup_driver_integration_test.cc index 4874a72bcdd5..cff87662e4c0 100644 --- a/src/ray/common/cgroup2/test/sysfs_cgroup_driver_integration_test.cc +++ b/src/ray/common/cgroup2/test/sysfs_cgroup_driver_integration_test.cc @@ -105,7 +105,6 @@ class SysFsCgroupDriverIntegrationTest : public ::testing::Test { cleanup will happen inside the TearDownTestSuite method. */ static void SetUpTestSuite() { - pause(); // 1) Create the testing_cgroup_ under base_cgroup. This is the root node of the // cgroup subtree used by the tests in this file. auto testing_cgroup_or_status = From d3f8b7918cb6f1c94aecd1f5d9087e011f7aa997 Mon Sep 17 00:00:00 2001 From: irabbani Date: Tue, 29 Jul 2025 17:42:22 +0000 Subject: [PATCH 09/20] Removing a docstring Signed-off-by: irabbani --- src/ray/common/cgroup2/cgroup_driver_interface.h | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/src/ray/common/cgroup2/cgroup_driver_interface.h b/src/ray/common/cgroup2/cgroup_driver_interface.h index 94f4db10948e..c8fdd45e7b01 100644 --- a/src/ray/common/cgroup2/cgroup_driver_interface.h +++ b/src/ray/common/cgroup2/cgroup_driver_interface.h @@ -23,14 +23,13 @@ namespace ray { /** - A utility interface that allows the caller to perform cgroup operations. - TODO(irabbani): I need to figure out the following - 1. What is the appropriate level of documentation here? This header file is what's - important by consumers of the API and it needs to provide the correct docstrings - and API. - 2. Revisiting this API to make it sure it integrates with a possible systemd - driver implementation (meaning it cannot have any directory based info or any dbus - implementation). + A utility interface that allows the caller to check if cgroupv2 is mounted correctly + and perform cgroup operations on the system. It currently only supports using + the memory and cpu controllers. It only supports the memory.min and cpu.weight + constraints. + + @see The cgroupv2 documentation for more details: + https://docs.kernel.org/admin-guide/cgroup-v2.html */ class CgroupDriverInterface { public: From d76ff150e5cd98752d1c579c30ecaa3937f47d8f Mon Sep 17 00:00:00 2001 From: irabbani Date: Tue, 29 Jul 2025 18:34:17 +0000 Subject: [PATCH 10/20] enabling CI tests Signed-off-by: irabbani --- src/ray/common/cgroup2/BUILD | 3 ++- src/ray/common/cgroup2/sysfs_cgroup_driver.cc | 23 ++++++------------- .../sysfs_cgroup_driver_integration_test.cc | 2 +- 3 files changed, 10 insertions(+), 18 deletions(-) diff --git a/src/ray/common/cgroup2/BUILD b/src/ray/common/cgroup2/BUILD index 758d5579ace2..c1ea2a9dbef8 100644 --- a/src/ray/common/cgroup2/BUILD +++ b/src/ray/common/cgroup2/BUILD @@ -22,7 +22,8 @@ ray_cc_library( "//src/ray/common:status", "//src/ray/common:status_or", "//src/ray/util:logging", - "//src/ray/util:process", + "@com_google_absl//absl/strings:str_cat", "@com_google_absl//absl/strings:str_format", + "@com_google_absl//absl/strings:str_join", ], ) diff --git a/src/ray/common/cgroup2/sysfs_cgroup_driver.cc b/src/ray/common/cgroup2/sysfs_cgroup_driver.cc index 74ab92676331..cd76addfa90a 100644 --- a/src/ray/common/cgroup2/sysfs_cgroup_driver.cc +++ b/src/ray/common/cgroup2/sysfs_cgroup_driver.cc @@ -34,6 +34,7 @@ #include #include "absl/strings/str_format.h" +#include "absl/strings/str_join.h" #include "ray/common/status.h" #include "ray/common/status_or.h" @@ -67,11 +68,11 @@ Status SysFsCgroupDriver::CheckCgroupv2Enabled() { mount_file_path_)); } - // TODO(irabbani): provide instructions for how to mount correctly and link to the - // correct docs. if (found_cgroupv1 && found_cgroupv2) { return Status::Invalid("Cgroupv1 and cgroupv2 are both mounted. Unmount cgroupv1."); } else if (found_cgroupv1 && !found_cgroupv2) { + // TODO(irabbani): provide a link to the ray documentation once it's been written + // for how to troubleshoot these. return Status::Invalid( "Cgroupv1 is mounted and cgroupv2 is not mounted. " "Unmount cgroupv1 and mount cgroupv2."); @@ -227,13 +228,8 @@ Status SysFsCgroupDriver::EnableController(const std::string &cgroup_path, auto available_controllers = available_controllers_s.value(); if (available_controllers.find(controller) == available_controllers.end()) { - std::string available_controllers_str("["); - std::for_each(available_controllers.begin(), - available_controllers.end(), - [&available_controllers_str](const std::string &ctrl) { - available_controllers_str.append(ctrl); - }); - available_controllers_str.append("]"); + std::string enabled_controllers_str = + absl::StrCat("[", absl::StrJoin(enabled_controllers, ", "), "]"); return Status::InvalidArgument(absl::StrFormat( "Controller %s is not available for cgroup at path %s.\n" "Current available controllers are %s. " @@ -276,13 +272,8 @@ Status SysFsCgroupDriver::DisableController(const std::string &cgroup_path, auto enabled_controllers = enabled_controllers_s.value(); if (enabled_controllers.find(controller) == enabled_controllers.end()) { - std::string enabled_controllers_str("["); - std::for_each(enabled_controllers.begin(), - enabled_controllers.end(), - [&enabled_controllers_str](const std::string &ctrl) { - enabled_controllers_str.append(ctrl); - }); - enabled_controllers_str.append("]"); + std::string available_controllers_str = + absl::StrCat("[", absl::StrJoin(available_controllers, ", "), "]"); return Status::InvalidArgument( absl::StrFormat("Controller %s is not enabled for cgroup at path %s.\n" "Current enabled controllers are %s. ", diff --git a/src/ray/common/cgroup2/test/sysfs_cgroup_driver_integration_test.cc b/src/ray/common/cgroup2/test/sysfs_cgroup_driver_integration_test.cc index cff87662e4c0..a2b5834ebd8a 100644 --- a/src/ray/common/cgroup2/test/sysfs_cgroup_driver_integration_test.cc +++ b/src/ray/common/cgroup2/test/sysfs_cgroup_driver_integration_test.cc @@ -56,7 +56,7 @@ echo "+memory" > /sys/fs/cgroup/testing/cgroup.subtree_control */ // Uncomment the following line to run the tests locally. -#define RUN_LOCALLY +//#define RUN_LOCALLY // This the root of the cgroup subtree that has been delegated to the testing user. std::unique_ptr testing_cgroup_; From 2798ea5da16e5c8db844703c00e214860b20332c Mon Sep 17 00:00:00 2001 From: irabbani Date: Tue, 29 Jul 2025 22:45:49 +0000 Subject: [PATCH 11/20] fixing absl imports Signed-off-by: irabbani --- src/ray/common/cgroup2/BUILD | 6 +++--- src/ray/common/cgroup2/sysfs_cgroup_driver.cc | 8 ++++---- .../cgroup2/test/sysfs_cgroup_driver_integration_test.cc | 4 ++-- 3 files changed, 9 insertions(+), 9 deletions(-) diff --git a/src/ray/common/cgroup2/BUILD b/src/ray/common/cgroup2/BUILD index c1ea2a9dbef8..8b61b1f1110a 100644 --- a/src/ray/common/cgroup2/BUILD +++ b/src/ray/common/cgroup2/BUILD @@ -22,8 +22,8 @@ ray_cc_library( "//src/ray/common:status", "//src/ray/common:status_or", "//src/ray/util:logging", - "@com_google_absl//absl/strings:str_cat", - "@com_google_absl//absl/strings:str_format", - "@com_google_absl//absl/strings:str_join", + "@com_google_absl//absl/strings", + # "@com_google_absl//absl/strings:str_format", + # "@com_google_absl//absl/strings:str_join", ], ) diff --git a/src/ray/common/cgroup2/sysfs_cgroup_driver.cc b/src/ray/common/cgroup2/sysfs_cgroup_driver.cc index cd76addfa90a..752589943222 100644 --- a/src/ray/common/cgroup2/sysfs_cgroup_driver.cc +++ b/src/ray/common/cgroup2/sysfs_cgroup_driver.cc @@ -229,7 +229,7 @@ Status SysFsCgroupDriver::EnableController(const std::string &cgroup_path, if (available_controllers.find(controller) == available_controllers.end()) { std::string enabled_controllers_str = - absl::StrCat("[", absl::StrJoin(enabled_controllers, ", "), "]"); + absl::StrCat("[", absl::StrJoin(available_controllers, ", "), "]"); return Status::InvalidArgument(absl::StrFormat( "Controller %s is not available for cgroup at path %s.\n" "Current available controllers are %s. " @@ -237,7 +237,7 @@ Status SysFsCgroupDriver::EnableController(const std::string &cgroup_path, "the root cgroup to X must have the controller enabled.", controller, cgroup_path, - available_controllers_str)); + enabled_controllers_str)); } std::filesystem::path enabled_ctrls_file = @@ -272,8 +272,8 @@ Status SysFsCgroupDriver::DisableController(const std::string &cgroup_path, auto enabled_controllers = enabled_controllers_s.value(); if (enabled_controllers.find(controller) == enabled_controllers.end()) { - std::string available_controllers_str = - absl::StrCat("[", absl::StrJoin(available_controllers, ", "), "]"); + std::string enabled_controllers_str = + absl::StrCat("[", absl::StrJoin(enabled_controllers, ", "), "]"); return Status::InvalidArgument( absl::StrFormat("Controller %s is not enabled for cgroup at path %s.\n" "Current enabled controllers are %s. ", diff --git a/src/ray/common/cgroup2/test/sysfs_cgroup_driver_integration_test.cc b/src/ray/common/cgroup2/test/sysfs_cgroup_driver_integration_test.cc index a2b5834ebd8a..220776f03dbc 100644 --- a/src/ray/common/cgroup2/test/sysfs_cgroup_driver_integration_test.cc +++ b/src/ray/common/cgroup2/test/sysfs_cgroup_driver_integration_test.cc @@ -56,7 +56,7 @@ echo "+memory" > /sys/fs/cgroup/testing/cgroup.subtree_control */ // Uncomment the following line to run the tests locally. -//#define RUN_LOCALLY +#define RUN_LOCALLY // This the root of the cgroup subtree that has been delegated to the testing user. std::unique_ptr testing_cgroup_; @@ -80,7 +80,7 @@ static inline std::string base_cgroup_path_ = "/sys/fs/cgroup/testing"; static inline std::string processes_cgroup_path_ = "/sys/fs/cgroup/testing/active"; #else static inline std::string base_cgroup_path_ = "/sys/fs/cgroup"; -static inline std::string processes_cgroup_path_ = "/sys/fs/cgroup"; +static inline std::string processescgroup_path_ = "/sys/fs/cgroup"; #endif /** From 3fda50573c30f69609a0aa03cffeeeb884ea05c0 Mon Sep 17 00:00:00 2001 From: irabbani Date: Tue, 29 Jul 2025 22:46:07 +0000 Subject: [PATCH 12/20] commenting local Signed-off-by: irabbani --- .../common/cgroup2/test/sysfs_cgroup_driver_integration_test.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/ray/common/cgroup2/test/sysfs_cgroup_driver_integration_test.cc b/src/ray/common/cgroup2/test/sysfs_cgroup_driver_integration_test.cc index 220776f03dbc..55c03338a13d 100644 --- a/src/ray/common/cgroup2/test/sysfs_cgroup_driver_integration_test.cc +++ b/src/ray/common/cgroup2/test/sysfs_cgroup_driver_integration_test.cc @@ -56,7 +56,7 @@ echo "+memory" > /sys/fs/cgroup/testing/cgroup.subtree_control */ // Uncomment the following line to run the tests locally. -#define RUN_LOCALLY +//#define RUN_LOCALLY // This the root of the cgroup subtree that has been delegated to the testing user. std::unique_ptr testing_cgroup_; From 9e1e93123886d21c759b7d64dea2aa51b65e565b Mon Sep 17 00:00:00 2001 From: irabbani Date: Tue, 29 Jul 2025 23:52:37 +0000 Subject: [PATCH 13/20] doxygen formatting Signed-off-by: irabbani --- .../common/cgroup2/cgroup_driver_interface.h | 94 ++++++++++--------- .../sysfs_cgroup_driver_integration_test.cc | 2 +- 2 files changed, 52 insertions(+), 44 deletions(-) diff --git a/src/ray/common/cgroup2/cgroup_driver_interface.h b/src/ray/common/cgroup2/cgroup_driver_interface.h index c8fdd45e7b01..0a576bbcf039 100644 --- a/src/ray/common/cgroup2/cgroup_driver_interface.h +++ b/src/ray/common/cgroup2/cgroup_driver_interface.h @@ -23,10 +23,9 @@ namespace ray { /** - A utility interface that allows the caller to check if cgroupv2 is mounted correctly - and perform cgroup operations on the system. It currently only supports using - the memory and cpu controllers. It only supports the memory.min and cpu.weight - constraints. + A utility that allows the caller to check if cgroupv2 is mounted correctly + and perform cgroup operations on the system. It supports the memory and cpu controllers + with the memory.min and cpu.weight constraints respectively. @see The cgroupv2 documentation for more details: https://docs.kernel.org/admin-guide/cgroup-v2.html @@ -46,7 +45,8 @@ class CgroupDriverInterface { @see K8S documentation on how to enable cgroupv2 and check if it's enabled correctly: https://kubernetes.io/docs/concepts/architecture/cgroups/#linux-distribution-cgroup-v2-support - @return OK if successful, otherwise Invalid. + @return Status::OK if successful, + @return Status::Invalid if cgroupv2 is not enabled correctly. */ virtual Status CheckCgroupv2Enabled() = 0; @@ -56,10 +56,11 @@ class CgroupDriverInterface { @param cgroup the absolute path of the cgroup. - @return OK if no errors are encounted. Otherwise, one of the following errors - NotFound if the cgroup does not exist. - PermissionDenied if current user doesn't have read, write, and execute permissions. - InvalidArgument if the cgroup is not using cgroupv2. + @return Status::OK if no errors are encounted. Otherwise, one of the following errors + @return Status::NotFound if the cgroup does not exist. + @return Status::PermissionDenied if current user doesn't have read, write, and execute + permissions. + @return Status::InvalidArgument if the cgroup is not using cgroupv2. */ virtual Status CheckCgroup(const std::string &cgroup) = 0; @@ -70,10 +71,11 @@ class CgroupDriverInterface { @param cgroup is an absolute path to the cgroup - @return OK if no errors are encounted. Otherwise, one of the following errors - NotFound if an ancestor cgroup does not exist. - PermissionDenied if current user doesn't have read, write, and execute permissions. - AlreadyExists if the cgroup already exists. + @return Status::OK if no errors are encounted. Otherwise, one of the following errors + @return Status::NotFound if an ancestor cgroup does not exist. + @return Status::PermissionDenied if current user doesn't have read, write, and execute + permissions. + @return Status::AlreadyExists if the cgroup already exists. */ virtual Status CreateCgroup(const std::string &cgroup) = 0; @@ -87,10 +89,12 @@ class CgroupDriverInterface { @param from the absolute path of the cgroup to migrate processes out of. @param to the absolute path of the cgroup to migrate processes into. - @return OK if no errors are encounted. Otherwise, one of the following errors - NotFound if to or from don't exist. - PermissionDenied if current user doesn't have read, write, and execute permissions. - Invalid if any errors occur while reading from and writing to cgroups. + @return Status::OK if no errors are encounted. Otherwise, one of the following errors + @return Status::NotFound if to or from don't exist. + @return Status::PermissionDenied if current user doesn't have read, write, and execute + permissions. + @return Status::Invalid if any errors occur while reading from and writing to + cgroups. */ virtual Status MoveAllProcesses(const std::string &from, const std::string &to) = 0; @@ -105,12 +109,13 @@ class CgroupDriverInterface { @see No Internal Process Constraint for more details: https://docs.kernel.org/admin-guide/cgroup-v2.html#no-internal-process-constraint - @return OK if successful, otherwise one of the following - NotFound if the cgroup does not exist. - PermissionDenied if current user doesn't have read, write, and execute permissions for - the cgroup. - InvalidArgument if the controller is not available or if cgroup is not a cgroupv2. - Invalid for all other failures. + @return Status::OK if successful, otherwise one of the following + @return Status::NotFound if the cgroup does not exist. + @return Status::PermissionDenied if current user doesn't have read, write, and execute + permissions for the cgroup. + @return Status::InvalidArgument if the controller is not available or if cgroup is not + a cgroupv2. + @return Status::Invalid for all other failures. */ virtual Status EnableController(const std::string &cgroup, const std::string &controller) = 0; @@ -122,12 +127,12 @@ class CgroupDriverInterface { @param cgroup is an absolute path to the cgroup. @param controller is the name of the controller (e.g. "cpu" and not "-cpu") - @return OK if successful, otherwise one of the following - NotFound if the cgroup does not exist. - PermissionDenied if current user doesn't have read, write, and execute permissions for - the cgroup. - InvalidArgument if the controller is not enabled or if cgroup is not a cgroupv2. - Invalid for all other failures. + @return Status::OK if successful, otherwise one of the following + @return Status::NotFound if the cgroup does not exist. + @return Status::PermissionDenied if current user doesn't have read, write, and execute + permissions for the cgroup. + @return Status::InvalidArgument if the controller is not enabled + or if cgroup is not a cgroupv2. Status::Invalid for all other failures. */ virtual Status DisableController(const std::string &cgroup, const std::string &controller) = 0; @@ -142,12 +147,12 @@ class CgroupDriverInterface { @param constraint the name of the constraint. @param value the value of the constraint. - @return OK if successful, otherwise one of the following - NotFound if the cgroup does not exist. - PermissionDenied if current user doesn't have read, write, and execute permissions for - the cgroup. - InvalidArgument if the cgroup is not valid or constraint is not supported or the value - not correct. + @return Status::OK if successful, otherwise one of the following + @return Status::NotFound if the cgroup does not exist. + @return Status::PermissionDenied if current user doesn't have read, write, and execute + permissions for the cgroup. + @return Status::InvalidArgument if the cgroup is not valid or constraint is not + supported or the value not correct. */ virtual Status AddConstraint(const std::string &cgroup, const std::string &constraint, @@ -158,11 +163,13 @@ class CgroupDriverInterface { @param cgroup absolute path of the cgroup. - @returns OK with a set of controllers if successful, otherwise one of following - NotFound if the cgroup does not exist. - PermissionDenied if current user doesn't have read, write, and execute + @return Status::OK with a set of controllers if successful, otherwise one of + following + @return Status::NotFound if the cgroup does not exist. + @return Status::PermissionDenied if current user doesn't have read, write, and execute permissions. - InvalidArgument if the cgroup is not using cgroupv2 or malformed controllers file. + @return Status::InvalidArgument if the cgroup is not using cgroupv2 or malformed + controllers file. */ virtual StatusOr> GetAvailableControllers( const std::string &cgroup) = 0; @@ -172,11 +179,12 @@ class CgroupDriverInterface { @param cgroup absolute path of the cgroup. - @returns OK with a set of controllers if successful, otherwise one of following - NotFound if the cgroup does not exist. - PermissionDenied if current user doesn't have read, write, and execute + @return Status::OK with a set of controllers if successful, otherwise one of following + @return Status::NotFound if the cgroup does not exist. + @return Status::PermissionDenied if current user doesn't have read, write, and execute permissions. - InvalidArgument if the cgroup is not using cgroupv2 or malformed controllers file. + @return Status::InvalidArgument if the cgroup is not using cgroupv2 or malformed + controllers file. */ virtual StatusOr> GetEnabledControllers( const std::string &cgroup) = 0; diff --git a/src/ray/common/cgroup2/test/sysfs_cgroup_driver_integration_test.cc b/src/ray/common/cgroup2/test/sysfs_cgroup_driver_integration_test.cc index 55c03338a13d..a2b5834ebd8a 100644 --- a/src/ray/common/cgroup2/test/sysfs_cgroup_driver_integration_test.cc +++ b/src/ray/common/cgroup2/test/sysfs_cgroup_driver_integration_test.cc @@ -80,7 +80,7 @@ static inline std::string base_cgroup_path_ = "/sys/fs/cgroup/testing"; static inline std::string processes_cgroup_path_ = "/sys/fs/cgroup/testing/active"; #else static inline std::string base_cgroup_path_ = "/sys/fs/cgroup"; -static inline std::string processescgroup_path_ = "/sys/fs/cgroup"; +static inline std::string processes_cgroup_path_ = "/sys/fs/cgroup"; #endif /** From e6b4926afe8c8ab553006102b0dee3fde0b78650 Mon Sep 17 00:00:00 2001 From: irabbani Date: Wed, 30 Jul 2025 15:13:16 +0000 Subject: [PATCH 14/20] removing integration tests Signed-off-by: irabbani --- src/ray/common/cgroup2/test/BUILD | 20 - .../common/cgroup2/test/cgroup_test_utils.cc | 206 ----- .../common/cgroup2/test/cgroup_test_utils.h | 67 -- .../sysfs_cgroup_driver_integration_test.cc | 787 ------------------ 4 files changed, 1080 deletions(-) delete mode 100644 src/ray/common/cgroup2/test/sysfs_cgroup_driver_integration_test.cc diff --git a/src/ray/common/cgroup2/test/BUILD b/src/ray/common/cgroup2/test/BUILD index a5ce4a09c855..7157feb4a7dd 100644 --- a/src/ray/common/cgroup2/test/BUILD +++ b/src/ray/common/cgroup2/test/BUILD @@ -11,26 +11,6 @@ ray_cc_library( ], ) -ray_cc_test( - name = "sysfs_cgroup_driver_integration_test", - srcs = ["sysfs_cgroup_driver_integration_test.cc"], - tags = [ - "cgroupv2", - "exclusive", - "no_windows", - "team:core", - ], - deps = [ - ":cgroup_test_utils", - "//src/ray/common:status", - "//src/ray/common:status_or", - "//src/ray/common/cgroup2:sysfs_cgroup_driver", - "//src/ray/common/test:testing", - "@com_google_absl//absl/strings:str_format", - "@com_google_googletest//:gtest_main", - ], -) - ray_cc_test( name = "sysfs_cgroup_driver_test", srcs = ["sysfs_cgroup_driver_test.cc"], diff --git a/src/ray/common/cgroup2/test/cgroup_test_utils.cc b/src/ray/common/cgroup2/test/cgroup_test_utils.cc index 05c5dae76d91..e61ad82e633c 100644 --- a/src/ray/common/cgroup2/test/cgroup_test_utils.cc +++ b/src/ray/common/cgroup2/test/cgroup_test_utils.cc @@ -42,52 +42,6 @@ #include "ray/common/status_or.h" #include "ray/util/logging.h" -std::string GenerateRandomFilename(size_t len) { - std::string output; - std::random_device rd; - std::mt19937 mt(rd()); - std::uniform_int_distribution dist(0, 61); - output.reserve(len); - for (size_t i = 0; i < len; ++i) { - int randomChar = dist(mt); - if (randomChar < 26) { - output.push_back('a' + randomChar); - } else if (randomChar < 26 + 26) { - output.push_back('A' + randomChar - 26); - } else { - output.push_back('0' + randomChar - 52); - } - } - return output; -} - -ray::StatusOr> TempCgroupDirectory::Create( - const std::string &base_path, mode_t mode) { - std::string name = GenerateRandomFilename(kCgroupNameLength); - std::string path = base_path + std::filesystem::path::preferred_separator + name; - if (mkdir(path.c_str(), mode) == -1) { - return ray::Status::IOError( - absl::StrFormat("Failed to create cgroup directory at path %s.\n" - "Cgroup tests expect tmpfs and cgroupv2 to be mounted " - "and only run on Linux.\n" - "Error: %s", - path, - strerror(errno))); - } - auto output = std::make_unique(std::move(name), std::move(path)); - return ray::StatusOr>(std::move(output)); -} - -TempCgroupDirectory::~TempCgroupDirectory() noexcept(false) { - if (rmdir(path_.c_str()) != 0) { - throw std::runtime_error( - absl::StrFormat("Failed to delete a cgroup directory at %s. Please manually " - "delete it with rmdir. \n%s", - path_, - strerror(errno))); - } -} - ray::StatusOr> TempDirectory::Create() { std::string path = "/tmp/XXXXXX"; char *ret = mkdtemp(path.data()); @@ -105,166 +59,6 @@ ray::StatusOr> TempDirectory::Create() { TempDirectory::~TempDirectory() { std::filesystem::remove_all(path_); } -ray::Status TerminateChildProcessAndWaitForTimeout(pid_t pid, int fd, int timeout_ms) { - if (kill(pid, SIGTERM) == -1) { - return ray::Status::Invalid( - absl::StrFormat("Failed to send SIGTERM to pid: %i.\n" - "Error: %s", - pid, - strerror(errno))); - } - struct pollfd poll_fd = { - .fd = fd, - .events = POLLIN, - }; - - int poll_status = poll(&poll_fd, 1, timeout_ms); - if (poll_status == -1) { - return ray::Status::Invalid(absl::StrFormat( - "Failed to poll process pid: %i, fd: %i. Process was not killed. Please " - "kill it manually to prevent a leak.\n" - "Error: %s", - pid, - fd, - strerror(errno))); - } - if (poll_status == 0) { - return ray::Status::Invalid(absl::StrFormat( - "Process pid: %i, fd:%i was not killed within the timeout of %ims.", - pid, - fd, - timeout_ms)); - } - siginfo_t dummy = {0}; - int wait_id_status = waitid(P_PID, static_cast(fd), &dummy, WEXITED); - if (wait_id_status == -1) { - if (errno != ECHILD) - return ray::Status::Invalid(absl::StrFormat( - "Failed to wait for process pid: %i, fd: %i. Process was not reaped, but " - "it will be reaped by init after program exits.\n" - "Error: %s", - pid, - fd, - strerror(errno))); - }; - return ray::Status::OK(); -} - -#ifdef FALSE - -ray::StatusOr> StartChildProcessInCgroup( - const std::string &cgroup_path) { - int cgroup_fd = open(cgroup_path.c_str(), O_RDONLY); - if (cgroup_fd == -1) { - return ray::Status::Invalid( - absl::StrFormat("Unable to open fd for cgroup at %s.\n" - "Error: %s", - cgroup_path, - strerror(errno))); - } - - // Will be set by clone3 if a child process is successfully created. - pid_t child_pidfd = -1; - - clone_args cl_args = {}; - cl_args.flags = CLONE_PIDFD | CLONE_INTO_CGROUP; - cl_args.cgroup = cgroup_fd; - - // Can be used both as a pid and as a fd. - cl_args.pidfd = ((__u64)((uintptr_t)(&child_pidfd))); - - int child_pid = -1; - - if ((child_pid = syscall(__NR_clone3, &cl_args, sizeof(struct clone_args))) == -1) { - RAY_LOG(ERROR) << "clone3 failed with error with error: " << strerror(errno); - close(cgroup_fd); - return ray::Status::Invalid( - absl::StrFormat("Unable to clone process.\n" - "Error: %s", - strerror(errno))); - } - - // Child process will execute this. - if (child_pid == 0) { - RAY_LOG(ERROR) << "Spawned child in cgroup " << cgroup_path << " with PID " - << getpid(); - pause(); - RAY_LOG(ERROR) << "Unpaused child in cgroup " << cgroup_path << " with PID " - << getpid(); - _exit(0); - } - - // Parent process will continue here. - close(cgroup_fd); - RAY_LOG(ERROR) << "Successfully cloned with parent_pid=" << getpid() - << ", child_pid=" << child_pid << ", child_pidfd=" << child_pidfd << "."; - return ray::StatusOr>({child_pid, static_cast(child_pidfd)}); -} - -#else -// Fallback for older kernels. Uses fork/exec instead. -ray::StatusOr> StartChildProcessInCgroup( - const std::string &cgroup_path) { - int new_pid = fork(); - - if (new_pid == -1) { - return ray::Status::Invalid( - absl::StrFormat("Failed to fork process with error %s", strerror(errno))); - } - - if (new_pid == 0) { - // Child process will pause and wait for parent to terminate and reap it. - RAY_LOG(ERROR) << "Spawned child in cgroup " << cgroup_path << " with PID " - << getpid(); - pause(); - RAY_LOG(ERROR) << "Unpaused child in cgroup " << cgroup_path << " with PID " - << getpid(); - _exit(0); - } - - std::string cgroup_proc_file_path = cgroup_path + "/cgroup.procs"; - - // Parent process has to move the process into a cgroup. - int cgroup_fd = open(cgroup_proc_file_path.c_str(), O_RDWR); - - if (cgroup_fd == -1) { - return ray::Status::Invalid( - absl::StrFormat("Failed to open cgroup procs file %s with error %s", - cgroup_proc_file_path, - strerror(errno))); - } - - std::string pid_to_write = std::to_string(new_pid); - - if (write(cgroup_fd, pid_to_write.c_str(), pid_to_write.size()) == -1) { - // Best effort killing of the child process. - kill(SIGKILL, new_pid); - close(cgroup_fd); - return ray::Status::Invalid( - absl::StrFormat("Failed to write pid %i to cgroup procs file %s with error %s", - new_pid, - cgroup_proc_file_path, - strerror(errno))); - } - - close(cgroup_fd); - - int child_pidfd = (int)syscall(SYS_pidfd_open, new_pid, 0); - if (child_pidfd == -1) { - // Best effort killing of the child process. - kill(SIGKILL, new_pid); - close(cgroup_fd); - return ray::Status::Invalid( - absl::StrFormat("Failed to create process fd for pid %i with error %s", - new_pid, - strerror(errno))); - } - - return ray::StatusOr>({new_pid, child_pidfd}); -} - -#endif - TempFile::TempFile(std::string path) { path_ = path; fd_ = open(path_.c_str(), O_RDWR | O_CREAT, S_IRUSR | S_IWUSR); // NOLINT diff --git a/src/ray/common/cgroup2/test/cgroup_test_utils.h b/src/ray/common/cgroup2/test/cgroup_test_utils.h index ef79dd183741..222ed385160b 100644 --- a/src/ray/common/cgroup2/test/cgroup_test_utils.h +++ b/src/ray/common/cgroup2/test/cgroup_test_utils.h @@ -22,39 +22,6 @@ #include "ray/common/status.h" #include "ray/common/status_or.h" -static constexpr size_t kCgroupNameLength = 6; - -// Generates an ASCII string of the given length. -std::string GenerateRandomFilename(size_t len); - -/** - Creates an RAII style cgroup directory with a random name at a given - path with the given permissions. - */ -class TempCgroupDirectory { - public: - static ray::StatusOr> Create( - const std::string &base_path, mode_t mode = 0777); - - TempCgroupDirectory() = default; - explicit TempCgroupDirectory(std::string &&name, std::string &&path) - : name_(name), path_(path) {} - - TempCgroupDirectory(const TempCgroupDirectory &) = delete; - TempCgroupDirectory(TempCgroupDirectory &&) = delete; - TempCgroupDirectory &operator=(const TempCgroupDirectory &) = delete; - TempCgroupDirectory &operator=(TempCgroupDirectory &&) = delete; - - const std::string &GetPath() const { return path_; } - const std::string &GetName() const { return name_; } - - ~TempCgroupDirectory() noexcept(false); - - private: - std::string name_; - std::string path_; -}; - /** RAII style class for creating and destroying temporary directory for testing. TODO(irabbani): add full documentation once complete. @@ -101,37 +68,3 @@ class TempFile { std::ofstream file_output_stream_; int fd_; }; - -//// Starts a process in the cgroup and returns the pid of the started process. -//// Returns -1 if there's an error. -//// Note: clone3 supports creating a process inside a cgroup instead of creating -//// and then moving. However, clone3 does not have a glibc wrapper and -//// must be called directly using syscall syscall (see man 2 syscall). -//// This function needs linux kernel >= 5.7 to use the CLONE_INTO_CGROUP flag. -//// It is the caller's responsibility to terminate the child process and -//// wait for the pid. Use TerminateChildProcessAndWaitForTimeout. -//// Returns a process_fd. See the CLONE_PIDFD section in "man 2 clone" -ray::StatusOr> StartChildProcessInCgroup( - const std::string &cgroup_path); - -/** - @param process_fd can be used as a fd and as a pid. It can be created using - clone or pidfd_open. - @param timeout_ms time used to poll for the process_fd to detect the process - has been killed. - @return ok if successfully terminated the process and reaped it. Invalid otherwise - with the correct error message. - */ -ray::Status TerminateChildProcessAndWaitForTimeout(pid_t pid, int fd, int timeout_ms); - -std::ostream &operator<<(std::ostream &os, const TempCgroupDirectory &temp_cgroup_dir) { - return os << temp_cgroup_dir.GetPath(); -} - -std::ostream &operator<<(std::ostream &os, - const std::unique_ptr &ptr) { - if (ptr == nullptr) { - return os << ""; - } - return os << *ptr; -} diff --git a/src/ray/common/cgroup2/test/sysfs_cgroup_driver_integration_test.cc b/src/ray/common/cgroup2/test/sysfs_cgroup_driver_integration_test.cc deleted file mode 100644 index a2b5834ebd8a..000000000000 --- a/src/ray/common/cgroup2/test/sysfs_cgroup_driver_integration_test.cc +++ /dev/null @@ -1,787 +0,0 @@ -// Copyright 2025 The Ray Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include -#include -#include - -#include -#include -#include -#include -#include - -#include "ray/common/cgroup2/sysfs_cgroup_driver.h" -#include "ray/common/cgroup2/test/cgroup_test_utils.h" -#include "ray/common/status.h" -#include "ray/common/status_or.h" - -/** - NOTE: Read these instructions before running these tests locally. - These tests are only supported on linux with cgroupv2 enabled in unified mode. - - See the following documentation for how to enable cgroupv2 properly: - https://kubernetes.io/docs/concepts/architecture/cgroups/#linux-distribution-cgroup-v2-support - - When running these tests, the process that starts the tests (usually your terminal) - needs to be inside a cgroup hierarchy that it has ownership of. Run the following - bash commands to make that happen: - - # Create the cgroup heirarchy (these need to match base_cgroup_path_ and - # processes_cgroup_path_ which are defined below) - sudo mkdir /sys/fs/cgroup/testing - sudo mkdir /sys/fs/cgroup/active - - # Give ownership and permissions to the current user. - USER=`whoami` - sudo chown -R USER:USER /sys/fs/cgroup/testing - sudo chmod -R u+rwx /sys/fs/cgroup/testing - - # Move the current process into a leaf cgroup within the hierarchy. - echo $$ | sudo tee -a /sys/fs/cgroup/testing/active/cgroup.procs - - # Enable cpu and memory controllers on the parent cgroup - echo "+cpu" > /sys/fs/cgroup/testing/cgroup.subtree_control - echo "+memory" > /sys/fs/cgroup/testing/cgroup.subtree_control -*/ -// Uncomment the following line to run the tests locally. -//#define RUN_LOCALLY - -// This the root of the cgroup subtree that has been delegated to the testing user. -std::unique_ptr testing_cgroup_; - -// This a sibling to the testing_cgroup_ that will store all processes that were in -// the base_cgroup. This will allow the testing_cgroup_ to have subcgroups with -// controllers enabled to satisfy the no internal process constraint. -// See the no internal process constraint section for more information: -// https://docs.kernel.org/admin-guide/cgroup-v2.html#no-internal-process-constraint -std::unique_ptr leaf_cgroup_; - -// base_cgroup_path_ is the root of the cgroup heirarchy that will be used for testing. -// The tests will expect that this cgroup has only the cpu and memory controllers -// enabled because they will be used for testing. -// processes_cgroup_path_ is used when running tests locally to start the test process -// in a leaf node inside the cgroup hierarchy. It's not needed in CI because the user -// inside the container has root access and can manage the entire hierarchy. This implies -// that the CI test process is always started within a cgroup hierarchy that it owns. -#ifdef RUN_LOCALLY -static inline std::string base_cgroup_path_ = "/sys/fs/cgroup/testing"; -static inline std::string processes_cgroup_path_ = "/sys/fs/cgroup/testing/active"; -#else -static inline std::string base_cgroup_path_ = "/sys/fs/cgroup"; -static inline std::string processes_cgroup_path_ = "/sys/fs/cgroup"; -#endif - -/** - This test suite will create the following cgroup hierarchy for the tests - base_cgroup - / \ - testing_cgroup leaf_cgroup - - It expects that root_cgroup has cpu, memory controllers enabled. - It enables the cpu, memory controllers on the testing_cgroup. - Tests should not access anything above the testing_cgroup. - The leaf_cgroup provides a leaf node clone processes into while not - violating the no internal processes constraint. - - Note: testing_cgroup and leaf_cgroup have randomly generating names - to isolate test runs from each other. - */ -class SysFsCgroupDriverIntegrationTest : public ::testing::Test { - protected: - /** - This can fail an assertion without running fully. In that case, best-effort - cleanup will happen inside the TearDownTestSuite method. - */ - static void SetUpTestSuite() { - // 1) Create the testing_cgroup_ under base_cgroup. This is the root node of the - // cgroup subtree used by the tests in this file. - auto testing_cgroup_or_status = - TempCgroupDirectory::Create(base_cgroup_path_, S_IRWXU); - ASSERT_TRUE(testing_cgroup_or_status.ok()) << testing_cgroup_or_status.ToString(); - testing_cgroup_ = std::move(testing_cgroup_or_status.value()); - - // 2) Create the leaf_cgroup_ under the base_cgroup. This will be used to store - // all the processes in the base_cgroup. - auto leaf_cgroup_or_status = TempCgroupDirectory::Create(base_cgroup_path_, S_IRWXU); - ASSERT_TRUE(leaf_cgroup_or_status.ok()) << leaf_cgroup_or_status.ToString(); - leaf_cgroup_ = std::move(leaf_cgroup_or_status.value()); - - // 3) Move all processes from the /cgroup.procs file into the - ///cgroup.procs file. This will allow the base_cgroup to have children. - std::string processes_cgroup_procs_file_path = - processes_cgroup_path_ + "/cgroup.procs"; - std::string leaf_cgroup_procs_file_path = leaf_cgroup_->GetPath() + "/cgroup.procs"; - FILE *processes_cgroup_procs = fopen(processes_cgroup_procs_file_path.c_str(), "r+"); - ASSERT_NE(processes_cgroup_procs, nullptr) - << "Failed to open processes cgroup's proc file at path " - << processes_cgroup_procs_file_path << " with error " << strerror(errno); - int leaf_cgroup_procs_fd = open(leaf_cgroup_procs_file_path.c_str(), O_RDWR); - ASSERT_NE(leaf_cgroup_procs_fd, -1) - << "Failed to open leaf cgroup's proc file at path " - << leaf_cgroup_procs_file_path << " with error " << strerror(errno); - - // According to man 5 proc, the maximum value of a pid on a 64-bit system is 2^22 - // i.e./4194304. fgets adds a null terminating character. - char read_buffer[32]; - - while (fgets(read_buffer, sizeof(read_buffer), processes_cgroup_procs)) { - ssize_t bytes_written = - write(leaf_cgroup_procs_fd, read_buffer, strlen(read_buffer)); - - // If the process exited between the call to fgets and write, write will - // return a NoSuchProcess error. - if (bytes_written == -1 && errno == ESRCH) { - continue; - } - ASSERT_EQ(bytes_written, strlen(read_buffer)) - << "Failed to write to leaf cgroups proc file at path " - << leaf_cgroup_->GetPath() << " with error " << strerror(errno); - } - ASSERT_EQ(ferror(processes_cgroup_procs), 0) - << "Failed to read pid from processes cgroups proc file at path " - << processes_cgroup_procs_file_path << " with error " << strerror(errno); - - fclose(processes_cgroup_procs); - close(leaf_cgroup_procs_fd); - - // 4) Enable cpu and memory controllers for the testing_cgroup to allow tests to - // use those controllers as necessary. - std::string test_cgroup_subtree_control_path = - testing_cgroup_->GetPath() + "/cgroup.subtree_control"; - std::unordered_set subtree_control_ops = {"+cpu", "+memory"}; - for (const auto &op : subtree_control_ops) { - int test_cgroup_subtree_control_fd = - open(test_cgroup_subtree_control_path.c_str(), O_RDWR); - ASSERT_NE(test_cgroup_subtree_control_fd, -1) - << "Failed to open test cgroup's subtree_control file at path " - << test_cgroup_subtree_control_path << " with error " << strerror(errno); - ssize_t num_written = write(test_cgroup_subtree_control_fd, op.c_str(), op.size()); - ASSERT_EQ(num_written, op.size()) - << "Failed to write to test cgroup's subtree_control file at path " - << test_cgroup_subtree_control_path << " with error " << strerror(errno); - close(test_cgroup_subtree_control_fd); - } - } - - /** - This will attempt to do best effort cleanup by reversing the effects of the - SetUpTestSuite method. Cleanup doesn't need to delete cgroup directories because of - TempCgroupDirectory's dtor will take care of it. Cleanup doesn't need to disable - controllers because a cgroup can be deleted even if it has controllers enabled. - - If cleanup fails, cgroup directories may not be cleaned up properly. However, despite - cleanup failure, tests can be rerun on the same machine/container without interference - from the previous failed attempt. - - There are a few types of leaks possible - 1) In CI, the tests run inside the container and all processes inside the container - will have been moved into the /leaf/cgroup.procs file. They may stay - there. This has no impact on subsequent runs because this cgroup should have no - constriants. - 2) In all environments, cgroups may be leaked and will have to be manually cleaned - up. Typically, a cgroup cannot be removed iff it has a running process in the - cgroup.procs file or child cgroups. - - NOTE: The teardown method is called even if assertions fail in the setup or in - tests in the suite. - */ - static void TearDownTestSuite() { - // Move all processes from the /cgroup.procs file into the - ///cgroup.procs file. This will allow the leaf_cgroup to be deleted. - std::string processes_cgroup_procs_file_path = - processes_cgroup_path_ + "/cgroup.procs"; - std::string leaf_cgroup_procs_file_path = leaf_cgroup_->GetPath() + "/cgroup.procs"; - FILE *leaf_cgroup_procs = fopen(leaf_cgroup_procs_file_path.c_str(), "r+"); - ASSERT_NE(leaf_cgroup_procs, nullptr) - << "Failed to open leaf cgroup's proc file at path " << leaf_cgroup_ - << " with error " << strerror(errno); - int processes_cgroup_procs_fd = - open(processes_cgroup_procs_file_path.c_str(), O_RDWR); - ASSERT_NE(processes_cgroup_procs_fd, -1) - << "Failed to open processes cgroup's proc file at path " - << processes_cgroup_path_ << " with error " << strerror(errno); - - // According to man 5 proc, the maximum value of a pid on a 64-bit system is 2^22 - // i.e./4194304. fgets adds a null terminating character. - char read_buffer[16]; - - while (fgets(read_buffer, sizeof(read_buffer), leaf_cgroup_procs)) { - ssize_t bytes_written = - write(processes_cgroup_procs_fd, read_buffer, strlen(read_buffer)); - ASSERT_EQ(bytes_written, strlen(read_buffer)) - << "Failed to write to processes cgroups proc file at path " - << processes_cgroup_path_ << " with error " << strerror(errno); - } - ASSERT_EQ(ferror(leaf_cgroup_procs), 0) - << "Failed to read pid from leaf cgroups proc file at path " << leaf_cgroup_ - << " with error " << strerror(errno); - - fclose(leaf_cgroup_procs); - close(processes_cgroup_procs_fd); - } -}; - -TEST_F(SysFsCgroupDriverIntegrationTest, - FixtureCreatesCgroupsAndMigratesProcessToLeafCgroup) { - ASSERT_TRUE(std::filesystem::exists(testing_cgroup_->GetPath())) - << "Test fixture did not initialize the base cgroup at " << testing_cgroup_; - ASSERT_TRUE(std::filesystem::exists(leaf_cgroup_->GetPath())) - << "Test fixture did not initialize the leaf cgroup at " << leaf_cgroup_; - std::ifstream curr_proc_group_file("/proc/" + std::to_string(getpid()) + "/cgroup"); - std::string cgroup_path; - ASSERT_NE(curr_proc_group_file.peek(), EOF) - << "The current process is not running in any cgroup. This should never happen."; - std::getline(curr_proc_group_file, cgroup_path); - ASSERT_TRUE(curr_proc_group_file.good()); - // Extract the cgroup from the path. - std::string cgroup = cgroup_path.substr(cgroup_path.find_last_of('/') + 1); - ASSERT_EQ(cgroup, leaf_cgroup_->GetName()); -} - -namespace ray { - -TEST_F(SysFsCgroupDriverIntegrationTest, - CheckCgroupv2EnabledSucceedsIfOnlyCgroupv2Mounted) { - SysFsCgroupDriver driver; - Status s = driver.CheckCgroupv2Enabled(); - ASSERT_TRUE(s.ok()) << s.ToString(); -} - -TEST_F(SysFsCgroupDriverIntegrationTest, - CheckCgroupFailsIfCgroupv2PathButNoReadPermissions) { - auto cgroup_dir_or_status = - TempCgroupDirectory::Create(testing_cgroup_->GetPath(), 0000); - ASSERT_TRUE(cgroup_dir_or_status.ok()) << cgroup_dir_or_status.ToString(); - auto cgroup_dir = std::move(cgroup_dir_or_status.value()); - SysFsCgroupDriver driver; - Status s = driver.CheckCgroup(cgroup_dir->GetPath()); - EXPECT_TRUE(s.IsPermissionDenied()) << s.ToString(); -} - -TEST_F(SysFsCgroupDriverIntegrationTest, - CheckCgroupFailsIfCgroupv2PathButNoWritePermissions) { - auto cgroup_dir_or_status = - TempCgroupDirectory::Create(testing_cgroup_->GetPath(), S_IRUSR); - ASSERT_TRUE(cgroup_dir_or_status.ok()) << cgroup_dir_or_status.ToString(); - auto cgroup_dir = std::move(cgroup_dir_or_status.value()); - SysFsCgroupDriver driver; - Status s = driver.CheckCgroup(cgroup_dir->GetPath()); - EXPECT_TRUE(s.IsPermissionDenied()) << s.ToString(); -} - -TEST_F(SysFsCgroupDriverIntegrationTest, - CheckCgroupFailsIfCgroupv2PathButNoExecPermissions) { - auto cgroup_dir_or_status = - TempCgroupDirectory::Create(testing_cgroup_->GetPath(), S_IRUSR | S_IWUSR); - ASSERT_TRUE(cgroup_dir_or_status.ok()) << cgroup_dir_or_status.ToString(); - auto cgroup_dir = std::move(cgroup_dir_or_status.value()); - SysFsCgroupDriver driver; - Status s = driver.CheckCgroup(cgroup_dir->GetPath()); - EXPECT_TRUE(s.IsPermissionDenied()) << s.ToString(); -} - -TEST_F(SysFsCgroupDriverIntegrationTest, - CheckCgroupSucceedsIfCgroupv2PathAndReadWriteExecPermissions) { - auto cgroup_dir_or_status = - TempCgroupDirectory::Create(testing_cgroup_->GetPath(), S_IRWXU); - ASSERT_TRUE(cgroup_dir_or_status.ok()) << cgroup_dir_or_status.ToString(); - auto cgroup_dir = std::move(cgroup_dir_or_status.value()); - SysFsCgroupDriver driver; - Status s = driver.CheckCgroup(cgroup_dir->GetPath()); - EXPECT_TRUE(s.ok()) << s.ToString(); -} - -TEST_F(SysFsCgroupDriverIntegrationTest, CreateCgroupFailsIfAlreadyExists) { - auto cgroup_dir_or_status = - TempCgroupDirectory::Create(testing_cgroup_->GetPath(), S_IRWXU); - ASSERT_TRUE(cgroup_dir_or_status.ok()) << cgroup_dir_or_status.ToString(); - auto cgroup_dir = std::move(cgroup_dir_or_status.value()); - SysFsCgroupDriver driver; - Status s = driver.CreateCgroup(cgroup_dir->GetPath()); - ASSERT_TRUE(s.IsAlreadyExists()) << s.ToString(); -} - -TEST_F(SysFsCgroupDriverIntegrationTest, CreateCgroupFailsIfAncestorCgroupDoesNotExist) { - auto cgroup_dir_or_status = - TempCgroupDirectory::Create(testing_cgroup_->GetPath(), S_IRWXU); - ASSERT_TRUE(cgroup_dir_or_status.ok()) << cgroup_dir_or_status.ToString(); - auto cgroup_dir = std::move(cgroup_dir_or_status.value()); - SysFsCgroupDriver driver; - std::string non_existent_path = cgroup_dir->GetPath() + - std::filesystem::path::preferred_separator + "no" + - std::filesystem::path::preferred_separator + "bueno"; - Status s = driver.CreateCgroup(non_existent_path); - EXPECT_TRUE(s.IsNotFound()) << s.ToString(); -} - -TEST_F(SysFsCgroupDriverIntegrationTest, CreateCgroupFailsIfOnlyReadPermissions) { - auto cgroup_dir_or_status = - TempCgroupDirectory::Create(testing_cgroup_->GetPath(), S_IRUSR); - ASSERT_TRUE(cgroup_dir_or_status.ok()) << cgroup_dir_or_status.ToString(); - auto cgroup_dir = std::move(cgroup_dir_or_status.value()); - SysFsCgroupDriver driver; - std::string child_cgroup_path = - cgroup_dir->GetPath() + std::filesystem::path::preferred_separator + "child"; - Status s = driver.CreateCgroup(child_cgroup_path); - EXPECT_TRUE(s.IsPermissionDenied()) << s.ToString(); -} - -TEST_F(SysFsCgroupDriverIntegrationTest, CreateCgroupFailsIfOnlyReadWritePermissions) { - auto cgroup_dir_or_status = - TempCgroupDirectory::Create(testing_cgroup_->GetPath(), S_IRUSR | S_IWUSR); - ASSERT_TRUE(cgroup_dir_or_status.ok()) << cgroup_dir_or_status.ToString(); - auto cgroup_dir = std::move(cgroup_dir_or_status.value()); - SysFsCgroupDriver driver; - std::string child_cgroup_path = - cgroup_dir->GetPath() + std::filesystem::path::preferred_separator + "child"; - Status s = driver.CreateCgroup(child_cgroup_path); - EXPECT_TRUE(s.IsPermissionDenied()) << s.ToString(); -} - -TEST_F(SysFsCgroupDriverIntegrationTest, - CreateCgroupSucceedsIfParentExistsAndReadWriteExecPermissions) { - auto cgroup_dir_or_status = - TempCgroupDirectory::Create(testing_cgroup_->GetPath(), S_IRWXU); - ASSERT_TRUE(cgroup_dir_or_status.ok()) << cgroup_dir_or_status.ToString(); - auto cgroup_dir = std::move(cgroup_dir_or_status.value()); - SysFsCgroupDriver driver; - std::string child_cgroup_path = - cgroup_dir->GetPath() + std::filesystem::path::preferred_separator + "child"; - Status s = driver.CreateCgroup(child_cgroup_path); - EXPECT_TRUE(s.ok()) << s.ToString(); - Status check_status = driver.CheckCgroup(child_cgroup_path); - EXPECT_TRUE(check_status.ok()) << check_status.ToString(); - ASSERT_EQ(rmdir(child_cgroup_path.c_str()), 0) - << "Failed to cleanup test cgroup at path " << child_cgroup_path << ".\n" - << "Error: " << strerror(errno); -} - -TEST_F(SysFsCgroupDriverIntegrationTest, - GetAvailableControllersFailsIfCgroupDoesNotExist) { - std::string non_existent_path = testing_cgroup_->GetPath() + - std::filesystem::path::preferred_separator + "no" + - std::filesystem::path::preferred_separator + "bueno"; - SysFsCgroupDriver driver; - StatusOr> s = - driver.GetAvailableControllers(non_existent_path); - EXPECT_TRUE(s.IsNotFound()) << s.ToString(); -} - -TEST_F(SysFsCgroupDriverIntegrationTest, - GetAvailableControllersFailsIfReadWriteButNotExecutePermissions) { - auto cgroup_dir_or_status = - TempCgroupDirectory::Create(testing_cgroup_->GetPath(), S_IRUSR | S_IWUSR); - ASSERT_TRUE(cgroup_dir_or_status.ok()) << cgroup_dir_or_status.ToString(); - std::unique_ptr cgroup_dir = - std::move(cgroup_dir_or_status.value()); - SysFsCgroupDriver driver; - StatusOr> s = - driver.GetAvailableControllers(cgroup_dir->GetPath()); - EXPECT_TRUE(s.IsPermissionDenied()) << s.ToString(); -} - -TEST_F(SysFsCgroupDriverIntegrationTest, - GetAvailableControllersSucceedsWithCPUAndMemoryControllersOnBaseCgroup) { - SysFsCgroupDriver driver; - StatusOr> s = - driver.GetAvailableControllers(testing_cgroup_->GetPath()); - EXPECT_TRUE(s.ok()) << s.ToString(); - std::unordered_set controllers = std::move(s.value()); - EXPECT_TRUE(controllers.find("cpu") != controllers.end()) - << "Cgroup integration tests expect the base cgroup at " << testing_cgroup_ - << " has the cpu controller available"; -} - -TEST_F(SysFsCgroupDriverIntegrationTest, - GetAvailableControllersSucceedsWithNoAvailableControllers) { - auto parent_cgroup_dir_or_status = - TempCgroupDirectory::Create(testing_cgroup_->GetPath(), S_IRWXU); - ASSERT_TRUE(parent_cgroup_dir_or_status.ok()) << parent_cgroup_dir_or_status.ToString(); - std::unique_ptr parent_cgroup = - std::move(parent_cgroup_dir_or_status.value()); - auto child_cgroup_dir_or_status = - TempCgroupDirectory::Create(parent_cgroup->GetPath(), S_IRWXU); - ASSERT_TRUE(child_cgroup_dir_or_status.ok()) << child_cgroup_dir_or_status.ToString(); - std::unique_ptr child_cgroup = - std::move(child_cgroup_dir_or_status.value()); - SysFsCgroupDriver driver; - StatusOr> s = - driver.GetAvailableControllers(child_cgroup->GetPath()); - EXPECT_TRUE(s.ok()) << s.ToString(); - std::unordered_set controllers = std::move(s.value()); - EXPECT_EQ(controllers.size(), 0); -} - -TEST_F(SysFsCgroupDriverIntegrationTest, MoveAllProcessesFailsIfSourceDoesntExist) { - auto ancestor_cgroup_or_status = - TempCgroupDirectory::Create(testing_cgroup_->GetPath(), S_IRWXU); - ASSERT_TRUE(ancestor_cgroup_or_status.ok()) << ancestor_cgroup_or_status.ToString(); - auto ancestor_cgroup = std::move(ancestor_cgroup_or_status.value()); - auto dest_cgroup_or_status = - TempCgroupDirectory::Create(ancestor_cgroup->GetPath(), S_IRWXU); - ASSERT_TRUE(dest_cgroup_or_status.ok()) << dest_cgroup_or_status.ToString(); - auto dest_cgroup = std::move(dest_cgroup_or_status.value()); - // Do not create the source cgroup - std::string non_existent_path = - ancestor_cgroup->GetPath() + std::filesystem::path::preferred_separator + "nope"; - SysFsCgroupDriver driver; - Status s = driver.MoveAllProcesses(non_existent_path, dest_cgroup->GetPath()); - EXPECT_TRUE(s.IsNotFound()) << s.ToString(); -} - -TEST_F(SysFsCgroupDriverIntegrationTest, MoveAllProcessesFailsIfDestDoesntExist) { - auto ancestor_cgroup_or_status = - TempCgroupDirectory::Create(testing_cgroup_->GetPath(), S_IRWXU); - ASSERT_TRUE(ancestor_cgroup_or_status.ok()) << ancestor_cgroup_or_status.ToString(); - auto ancestor_cgroup = std::move(ancestor_cgroup_or_status.value()); - auto source_cgroup_or_status = - TempCgroupDirectory::Create(ancestor_cgroup->GetPath(), S_IRWXU); - ASSERT_TRUE(source_cgroup_or_status.ok()) << source_cgroup_or_status.ToString(); - auto source_cgroup = std::move(source_cgroup_or_status.value()); - // Do not create the dest cgroup. - std::string non_existent_path = - ancestor_cgroup->GetPath() + std::filesystem::path::preferred_separator + "nope"; - SysFsCgroupDriver driver; - Status s = driver.MoveAllProcesses(source_cgroup->GetPath(), non_existent_path); - EXPECT_TRUE(s.IsNotFound()) << s.ToString(); -} - -TEST_F(SysFsCgroupDriverIntegrationTest, - MoveAllProcessesFailsIfNotReadWriteExecPermissionsForSource) { - auto ancestor_cgroup_or_status = - TempCgroupDirectory::Create(testing_cgroup_->GetPath(), S_IRWXU); - ASSERT_TRUE(ancestor_cgroup_or_status.ok()) << ancestor_cgroup_or_status.ToString(); - auto ancestor_cgroup = std::move(ancestor_cgroup_or_status.value()); - auto source_cgroup_or_status = - TempCgroupDirectory::Create(ancestor_cgroup->GetPath(), S_IRUSR | S_IWUSR); - ASSERT_TRUE(source_cgroup_or_status.ok()) << source_cgroup_or_status.ToString(); - auto source_cgroup = std::move(source_cgroup_or_status.value()); - auto dest_cgroup_or_status = - TempCgroupDirectory::Create(ancestor_cgroup->GetPath(), S_IRWXU); - ASSERT_TRUE(dest_cgroup_or_status.ok()) << dest_cgroup_or_status.ToString(); - auto dest_cgroup = std::move(dest_cgroup_or_status.value()); - SysFsCgroupDriver driver; - Status s = driver.MoveAllProcesses(source_cgroup->GetPath(), dest_cgroup->GetPath()); - EXPECT_TRUE(s.IsPermissionDenied()) << s.ToString(); -} - -TEST_F(SysFsCgroupDriverIntegrationTest, - MoveAllProcessesFailsIfNotReadWriteExecPermissionsForDest) { - auto ancestor_cgroup_or_status = - TempCgroupDirectory::Create(testing_cgroup_->GetPath(), S_IRWXU); - ASSERT_TRUE(ancestor_cgroup_or_status.ok()) << ancestor_cgroup_or_status.ToString(); - auto ancestor_cgroup = std::move(ancestor_cgroup_or_status.value()); - auto source_cgroup_or_status = - TempCgroupDirectory::Create(ancestor_cgroup->GetPath(), S_IRWXU); - ASSERT_TRUE(source_cgroup_or_status.ok()) << source_cgroup_or_status.ToString(); - auto source_cgroup = std::move(source_cgroup_or_status.value()); - auto dest_cgroup_or_status = - TempCgroupDirectory::Create(ancestor_cgroup->GetPath(), S_IRUSR | S_IWUSR); - ASSERT_TRUE(dest_cgroup_or_status.ok()) << dest_cgroup_or_status.ToString(); - auto dest_cgroup = std::move(dest_cgroup_or_status.value()); - SysFsCgroupDriver driver; - Status s = driver.MoveAllProcesses(source_cgroup->GetPath(), dest_cgroup->GetPath()); - EXPECT_TRUE(s.IsPermissionDenied()) << s.ToString(); -} - -TEST_F(SysFsCgroupDriverIntegrationTest, - MoveAllProcessesFailsIfNotReadWriteExecPermissionsForAncestor) { - auto ancestor_cgroup_or_status = - TempCgroupDirectory::Create(testing_cgroup_->GetPath(), S_IRWXU); - ASSERT_TRUE(ancestor_cgroup_or_status.ok()) << ancestor_cgroup_or_status.ToString(); - auto ancestor_cgroup = std::move(ancestor_cgroup_or_status.value()); - auto source_cgroup_or_status = - TempCgroupDirectory::Create(ancestor_cgroup->GetPath(), S_IRWXU); - ASSERT_TRUE(source_cgroup_or_status.ok()) << source_cgroup_or_status.ToString(); - auto source_cgroup = std::move(source_cgroup_or_status.value()); - auto dest_cgroup_or_status = - TempCgroupDirectory::Create(ancestor_cgroup->GetPath(), S_IRWXU); - ASSERT_TRUE(dest_cgroup_or_status.ok()) << dest_cgroup_or_status.ToString(); - auto dest_cgroup = std::move(dest_cgroup_or_status.value()); - ASSERT_EQ(chmod(ancestor_cgroup->GetPath().c_str(), S_IRUSR), 0) - << "Failed to chmod cgroup directory " << ancestor_cgroup->GetPath() - << "\n Error: " << strerror(errno); - SysFsCgroupDriver driver; - Status s = driver.MoveAllProcesses(source_cgroup->GetPath(), dest_cgroup->GetPath()); - EXPECT_TRUE(s.IsPermissionDenied()) << s.ToString(); - // Change the permissions back read, write, and execute so cgroup can be deleted. - ASSERT_EQ(chmod(ancestor_cgroup->GetPath().c_str(), S_IRWXU), 0) - << "Failed to chmod cgroup directory " << ancestor_cgroup->GetPath() - << "\n Error: " << strerror(errno); -} - -TEST_F(SysFsCgroupDriverIntegrationTest, - MoveAllProcessesSucceedsWithCorrectPermissionsAndValidCgroups) { - auto source_cgroup_or_status = - TempCgroupDirectory::Create(testing_cgroup_->GetPath(), S_IRWXU); - ASSERT_TRUE(source_cgroup_or_status.ok()) << source_cgroup_or_status.ToString(); - auto source_cgroup = std::move(source_cgroup_or_status.value()); - auto dest_cgroup_or_status = - TempCgroupDirectory::Create(testing_cgroup_->GetPath(), S_IRWXU); - ASSERT_TRUE(dest_cgroup_or_status.ok()) << dest_cgroup_or_status.ToString(); - auto dest_cgroup = std::move(dest_cgroup_or_status.value()); - StatusOr> child_process_s = - StartChildProcessInCgroup(source_cgroup->GetPath()); - ASSERT_TRUE(child_process_s.ok()) << child_process_s.ToString(); - auto [child_pid, child_pidfd] = child_process_s.value(); - SysFsCgroupDriver driver; - Status s = driver.MoveAllProcesses(source_cgroup->GetPath(), dest_cgroup->GetPath()); - ASSERT_TRUE(s.ok()) << s.ToString(); - // Assert that the child's pid is actually in the new file. - std::string dest_cgroup_procs_file_path = dest_cgroup->GetPath() + - std::filesystem::path::preferred_separator + - "cgroup.procs"; - std::ifstream dest_cgroup_procs_file(dest_cgroup_procs_file_path); - ASSERT_TRUE(dest_cgroup_procs_file.is_open()) - << "Could not open file " << dest_cgroup_procs_file_path << "."; - std::unordered_set dest_cgroup_pids; - int pid = -1; - while (dest_cgroup_procs_file >> pid) { - ASSERT_FALSE(dest_cgroup_procs_file.fail()) - << "Unable to read pid from file " << dest_cgroup_procs_file_path; - dest_cgroup_pids.emplace(pid); - } - EXPECT_EQ(dest_cgroup_pids.size(), 1); - EXPECT_TRUE(dest_cgroup_pids.find(child_pid) != dest_cgroup_pids.end()); - Status terminate_s = - TerminateChildProcessAndWaitForTimeout(child_pid, child_pidfd, 5000); - ASSERT_TRUE(terminate_s.ok()) << terminate_s.ToString(); -} - -TEST_F(SysFsCgroupDriverIntegrationTest, - EnableControllerFailsIfReadOnlyPermissionsForCgroup) { - auto cgroup_dir_or_status = - TempCgroupDirectory::Create(testing_cgroup_->GetPath(), S_IRUSR); - ASSERT_TRUE(cgroup_dir_or_status.ok()) << cgroup_dir_or_status.ToString(); - auto cgroup_dir = std::move(cgroup_dir_or_status.value()); - SysFsCgroupDriver driver; - Status s = driver.EnableController(cgroup_dir->GetPath(), "memory"); - ASSERT_TRUE(s.IsPermissionDenied()) << s.ToString(); -} - -TEST_F(SysFsCgroupDriverIntegrationTest, - EnableControllerFailsIfReadWriteOnlyPermissionsForCgroup) { - auto cgroup_dir_or_status = - TempCgroupDirectory::Create(testing_cgroup_->GetPath(), S_IRUSR | S_IWUSR); - ASSERT_TRUE(cgroup_dir_or_status.ok()) << cgroup_dir_or_status.ToString(); - auto cgroup_dir = std::move(cgroup_dir_or_status.value()); - SysFsCgroupDriver driver; - Status s = driver.EnableController(cgroup_dir->GetPath(), "memory"); - ASSERT_TRUE(s.IsPermissionDenied()) << s.ToString(); -} - -TEST_F(SysFsCgroupDriverIntegrationTest, EnableControllerFailsIfCgroupDoesNotExist) { - std::string non_existent_path = - testing_cgroup_->GetPath() + std::filesystem::path::preferred_separator + "nope"; - SysFsCgroupDriver driver; - Status s = driver.EnableController(non_existent_path, "memory"); - ASSERT_TRUE(s.IsNotFound()) << s.ToString(); -} - -TEST_F(SysFsCgroupDriverIntegrationTest, - EnableControllerFailsIfControllerNotAvailableForCgroup) { - // This will inherit controllers available because testing_cgroup_ has - // CPU and Memory controllers available. - auto cgroup_dir_or_status = - TempCgroupDirectory::Create(testing_cgroup_->GetPath(), S_IRWXU); - ASSERT_TRUE(cgroup_dir_or_status.ok()) << cgroup_dir_or_status.ToString(); - auto cgroup_dir = std::move(cgroup_dir_or_status.value()); - auto nested_cgroup_dir_or_status = - TempCgroupDirectory::Create(cgroup_dir->GetPath(), S_IRWXU); - ASSERT_TRUE(nested_cgroup_dir_or_status.ok()) << nested_cgroup_dir_or_status.ToString(); - auto nested_cgroup_dir = std::move(nested_cgroup_dir_or_status.value()); - // Make sure that the cgroup has 0 available controllers. - // TODO(irabbani): I think it's okay to call the GetAvailableControllers function - // from here. - SysFsCgroupDriver driver; - auto available_controllers_s = - driver.GetAvailableControllers(nested_cgroup_dir->GetPath()); - ASSERT_TRUE(available_controllers_s.ok()) << available_controllers_s.ToString(); - auto available_controllers = std::move(available_controllers_s.value()); - ASSERT_EQ(available_controllers.size(), 0); - Status s = driver.EnableController(nested_cgroup_dir->GetPath(), "memory"); - ASSERT_TRUE(s.IsInvalidArgument()) << s.ToString(); -} - -TEST_F(SysFsCgroupDriverIntegrationTest, DisableControllerFailsIfControllerNotEnabled) { - auto cgroup_dir_or_status = - TempCgroupDirectory::Create(testing_cgroup_->GetPath(), S_IRWXU); - ASSERT_TRUE(cgroup_dir_or_status.ok()) << cgroup_dir_or_status.ToString(); - auto cgroup_dir = std::move(cgroup_dir_or_status.value()); - SysFsCgroupDriver driver; - auto enabled_controllers_s = driver.GetEnabledControllers(cgroup_dir->GetPath()); - ASSERT_TRUE(enabled_controllers_s.ok()) << enabled_controllers_s.ToString(); - auto enabled_controllers = std::move(enabled_controllers_s.value()); - ASSERT_EQ(enabled_controllers.size(), 0); - Status s = driver.DisableController(cgroup_dir->GetPath(), "memory"); - ASSERT_TRUE(s.IsInvalidArgument()) << s.ToString(); -} - -TEST_F(SysFsCgroupDriverIntegrationTest, - DisableControllerFailsIfReadOnlyPermissionsForCgroup) { - auto cgroup_dir_or_status = - TempCgroupDirectory::Create(testing_cgroup_->GetPath(), S_IRUSR); - ASSERT_TRUE(cgroup_dir_or_status.ok()) << cgroup_dir_or_status.ToString(); - auto cgroup_dir = std::move(cgroup_dir_or_status.value()); - SysFsCgroupDriver driver; - Status s = driver.DisableController(cgroup_dir->GetPath(), "memory"); - ASSERT_TRUE(s.IsPermissionDenied()) << s.ToString(); -} - -TEST_F(SysFsCgroupDriverIntegrationTest, - DisableControllerFailsIfReadWriteOnlyPermissionsForCgroup) { - auto cgroup_dir_or_status = - TempCgroupDirectory::Create(testing_cgroup_->GetPath(), S_IRUSR | S_IWUSR); - ASSERT_TRUE(cgroup_dir_or_status.ok()) << cgroup_dir_or_status.ToString(); - auto cgroup_dir = std::move(cgroup_dir_or_status.value()); - SysFsCgroupDriver driver; - Status s = driver.DisableController(cgroup_dir->GetPath(), "memory"); - ASSERT_TRUE(s.IsPermissionDenied()) << s.ToString(); -} - -TEST_F(SysFsCgroupDriverIntegrationTest, DisableControllerFailsIfCgroupDoesNotExist) { - std::string non_existent_path = - testing_cgroup_->GetPath() + std::filesystem::path::preferred_separator + "nope"; - SysFsCgroupDriver driver; - Status s = driver.DisableController(non_existent_path, "memory"); - ASSERT_TRUE(s.IsNotFound()) << s.ToString(); -} - -//// /sys/fs/cgroup/testing (2 available, enable cpu controller) -//// enable one in /sys/fs/cgroup/testing/inner (enable cpu controller fails, then -/// succeeds) / enable one in /sys/fs/cgroup/testing/inner / disable one in -////sys/fs/cgroup/testing -TEST_F(SysFsCgroupDriverIntegrationTest, - EnableAndDisableControllerSucceedWithCorrectInputAndPermissions) { - auto parent_cgroup_dir_or_status = - TempCgroupDirectory::Create(testing_cgroup_->GetPath(), S_IRWXU); - ASSERT_TRUE(parent_cgroup_dir_or_status.ok()) << parent_cgroup_dir_or_status.ToString(); - auto parent_cgroup_dir = std::move(parent_cgroup_dir_or_status.value()); - auto child_cgroup_dir_or_status = - TempCgroupDirectory::Create(parent_cgroup_dir->GetPath(), S_IRWXU); - ASSERT_TRUE(child_cgroup_dir_or_status.ok()) << child_cgroup_dir_or_status.ToString(); - auto child_cgroup_dir = std::move(child_cgroup_dir_or_status.value()); - SysFsCgroupDriver driver; - - // There should be no enabled controllers on the parent cgroup so enabling the memory - // controller should fail. - Status invalid_argument_s = driver.EnableController(child_cgroup_dir->GetPath(), "cpu"); - ASSERT_TRUE(invalid_argument_s.IsInvalidArgument()) << invalid_argument_s.ToString(); - - // Enable the controller on the parent cgroup to make it available on the child cgroup. - Status enable_parent_s = driver.EnableController(parent_cgroup_dir->GetPath(), "cpu"); - ASSERT_TRUE(enable_parent_s.ok()) << enable_parent_s.ToString(); - - // Enable the controller on the child cgroup. - Status enable_child_s = driver.EnableController(child_cgroup_dir->GetPath(), "cpu"); - ASSERT_TRUE(enable_child_s.ok()) << enable_child_s.ToString(); - - // Cannot disable the controller on the parent cgroup while the child cgroup - // still has it enabled. - Status disable_parent_failure_s = - driver.DisableController(parent_cgroup_dir->GetPath(), "cpu"); - ASSERT_FALSE(disable_parent_failure_s.ok()) << enable_parent_s.ToString(); - // Disable the controller on the child cgroup. - Status disable_child_s = driver.DisableController(child_cgroup_dir->GetPath(), "cpu"); - ASSERT_TRUE(disable_child_s.ok()) << disable_child_s.ToString(); - // Can now disable the controller on the parent cgroup. - Status disable_parent_success_s = - driver.DisableController(parent_cgroup_dir->GetPath(), "cpu"); - ASSERT_TRUE(disable_parent_success_s.ok()) << disable_parent_success_s.ToString(); -} - -TEST_F(SysFsCgroupDriverIntegrationTest, AddResourceConstraintFailsIfCgroupDoesntExist) { - std::string non_existent_path = - testing_cgroup_->GetPath() + std::filesystem::path::preferred_separator + "nope"; - SysFsCgroupDriver driver; - Status s = driver.AddConstraint(non_existent_path, "memory.min", "1"); - ASSERT_TRUE(s.IsNotFound()) << s.ToString(); -} - -TEST_F(SysFsCgroupDriverIntegrationTest, - AddResourceConstraintFailsIfReadOnlyPermissions) { - auto cgroup_or_status = - TempCgroupDirectory::Create(testing_cgroup_->GetPath(), S_IRUSR); - ASSERT_TRUE(cgroup_or_status.ok()) << cgroup_or_status.ToString(); - auto cgroup = std::move(cgroup_or_status.value()); - SysFsCgroupDriver driver; - Status s = driver.AddConstraint(cgroup->GetPath(), "memory.min", "1"); - ASSERT_TRUE(s.IsPermissionDenied()) << s.ToString(); -} - -TEST_F(SysFsCgroupDriverIntegrationTest, - AddResourceConstraintFailsIfReadWriteOnlyPermissions) { - auto cgroup_or_status = - TempCgroupDirectory::Create(testing_cgroup_->GetPath(), S_IRUSR | S_IWUSR); - ASSERT_TRUE(cgroup_or_status.ok()) << cgroup_or_status.ToString(); - auto cgroup = std::move(cgroup_or_status.value()); - SysFsCgroupDriver driver; - Status s = driver.AddConstraint(cgroup->GetPath(), "memory.min", "1"); - ASSERT_TRUE(s.IsPermissionDenied()) << s.ToString(); -} - -TEST_F(SysFsCgroupDriverIntegrationTest, - AddResourceConstraintFailsIfConstraintNotSupported) { - auto cgroup_or_status = - TempCgroupDirectory::Create(testing_cgroup_->GetPath(), S_IRWXU); - ASSERT_TRUE(cgroup_or_status.ok()) << cgroup_or_status.ToString(); - auto cgroup = std::move(cgroup_or_status.value()); - SysFsCgroupDriver driver; - // "memory.max" is not supported. - Status s = driver.AddConstraint(cgroup->GetPath(), "memory.max", "1"); - ASSERT_TRUE(s.IsInvalidArgument()) << s.ToString(); -} -TEST_F(SysFsCgroupDriverIntegrationTest, - AddResourceConstraintFailsIfControllerNotEnabled) { - auto cgroup_or_status = - TempCgroupDirectory::Create(testing_cgroup_->GetPath(), S_IRWXU); - ASSERT_TRUE(cgroup_or_status.ok()) << cgroup_or_status.ToString(); - auto cgroup = std::move(cgroup_or_status.value()); - SysFsCgroupDriver driver; - // Memory controller is not enabled. - Status s = driver.AddConstraint(cgroup->GetPath(), "memory.min", "1"); - ASSERT_TRUE(s.IsInvalidArgument()) << s.ToString(); -} -TEST_F(SysFsCgroupDriverIntegrationTest, - AddResourceConstraintFailsIfInvalidConstraintValue) { - auto cgroup_or_status = - TempCgroupDirectory::Create(testing_cgroup_->GetPath(), S_IRWXU); - ASSERT_TRUE(cgroup_or_status.ok()) << cgroup_or_status.ToString(); - auto cgroup = std::move(cgroup_or_status.value()); - SysFsCgroupDriver driver; - // Enable the cpu controller first. - Status enable_controller_s = driver.EnableController(cgroup->GetPath(), "cpu"); - ASSERT_TRUE(enable_controller_s.ok()) << enable_controller_s.ToString(); - // cpu.weight must be between [1,10000] - Status s_too_low = driver.AddConstraint(cgroup->GetPath(), "cpu.weight", "0"); - ASSERT_TRUE(s_too_low.IsInvalidArgument()) << s_too_low.ToString(); - Status s_too_high = driver.AddConstraint(cgroup->GetPath(), "cpu.weight", "10001"); - ASSERT_TRUE(s_too_high.IsInvalidArgument()) << s_too_high.ToString(); -} - -TEST_F(SysFsCgroupDriverIntegrationTest, AddResourceConstraintSucceeds) { - auto cgroup_or_status = - TempCgroupDirectory::Create(testing_cgroup_->GetPath(), S_IRWXU); - ASSERT_TRUE(cgroup_or_status.ok()) << cgroup_or_status.ToString(); - auto cgroup = std::move(cgroup_or_status.value()); - SysFsCgroupDriver driver; - // Enable the cpu controller first. - Status enable_controller_s = driver.EnableController(cgroup->GetPath(), "cpu"); - ASSERT_TRUE(enable_controller_s.ok()) << enable_controller_s.ToString(); - // cpu.weight must be between [1,10000] - Status s = driver.AddConstraint(cgroup->GetPath(), "cpu.weight", "500"); - ASSERT_TRUE(s.ok()) << s.ToString(); -} -} // namespace ray From f4e0cb261c7ec96ae0af323e22ac4b4afc961afa Mon Sep 17 00:00:00 2001 From: irabbani Date: Wed, 30 Jul 2025 18:14:09 +0000 Subject: [PATCH 15/20] final cleanup Signed-off-by: irabbani --- .buildkite/core.rayci.yml | 10 -- src/ray/common/cgroup2/BUILD | 2 - .../common/cgroup2/cgroup_driver_interface.h | 2 +- src/ray/common/cgroup2/sysfs_cgroup_driver.cc | 5 +- src/ray/common/cgroup2/sysfs_cgroup_driver.h | 155 ++++++++++++------ 5 files changed, 108 insertions(+), 66 deletions(-) diff --git a/.buildkite/core.rayci.yml b/.buildkite/core.rayci.yml index ece626c6f328..c820567a46ec 100644 --- a/.buildkite/core.rayci.yml +++ b/.buildkite/core.rayci.yml @@ -305,16 +305,6 @@ steps: - "3.13" # cpp tests - - label: ":ray: core: cgroupv2 tests" - tags: core_cpp - instance_type: medium - commands: - - bazel run //ci/ray_ci:test_in_docker -- //:all //src/... core --only-tags=cgroupv2 - --cache-test-results - --build-name oss-ci-base_test - --build-type cgroup - --privileged - - label: ":ray: core: cgroup tests" tags: core_cpp instance_type: medium diff --git a/src/ray/common/cgroup2/BUILD b/src/ray/common/cgroup2/BUILD index 8b61b1f1110a..082f252f9e4c 100644 --- a/src/ray/common/cgroup2/BUILD +++ b/src/ray/common/cgroup2/BUILD @@ -23,7 +23,5 @@ ray_cc_library( "//src/ray/common:status_or", "//src/ray/util:logging", "@com_google_absl//absl/strings", - # "@com_google_absl//absl/strings:str_format", - # "@com_google_absl//absl/strings:str_join", ], ) diff --git a/src/ray/common/cgroup2/cgroup_driver_interface.h b/src/ray/common/cgroup2/cgroup_driver_interface.h index 0a576bbcf039..aad1c6472942 100644 --- a/src/ray/common/cgroup2/cgroup_driver_interface.h +++ b/src/ray/common/cgroup2/cgroup_driver_interface.h @@ -23,7 +23,7 @@ namespace ray { /** - A utility that allows the caller to check if cgroupv2 is mounted correctly + A utility that can be used to check if cgroupv2 is mounted correctly and perform cgroup operations on the system. It supports the memory and cpu controllers with the memory.min and cpu.weight constraints respectively. diff --git a/src/ray/common/cgroup2/sysfs_cgroup_driver.cc b/src/ray/common/cgroup2/sysfs_cgroup_driver.cc index 752589943222..f0fcf28ba058 100644 --- a/src/ray/common/cgroup2/sysfs_cgroup_driver.cc +++ b/src/ray/common/cgroup2/sysfs_cgroup_driver.cc @@ -71,7 +71,7 @@ Status SysFsCgroupDriver::CheckCgroupv2Enabled() { if (found_cgroupv1 && found_cgroupv2) { return Status::Invalid("Cgroupv1 and cgroupv2 are both mounted. Unmount cgroupv1."); } else if (found_cgroupv1 && !found_cgroupv2) { - // TODO(irabbani): provide a link to the ray documentation once it's been written + // TODO(#54703): provide a link to the ray documentation once it's been written // for how to troubleshoot these. return Status::Invalid( "Cgroupv1 is mounted and cgroupv2 is not mounted. " @@ -301,7 +301,6 @@ Status SysFsCgroupDriver::AddConstraint(const std::string &cgroup, const std::string &constraint_value) { RAY_RETURN_NOT_OK(CheckCgroup(cgroup)); auto constraint_it = supported_constraints_.find(constraint); - // Return InvalidArgument if constraint is not enabled. if (constraint_it == supported_constraints_.end()) { std::string supported_constraint_names("["); for (auto it = supported_constraints_.begin(); it != supported_constraints_.end(); @@ -355,8 +354,6 @@ Status SysFsCgroupDriver::AddConstraint(const std::string &cgroup, int fd = open(file_path.c_str(), O_RDWR); - // TODO(irabbani): If meaningful, the errno can be used to disambiguate different - // errors if they're appropriate. if (fd == -1) { return Status::InvalidArgument( absl::StrFormat("Failed to apply %s=%s to cgroup %s.\n" diff --git a/src/ray/common/cgroup2/sysfs_cgroup_driver.h b/src/ray/common/cgroup2/sysfs_cgroup_driver.h index 353220b7469e..d8cb86454f09 100644 --- a/src/ray/common/cgroup2/sysfs_cgroup_driver.h +++ b/src/ray/common/cgroup2/sysfs_cgroup_driver.h @@ -58,12 +58,6 @@ class SysFsCgroupDriver : public CgroupDriverInterface { SysFsCgroupDriver &operator=(const SysFsCgroupDriver &other) = delete; SysFsCgroupDriver &operator=(const SysFsCgroupDriver &&other) = delete; - // The recommended way to mount cgroupv2 only and disable cgroupv1. This will - // prevent controllers from being migrated between the two. This is what systemd - // does and recommends. See the following for more information: - // https://github.com/systemd/systemd/blob/main/docs/CGROUP_DELEGATION.md#hierarchy-and-controller-support - // https://kubernetes.io/docs/concepts/architecture/cgroups/#linux-distribution-cgroup-v2-support - /** The recommended way to mount cgroupv2 is with cgroupv1 disabled. This prevents cgroup controllers from being migrated between the two modes. This follows @@ -83,24 +77,27 @@ class SysFsCgroupDriver : public CgroupDriverInterface { cgroup /sys/fs/cgroup cgroup rw,nosuid,nodev,noexec,relatime,nsdelegate cgroup2 /sys/fs/cgroup/unified/ cgroup2 rw,nosuid,nodev,noexec,relatime,nsdelegate - @return OK if no errors are encounted, otherwise Invalid. - */ + @return OK if no errors + @return Status::Invalid if cgroupv2 is not enabled correctly. + */ Status CheckCgroupv2Enabled() override; /** Checks to see if the cgroup_path is mounted in the cgroupv2 filesystem and that the current process has read, write, and execute permissions for - the directory. + the directory. Uses the CGROUP_SUPER_MAGIC to detect that the filesystem + is mounted as cgroupv2. - @param cgroup_path the path of a cgroup directory (e.g. /sys/fs/cgroup/ray) - - @return OK if no errors are encounted. Otherwise, one of the following errors - NotFound if the cgroup does not exist. - PermissionDenied if current user doesn't have read, write, and execute permissions. - InvalidArgument if the cgroup is not using cgroupv2. + @param cgroup_path the path of a cgroup directory. @see The kernel documentation for CGROUP2_SUPER_MAGIC https://www.kernel.org/doc/html/v5.4/admin-guide/cgroup-v2.html#mounting + + @return Status::OK if no errors are encounted. + @return Status::NotFound if the cgroup does not exist. + @return Status::PermissionDenied if current user doesn't have read, write, and execute + permissions. + @return Status::InvalidArgument if the cgroup is not using cgroupv2. */ Status CheckCgroup(const std::string &cgroup_path) override; @@ -114,88 +111,148 @@ class SysFsCgroupDriver : public CgroupDriverInterface { @param cgroup_path the absolute path of the cgroup directory to create. - @return OK if no errors are encounted. Otherwise, one of the following errors - NotFound if an ancestor cgroup does not exist. - PermissionDenied if current user doesn't have read, write, and execute permissions. - AlreadyExists if the cgroup already exists. + @return Status::OK if no errors are encounted. + @return Status::NotFound if an ancestor cgroup does not exist. + @return Status::PermissionDenied if current user doesn't have read, write, and execute + permissions. + @return Status::AlreadyExists if the cgroup already exists. */ Status CreateCgroup(const std::string &cgroup_path) override; /** - - Checks to see if cgroup_dir is a valid cgroup, @see SysFsCgroupDriver::CheckCgroup, - and returns an appropriate error if not. - - Parses a cgroup.controllers file which has a space separated list of all controllers + Parses the cgroup.controllers file which has a space separated list of all controllers available to the cgroup. @see For details of the cgroup.controllers file https://docs.kernel.org/admin-guide/cgroup-v2.html#enabling-and-disabling. @param cgroup_path absolute path of the cgroup. - @returns OK with a set of controllers if successful, otherwise one of following - NotFound if the cgroup does not exist. - PermissionDenied if current user doesn't have read, write, and execute permissions. - InvalidArgument if the cgroup is not using cgroupv2 or malformed controllers file. + @return Status::OK with a set of controllers if successful. + @return Status::NotFound if the cgroup does not exist. + @return Status::PermissionDenied if current user doesn't have read, write, and execute + permissions. + @return Status::InvalidArgument if the cgroup is not using cgroupv2 or malformed + controllers file. */ StatusOr> GetAvailableControllers( const std::string &cgroup_dir) override; - // + /** + Parses the cgroup.subtree_control file which has a space separated list of all + controllers enabled in the cgroup. + + @see For details of the cgroup.subtree_control file + https://docs.kernel.org/admin-guide/cgroup-v2.html#enabling-and-disabling. + + @param cgroup_path absolute path of the cgroup. + @return Status::OK with a set of controllers if successful. + @return Status::NotFound if the cgroup does not exist. + @return Status::PermissionDenied if current user doesn't have read, write, and execute + permissions. + @return Status::InvalidArgument if the cgroup is not using cgroupv2 or if the + cgroup.subtree_control is malformed. + */ StatusOr> GetEnabledControllers( const std::string &cgroup_dir) override; - // to - - // from - - // Fails if to doesn't exist and from doesn't exist /** - Reads the cgroup.procs of from and writes them out to the given file. + Reads the cgroup.procs of "from" and writes them out to the given file. The cgroup.procs file is newline seperated. The current user must have read-write permissions to both cgroup.procs file as well as the common ancestor of the source and destination cgroups. @see The cgroup.procs section for more information https://docs.kernel.org/admin-guide/cgroup-v2.html#core-interface-files - @return InvalidArgument if process files could not be opened, read from, or written to - correctly, ok otherwise. + + @return Status::OK with if successful. + @return Status::NotFound if the cgroup does not exist. + @return Status::PermissionDenied if current user doesn't have read, write, and execute + permissions. + @return Status::InvalidArgument if the cgroup is not using cgroupv2. + @return Status::Invalid if files could not be opened, read from, or written to + correctly. */ Status MoveAllProcesses(const std::string &from, const std::string &to) override; /** - The cgroup.subtree_control file has the list of all currently enabled - controllers for the cgroup. There are two important caveats: + Enables a controller by writing to the cgroup.subtree_control file. This can + only happen if - 1. The no internal process constraint - 2. The controller needs to be enabled through the entiree path to thee cgroup - constraint. + 1. The controller is not enabled in the parent see cgroup. + 2. The cgroup is not a leaf node i.e. it has children. This is called the no internal + process constraint - @param cgroup_path - @param controller + @see the cgroup documentation for the cgroup.subtree_control file + https://docs.kernel.org/admin-guide/cgroup-v2.html#controlling-controllers + + @param cgroup_path absolute path of the cgroup. + @param controller name of the controller i.e. "cpu" or "memory" from + @ref CgroupDriverInterface::supported_controllers_ "supported controllers". + + @return Status::OK if successful + @return Status::NotFound if the cgroup does not exist. + @return Status::PermissionDenied if current user doesn't have read, write, and execute + permissions. + @return Status::InvalidArgument if the cgroup is not using cgroupv2, if the controller + is not available i.e not enabled on the parent. + @return Status::Invalid if cannot open or write to cgroup.subtree_control. */ Status EnableController(const std::string &cgroup_path, const std::string &controller) override; + /** + Disables a controller by writing to the cgroup.subtree_control file. This can + only happen if the controller is not enabled in child cgroups. + + @see the cgroup documentation for the cgroup.subtree_control file + https://docs.kernel.org/admin-guide/cgroup-v2.html#controlling-controllers + + @param cgroup_path absolute path of the cgroup. + @param controller name of the controller i.e. "cpu" or "memory" from + @ref CgroupDriverInterface::supported_controllers_ "supported controllers". + + @return Status::OK if successful. + @return Status::NotFound if the cgroup does not exist. + @return Status::PermissionDenied if current user doesn't have read, write, and execute + permissions. + @return Status::InvalidArgument if the cgroup is not using cgroupv2, if the controller + is not available i.e not enabled on the parent. + @return Status::Invalid if cannot open or write to cgroup.subtree_control. + */ Status DisableController(const std::string &cgroup_path, const std::string &controller) override; /** - @returns - InvalidArgument if the cgroup is not valid, or if the constraint file doesn't - exist. - InvalidArgument if the constraint is not supported. - + Adds a constraint to the respective cgroup file. See + @ref CgroupDriverInterface::supported_constraints_ "supported constraints" and valid + values. + + @return Status::OK if no errors are encounted. + @return Status::NotFound if the cgroup does not exist. + @return Status::PermissionDenied if current user doesn't have read, write, and execute + permissions. + @return Status::InvalidArgument if the cgroup is not using cgroupv2, the constraint + is not supported in ray, the constraint value is out of range, or if cannot write + to the relevant constraint file. */ Status AddConstraint(const std::string &cgroup, const std::string &constraint, const std::string &constraint_value) override; private: - // cgroup.subtree_control and cgroup.controllers both have the same format. - // assumes caller checks the validity of the cgroup + /** + @param controller_file_path the absolute path of the controller file to read which is + one of cgroup.subtree_control or cgroup.controllers. + + @return Status::OK with a list of controllers in the file. + @return Status::InvalidArgument if failed to read file or file was malformed. + */ StatusOr> ReadControllerFile( const std::string &controller_file_path); + // Used for unit testing through the constructor. std::string mount_file_path_; + static constexpr std::string_view kCgroupProcsFilename = "cgroup.procs"; static constexpr std::string_view kCgroupSubtreeControlFilename = "cgroup.subtree_control"; From 544ba83d84ce67e25527fa9966f368b47a1daa11 Mon Sep 17 00:00:00 2001 From: irabbani Date: Wed, 30 Jul 2025 18:17:59 +0000 Subject: [PATCH 16/20] iwyu Signed-off-by: irabbani --- src/ray/common/cgroup2/sysfs_cgroup_driver.cc | 1 - 1 file changed, 1 deletion(-) diff --git a/src/ray/common/cgroup2/sysfs_cgroup_driver.cc b/src/ray/common/cgroup2/sysfs_cgroup_driver.cc index f0fcf28ba058..cd4576ea8740 100644 --- a/src/ray/common/cgroup2/sysfs_cgroup_driver.cc +++ b/src/ray/common/cgroup2/sysfs_cgroup_driver.cc @@ -24,7 +24,6 @@ #include #include -#include #include #include #include From 2e341d689307a46025aa6968cfa1cfda79de9376 Mon Sep 17 00:00:00 2001 From: irabbani Date: Wed, 30 Jul 2025 18:31:50 +0000 Subject: [PATCH 17/20] we cpplintin! Signed-off-by: irabbani --- .pre-commit-config.yaml | 2 +- src/ray/common/cgroup2/cgroup_driver_interface.h | 2 ++ src/ray/common/cgroup2/sysfs_cgroup_driver.cc | 2 ++ src/ray/common/cgroup2/sysfs_cgroup_driver.h | 2 ++ src/ray/common/cgroup2/test/cgroup_test_utils.h | 3 ++- 5 files changed, 9 insertions(+), 2 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 5b924789d872..69ef98aaef03 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -73,7 +73,7 @@ repos: hooks: - id: cpplint args: ["--filter=-whitespace/braces,-whitespace/line_length,-build/c++11,-build/c++14,-build/c++17,-readability/braces,-whitespace/indent_namespace,-runtime/int,-runtime/references,-build/include_order"] - files: ^src/ray/(common/ray_syncer|util|raylet_client|internal|scheduling|pubsub|object_manager|rpc(?:/.*)?|raylet|core_worker)/.*\.(h|cc)$ + files: ^src/ray/(common/cgroup2|common/ray_syncer|util|raylet_client|internal|scheduling|pubsub|object_manager|rpc(?:/.*)?|raylet|core_worker)/.*\.(h|cc)$ exclude: | (?x)^( src/ray/raylet/scheduling/.*\.(h|cc)$ | diff --git a/src/ray/common/cgroup2/cgroup_driver_interface.h b/src/ray/common/cgroup2/cgroup_driver_interface.h index aad1c6472942..132000c79ff5 100644 --- a/src/ray/common/cgroup2/cgroup_driver_interface.h +++ b/src/ray/common/cgroup2/cgroup_driver_interface.h @@ -13,9 +13,11 @@ // limitations under the License. #pragma once +#include #include #include #include +#include #include "ray/common/status.h" #include "ray/common/status_or.h" diff --git a/src/ray/common/cgroup2/sysfs_cgroup_driver.cc b/src/ray/common/cgroup2/sysfs_cgroup_driver.cc index cd4576ea8740..a4c6e9a34063 100644 --- a/src/ray/common/cgroup2/sysfs_cgroup_driver.cc +++ b/src/ray/common/cgroup2/sysfs_cgroup_driver.cc @@ -24,6 +24,7 @@ #include #include +#include #include #include #include @@ -31,6 +32,7 @@ #include #include #include +#include #include "absl/strings/str_format.h" #include "absl/strings/str_join.h" diff --git a/src/ray/common/cgroup2/sysfs_cgroup_driver.h b/src/ray/common/cgroup2/sysfs_cgroup_driver.h index d8cb86454f09..fd56d129617b 100644 --- a/src/ray/common/cgroup2/sysfs_cgroup_driver.h +++ b/src/ray/common/cgroup2/sysfs_cgroup_driver.h @@ -11,12 +11,14 @@ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. +#pragma once #include #include #include #include +#include #include "ray/common/cgroup2/cgroup_driver_interface.h" #include "ray/common/status.h" diff --git a/src/ray/common/cgroup2/test/cgroup_test_utils.h b/src/ray/common/cgroup2/test/cgroup_test_utils.h index 222ed385160b..f1622d413573 100644 --- a/src/ray/common/cgroup2/test/cgroup_test_utils.h +++ b/src/ray/common/cgroup2/test/cgroup_test_utils.h @@ -11,6 +11,7 @@ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. +#pragma once #include @@ -29,7 +30,7 @@ class TempDirectory { public: static ray::StatusOr> Create(); - TempDirectory(std::string &&path) : path_(path) {} + explicit TempDirectory(std::string &&path) : path_(path) {} TempDirectory(const TempDirectory &) = delete; TempDirectory(TempDirectory &&) = delete; From 9e46ce6c844d7ab1664fc2350fefa052e8b4b5bf Mon Sep 17 00:00:00 2001 From: Ibrahim Rabbani Date: Wed, 30 Jul 2025 11:32:44 -0700 Subject: [PATCH 18/20] Update src/ray/common/cgroup2/sysfs_cgroup_driver.cc Co-authored-by: Edward Oakes Signed-off-by: Ibrahim Rabbani --- src/ray/common/cgroup2/sysfs_cgroup_driver.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/ray/common/cgroup2/sysfs_cgroup_driver.cc b/src/ray/common/cgroup2/sysfs_cgroup_driver.cc index cd4576ea8740..d35e8001d6f7 100644 --- a/src/ray/common/cgroup2/sysfs_cgroup_driver.cc +++ b/src/ray/common/cgroup2/sysfs_cgroup_driver.cc @@ -96,7 +96,7 @@ Status SysFsCgroupDriver::CheckCgroup(const std::string &cgroup_path) { strerror(errno))); } return Status::InvalidArgument( - absl::StrFormat("Failed to stat cgroup directory at path %s. because of %s", + absl::StrFormat("Failed to stat cgroup directory at path %s because of %s", cgroup_path, strerror(errno))); } From 7c745c53afc7ac785dcdca85a4d1eb6393578b11 Mon Sep 17 00:00:00 2001 From: Ibrahim Rabbani Date: Wed, 30 Jul 2025 11:33:45 -0700 Subject: [PATCH 19/20] Apply suggestions from code review Co-authored-by: Edward Oakes Signed-off-by: Ibrahim Rabbani --- src/ray/common/cgroup2/sysfs_cgroup_driver.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/ray/common/cgroup2/sysfs_cgroup_driver.cc b/src/ray/common/cgroup2/sysfs_cgroup_driver.cc index d35e8001d6f7..0bb3eeb98b3a 100644 --- a/src/ray/common/cgroup2/sysfs_cgroup_driver.cc +++ b/src/ray/common/cgroup2/sysfs_cgroup_driver.cc @@ -109,7 +109,7 @@ Status SysFsCgroupDriver::CheckCgroup(const std::string &cgroup_path) { cgroup_path)); } - // Note: the process needs execute permissions for the cgroup directory + // NOTE: the process needs execute permissions for the cgroup directory // to traverse the filesystem. if (access(cgroup_path.c_str(), R_OK | W_OK | X_OK) == -1) { return Status::PermissionDenied( @@ -204,7 +204,7 @@ Status SysFsCgroupDriver::MoveAllProcesses(const std::string &from, while (in_file >> pid) { if (in_file.fail()) { return Status::Invalid(absl::StrFormat( - "Could not read pid from cgroup procs file %s", from_procs_file_path)); + "Could not read PID from cgroup procs file %s", from_procs_file_path)); } out_file << pid; out_file.flush(); From d7eb86339909d709832f1c5fb40ba25985dc664b Mon Sep 17 00:00:00 2001 From: irabbani Date: Wed, 30 Jul 2025 18:35:30 +0000 Subject: [PATCH 20/20] bug Signed-off-by: irabbani --- src/ray/common/cgroup2/sysfs_cgroup_driver.cc | 1 + 1 file changed, 1 insertion(+) diff --git a/src/ray/common/cgroup2/sysfs_cgroup_driver.cc b/src/ray/common/cgroup2/sysfs_cgroup_driver.cc index a4c6e9a34063..b3d92d78b4b4 100644 --- a/src/ray/common/cgroup2/sysfs_cgroup_driver.cc +++ b/src/ray/common/cgroup2/sysfs_cgroup_driver.cc @@ -368,6 +368,7 @@ Status SysFsCgroupDriver::AddConstraint(const std::string &cgroup, ssize_t bytes_written = write(fd, constraint_value.c_str(), constraint_value.size()); if (bytes_written != static_cast(constraint_value.size())) { + close(fd); return Status::InvalidArgument( absl::StrFormat("Failed to apply %s=%s to cgroup %s.\n" "Error: %s",