Skip to content
Merged
Show file tree
Hide file tree
Changes from 16 commits
Commits
Show all changes
24 commits
Select commit Hold shift + click to select a range
05c4dbc
[core] (cgroups 1/n) Adding a sys/fs filesystem driver
israbbani Jul 24, 2025
645f9a0
adding the copyright
israbbani Jul 24, 2025
2bb2c5b
Adding a fallback for creating processes inside cgroups with fork/exec
israbbani Jul 24, 2025
4793094
adding a pause in the tests to see what's up with the container
israbbani Jul 25, 2025
85d0ebf
Update src/ray/common/cgroup2/cgroup_driver_interface.h
israbbani Jul 25, 2025
3a5a020
Comments
israbbani Jul 25, 2025
68b0c93
Merge branch 'irabbani/cgroups-1' of github.com:ray-project/ray into …
israbbani Jul 25, 2025
f52354b
Putting the cgroupv2 tests into a separate target
israbbani Jul 29, 2025
148d04d
removing test sleep
israbbani Jul 29, 2025
d3f8b79
Removing a docstring
israbbani Jul 29, 2025
d76ff15
enabling CI tests
israbbani Jul 29, 2025
2798ea5
fixing absl imports
israbbani Jul 29, 2025
3fda505
commenting local
israbbani Jul 29, 2025
9e1e931
doxygen formatting
israbbani Jul 29, 2025
f066f34
Merge branch 'master' into irabbani/cgroups-1
israbbani Jul 30, 2025
e6b4926
removing integration tests
israbbani Jul 30, 2025
f4e0cb2
final cleanup
israbbani Jul 30, 2025
544ba83
iwyu
israbbani Jul 30, 2025
669ba99
Merge branch 'master' into irabbani/cgroups-1
israbbani Jul 30, 2025
2e341d6
we cpplintin!
israbbani Jul 30, 2025
9e46ce6
Update src/ray/common/cgroup2/sysfs_cgroup_driver.cc
israbbani Jul 30, 2025
7c745c5
Apply suggestions from code review
israbbani Jul 30, 2025
d7eb863
bug
israbbani Jul 30, 2025
ff64534
Merge branch 'irabbani/cgroups-1' of github.com:ray-project/ray into …
israbbani Jul 30, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 10 additions & 0 deletions .buildkite/core.rayci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -305,6 +305,16 @@ steps:
- "3.13"

# cpp tests
# Runs only the targets tagged `cgroupv2`, inside a privileged container
# (see the --only-tags and --privileged flags below).
- label: ":ray: core: cgroupv2 tests"
tags: core_cpp
instance_type: medium
commands:
- bazel run //ci/ray_ci:test_in_docker -- //:all //src/... core --only-tags=cgroupv2
--cache-test-results
--build-name oss-ci-base_test
--build-type cgroup
--privileged

- label: ":ray: core: cgroup tests"
tags: core_cpp
instance_type: medium
Expand Down
29 changes: 29 additions & 0 deletions src/ray/common/cgroup2/BUILD
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
load("//bazel:ray.bzl", "ray_cc_library")

# Header-only target (no srcs) that exposes cgroup_driver_interface.h.
# Its only dependencies are the Status and StatusOr types.
ray_cc_library(
name = "cgroup_driver_interface",
hdrs = [
"cgroup_driver_interface.h",
],
deps = [
"//src/ray/common:status",
"//src/ray/common:status_or",
],
)

# The sysfs cgroup driver (sysfs_cgroup_driver.{h,cc}). Builds on top of the
# :cgroup_driver_interface target.
ray_cc_library(
name = "sysfs_cgroup_driver",
srcs = ["sysfs_cgroup_driver.cc"],
hdrs = [
"sysfs_cgroup_driver.h",
],
deps = [
":cgroup_driver_interface",
"//src/ray/common:status",
"//src/ray/common:status_or",
"//src/ray/util:logging",
"@com_google_absl//absl/strings",
# NOTE(review): the two entries below are disabled. Confirm that
# "@com_google_absl//absl/strings" alone covers the absl headers included by
# sysfs_cgroup_driver.cc, then either restore or delete them.
# "@com_google_absl//absl/strings:str_format",
# "@com_google_absl//absl/strings:str_join",
],
)
204 changes: 204 additions & 0 deletions src/ray/common/cgroup2/cgroup_driver_interface.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,204 @@
// Copyright 2025 The Ray Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once

#include <cstddef>
#include <limits>
#include <string>
#include <unordered_map>
#include <unordered_set>
#include <utility>

#include "ray/common/status.h"
#include "ray/common/status_or.h"

namespace ray {

/**
A utility that allows the caller to check if cgroupv2 is mounted correctly
and perform cgroup operations on the system. It supports the memory and cpu controllers
with the memory.min and cpu.weight constraints respectively.

@see The cgroupv2 documentation for more details:
https://docs.kernel.org/admin-guide/cgroup-v2.html
*/
class CgroupDriverInterface {
public:
virtual ~CgroupDriverInterface() = default;

/**
Checks to see if only cgroupv2 is enabled (known as unified mode) on the system.
If cgroupv2 is not enabled, or enabled along with cgroupv1, returns Invalid
with the appropriate error message.

@see systemd's documentation for more information about unified mode:
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

nit(?): make sure doxygen understands how to deal with this (as the source url is on a new line)

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It works surprisingly well! Here's a screenshot from the generated docs:
image

https://github.com/systemd/systemd/blob/main/docs/CGROUP_DELEGATION.md#hierarchy-and-controller-support

@see K8S documentation on how to enable cgroupv2 and check if it's enabled correctly:
https://kubernetes.io/docs/concepts/architecture/cgroups/#linux-distribution-cgroup-v2-support

@return Status::OK if successful.
@return Status::Invalid if cgroupv2 is not enabled correctly.
*/
virtual Status CheckCgroupv2Enabled() = 0;

/**
Checks that the cgroup is valid. See return values for details of which
invariants are checked.

@param cgroup the absolute path of the cgroup.
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

assuming this is some java docstring style?

we should keep it standard, either:

  1. follow what we currently have
  2. let's commit to migrating to this style if there's a strong benefit to it

(FWIW I find the style you have more readable)

Copy link
Contributor Author

@israbbani israbbani Jul 29, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I've updated the PR description to talk about this. The style is called doxygen. It can be used to automatically generate API docs for developers. I think it has a few advantages

  • I find it more readable than what we currently have.
  • The automated API docs will be useful for public contributions and internal contributors to search through APIs. E.g. it can help answer questions like do we have a utility class that joins paths?
  • There are special tags that the docs generator can use e.g. @param, @tparam (for template params), @ref, @see etc. The whole list is here, but we can start with a few of these.

I'd prefer to change our current ones to these and use this as a standard moving forward.


@return Status::OK if no errors are encountered. Otherwise, one of the following errors:
@return Status::NotFound if the cgroup does not exist.
@return Status::PermissionDenied if current user doesn't have read, write, and execute
permissions.
@return Status::InvalidArgument if the cgroup is not using cgroupv2.
*/
virtual Status CheckCgroup(const std::string &cgroup) = 0;

/**
Creates a new cgroup at the specified path.
Expects every ancestor cgroup on the path from the root to the new cgroup to already
exist.
Expects the user to have read, write, and execute permissions on the parent cgroup.

@param cgroup the absolute path of the cgroup to create.

@return Status::OK if no errors are encountered. Otherwise, one of the following errors:
@return Status::NotFound if an ancestor cgroup does not exist.
@return Status::PermissionDenied if the current user doesn't have read, write, and
execute permissions.
@return Status::AlreadyExists if the cgroup already exists.
*/
virtual Status CreateCgroup(const std::string &cgroup) = 0;

/**
Moves all processes from one cgroup to another. The caller must have read, write, and
execute permissions for both cgroups and their lowest common ancestor.

@see The relevant section of the cgroup documentation for more details:
https://docs.kernel.org/admin-guide/cgroup-v2.html#delegation-containment

@param from the absolute path of the cgroup to migrate processes out of.
@param to the absolute path of the cgroup to migrate processes into.

@return Status::OK if no errors are encountered. Otherwise, one of the following errors:
@return Status::NotFound if either `from` or `to` does not exist.
@return Status::PermissionDenied if the current user doesn't have read, write, and
execute permissions.
@return Status::Invalid if any errors occur while reading from and writing to
cgroups.
*/
virtual Status MoveAllProcesses(const std::string &from, const std::string &to) = 0;

/**
Enables an available controller on a cgroup. A controller can be enabled if
1) the controller is enabled in the parent of the cgroup, and
2) the cgroup has no children, i.e. it's a leaf node.

@param cgroup the absolute path of the cgroup.
@param controller the name of the controller (e.g. "cpu" and not "+cpu").

@see No Internal Process Constraint for more details:
https://docs.kernel.org/admin-guide/cgroup-v2.html#no-internal-process-constraint

@return Status::OK if successful. Otherwise, one of the following errors:
@return Status::NotFound if the cgroup does not exist.
@return Status::PermissionDenied if the current user doesn't have read, write, and
execute permissions for the cgroup.
@return Status::InvalidArgument if the controller is not available or if the cgroup is
not a cgroupv2.
@return Status::Invalid for all other failures.
*/
virtual Status EnableController(const std::string &cgroup,
const std::string &controller) = 0;

/**
Disables an enabled controller in a cgroup. A controller can be disabled only if it
is not enabled on a child cgroup.

@param cgroup the absolute path of the cgroup.
@param controller the name of the controller (e.g. "cpu" and not "-cpu").

@return Status::OK if successful. Otherwise, one of the following errors:
@return Status::NotFound if the cgroup does not exist.
@return Status::PermissionDenied if the current user doesn't have read, write, and
execute permissions for the cgroup.
@return Status::InvalidArgument if the controller is not enabled or if the cgroup is
not a cgroupv2.
@return Status::Invalid for all other failures.
*/
virtual Status DisableController(const std::string &cgroup,
const std::string &controller) = 0;
/**
Adds a resource constraint to the cgroup. To add a constraint
1) the cgroup must have the relevant controller enabled, e.g. memory.min cannot be
set if the memory controller is not enabled.
2) the constraint must be supported in Ray (@see supported_constraints_).
3) the constraint value must be in the correct range (@see supported_constraints_).

@param cgroup the absolute path of the cgroup.
@param constraint the name of the constraint (e.g. "cpu.weight").
@param value the value of the constraint.

@return Status::OK if successful. Otherwise, one of the following errors:
@return Status::NotFound if the cgroup does not exist.
@return Status::PermissionDenied if the current user doesn't have read, write, and
execute permissions for the cgroup.
@return Status::InvalidArgument if the cgroup is not valid, the constraint is not
supported, or the value is not valid for the constraint.
*/
virtual Status AddConstraint(const std::string &cgroup,
const std::string &constraint,
const std::string &value) = 0;
/**
Returns the set of controllers that can be enabled on the given cgroup, based on
what is enabled on the parent cgroup.

@param cgroup the absolute path of the cgroup.

@return Status::OK with a set of controller names if successful. Otherwise, one of the
following errors:
@return Status::NotFound if the cgroup does not exist.
@return Status::PermissionDenied if the current user doesn't have read, write, and
execute permissions.
@return Status::InvalidArgument if the cgroup is not using cgroupv2 or its controllers
file is malformed.
*/
virtual StatusOr<std::unordered_set<std::string>> GetAvailableControllers(
const std::string &cgroup) = 0;

/**
Returns the set of controllers enabled on the cgroup.

@param cgroup the absolute path of the cgroup.

@return Status::OK with a set of controller names if successful. Otherwise, one of the
following errors:
@return Status::NotFound if the cgroup does not exist.
@return Status::PermissionDenied if the current user doesn't have read, write, and
execute permissions.
@return Status::InvalidArgument if the cgroup is not using cgroupv2 or its controllers
file is malformed.
*/
virtual StatusOr<std::unordered_set<std::string>> GetEnabledControllers(
const std::string &cgroup) = 0;

/**
Describes one supported resource constraint: the range of values it accepts and the
controller it belongs to.
*/
struct Constraint {
// Bounds for the constraint's value, e.g. {1, 10000} for cpu.weight.
// NOTE(review): presumably {min, max} with both ends inclusive -- confirm against the
// driver implementation.
std::pair<size_t, size_t> range;
// Name of the controller this constraint belongs to, e.g. "cpu" for cpu.weight.
std::string controller;
};

protected:
// Constraints supported by the driver, keyed by constraint name. Each entry gives the
// accepted value range and the controller the constraint belongs to.
const std::unordered_map<std::string, Constraint> supported_constraints_ = {
{"cpu.weight", {{1, 10000}, "cpu"}},
{"memory.min", {{0, std::numeric_limits<size_t>::max()}, "memory"}},
};
// Names of the controllers supported by the driver.
const std::unordered_set<std::string> supported_controllers_ = {"cpu", "memory"};
};
} // namespace ray
Loading