-
Notifications
You must be signed in to change notification settings - Fork 7k
[core] (cgroups 14/n) Clean up bazel targets and support cross-platform build. #57244
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 14 commits
599f968
77f6b64
9fd1160
a5f4b5a
762b5cf
8d07f6f
b92677e
a870d5a
e34f19b
55f1ec8
4c7545e
aef6bd8
59366ce
44ab09e
0c8d8e3
6dc39ad
60d77bb
bfd2482
9174357
ab526bf
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,69 @@ | ||
| // Copyright 2025 The Ray Authors. | ||
| // | ||
| // Licensed under the Apache License, Version 2.0 (the "License"); | ||
| // you may not use this file except in compliance with the License. | ||
| // You may obtain a copy of the License at | ||
| // | ||
| // http://www.apache.org/licenses/LICENSE-2.0 | ||
| // | ||
| // Unless required by applicable law or agreed to in writing, software | ||
| // distributed under the License is distributed on an "AS IS" BASIS, | ||
| // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
| // See the License for the specific language governing permissions and | ||
| // limitations under the License. | ||
| #pragma once | ||
|
|
||
| #include <memory> | ||
| #include <string> | ||
|
|
||
| #include "ray/common/cgroup2/cgroup_manager_interface.h" | ||
|
|
||
| namespace ray { | ||
|
|
||
| // TODO(54703): Refactor the configs into a struct called CgroupManagerConfig | ||
| // and delegate input validation and error messages to it. | ||
| class CgroupManagerFactory { | ||
| public: | ||
| /** | ||
| Creates the CgroupManagerInterface implementation used for resource isolation. | ||
|
|
||
| This feature is only enabled in Linux. If using Linux, validates inputs, creates the | ||
| ray's cgroup hierarchy, enables constraints, and moves all system processes into the | ||
israbbani marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||
| system cgroup. | ||
|
|
||
| On non-Linux platforms, this will return a noop implementation. | ||
|
|
||
| @param enable_resource_isolation if true, will create process isolation using | ||
| cgroups (@see CgroupManager::Create for more information). | ||
| @param cgroup_path the cgroup that the process will take ownership of. | ||
| @param node_id used to create a unique cgroup subtree per running ray node. | ||
| @param system_reserved_cpu_weight a value between [1,10000] to assign to the cgroup | ||
| for system processes. The cgroup for application processes gets 10000 - | ||
| system_reserved_cpu_weight. | ||
| @param system_reserved_memory_bytes used to reserve memory for the system cgroup. | ||
| @param system_pids a comma-separated list of pids of ray system processes to move into | ||
| the system cgroup. | ||
| @return a noop implementation if enable_resource_isolation is false; otherwise the | ||
| created cgroup manager, returned after the system processes have been moved into | ||
| the system cgroup. | ||
|
|
||
| For more information about the parameters, see @ref CgroupManager::Create. | ||
|
|
||
| @note any of the following is a fatal error and will cause a RAY_CHECK to fail | ||
| 1. enable_resource_isolation is true and either | ||
| a. cgroup_path is empty | ||
| b. system_reserved_cpu_weight or system_reserved_memory_bytes are -1. | ||
| 2. The CgroupManager's precondition checks fail | ||
| a. cgroupv2 is not mounted correctly in unified mode (see @ref | ||
| CgroupDriverInterface::CheckCgroupv2Enabled). | ||
| b. the current process does not have adequate permissions (see @ref | ||
| CgroupManager::Create). | ||
| c. supported cgroup controllers are not available (see @ref | ||
| CgroupManager::supported_controllers_). | ||
| 3. if a process in system_pids cannot be moved into the system cgroup. | ||
| */ | ||
| static std::unique_ptr<CgroupManagerInterface> Create( | ||
| bool enable_resource_isolation, | ||
| std::string cgroup_path, | ||
| const std::string &node_id, | ||
| const int64_t system_reserved_cpu_weight, | ||
| const int64_t system_reserved_memory_bytes, | ||
| const std::string &system_pids); | ||
| }; | ||
| } // namespace ray | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,83 @@ | ||
| // Copyright 2025 The Ray Authors. | ||
| // | ||
| // Licensed under the Apache License, Version 2.0 (the "License"); | ||
| // you may not use this file except in compliance with the License. | ||
| // You may obtain a copy of the License at | ||
| // | ||
| // http://www.apache.org/licenses/LICENSE-2.0 | ||
| // | ||
| // Unless required by applicable law or agreed to in writing, software | ||
| // distributed under the License is distributed on an "AS IS" BASIS, | ||
| // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
| // See the License for the specific language governing permissions and | ||
| // limitations under the License. | ||
| #include <sys/types.h> | ||
| #include <unistd.h> | ||
|
|
||
| #include <memory> | ||
| #include <string> | ||
| #include <utility> | ||
| #include <vector> | ||
|
|
||
| #include "absl/strings/str_format.h" | ||
| #include "absl/strings/str_split.h" | ||
| #include "ray/common/cgroup2/cgroup_driver_interface.h" | ||
| #include "ray/common/cgroup2/cgroup_manager.h" | ||
| #include "ray/common/cgroup2/cgroup_manager_factory.h" | ||
| #include "ray/common/cgroup2/cgroup_manager_interface.h" | ||
| #include "ray/common/cgroup2/noop_cgroup_manager.h" | ||
| #include "ray/common/cgroup2/sysfs_cgroup_driver.h" | ||
|
|
||
| namespace ray { | ||
|
|
||
| // Creates the cgroup manager for this node. | ||
| // | ||
| // When enable_resource_isolation is false, a NoopCgroupManager is returned and no | ||
| // other argument is inspected. Otherwise the configuration is sanity-checked, a | ||
| // CgroupManager backed by a SysFsCgroupDriver is created, and every pid listed in | ||
| // system_pids (comma-separated) plus the current process is moved into the system | ||
| // cgroup. | ||
| // | ||
| // Every failure on the enabled path is fatal: it trips a RAY_CHECK instead of being | ||
| // reported to the caller. | ||
| std::unique_ptr<CgroupManagerInterface> CgroupManagerFactory::Create( | ||
| bool enable_resource_isolation, | ||
| std::string cgroup_path, | ||
| const std::string &node_id, | ||
| const int64_t system_reserved_cpu_weight, | ||
| const int64_t system_reserved_memory_bytes, | ||
| const std::string &system_pids) { | ||
| if (!enable_resource_isolation) { | ||
| return std::make_unique<NoopCgroupManager>(); | ||
| } | ||
|
|
||
| RAY_CHECK(!cgroup_path.empty()) | ||
| << "Failed to start CgroupManager. If enable_resource_isolation is set to true, " | ||
| "cgroup_path cannot be empty."; | ||
|
Comment on lines
+44
to
+46
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. in the general case, should we structure the factories so that they
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Capturing our discussion for posterity. If there's a panic or a fatal error, I would keep the actual RAY_CHECK as close to the error as possible. You're not worried about clean up and you can dump as much context as possible. I would return a Status is if there's a chance that the caller can either recover provide more useful information. The antidote to this sadness is to have as few FATAL errors as possible and most failures should be recoverable.
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Invariant checking on component startups or detecting misconfigurations is a valid use of RAY_CHECK. |
||
|
|
||
| // Only the value -1 is rejected by the next two checks; the ranges quoted in the | ||
| // messages are not otherwise verified in this function. | ||
| RAY_CHECK_NE(system_reserved_cpu_weight, -1) | ||
| << "Failed to start CgroupManager. If enable_resource_isolation is set to true, " | ||
| "system_reserved_cpu_weight must be set to a value between [1,10000]"; | ||
|
|
||
| RAY_CHECK_NE(system_reserved_memory_bytes, -1) | ||
| << "Failed to start CgroupManager. If enable_resource_isolation is set to true, " | ||
| "system_reserved_memory_bytes must be set to a value > 0"; | ||
|
|
||
| // cgroup_path is a by-value parameter that is not used again below, so it is handed | ||
| // off to CgroupManager::Create instead of being copied. | ||
| StatusOr<std::unique_ptr<CgroupManagerInterface>> cgroup_manager_s = | ||
| CgroupManager::Create(std::move(cgroup_path), | ||
| node_id, | ||
| system_reserved_cpu_weight, | ||
| system_reserved_memory_bytes, | ||
| std::make_unique<SysFsCgroupDriver>()); | ||
|
|
||
| RAY_CHECK(cgroup_manager_s.ok()) << absl::StrFormat( | ||
| "Failed to start CgroupManager due to %s.", cgroup_manager_s.ToString()); | ||
|
|
||
| std::unique_ptr<CgroupManagerInterface> cgroup_manager = | ||
| std::move(cgroup_manager_s.value()); | ||
|
|
||
| // absl::StrSplit on an empty string yields one empty element, so an empty | ||
| // system_pids is skipped rather than split. | ||
| std::vector<std::string> system_pids_to_move; | ||
| if (!system_pids.empty()) { | ||
| system_pids_to_move = absl::StrSplit(system_pids, ","); | ||
| } | ||
|
|
||
| // The current process is always moved into the system cgroup as well. | ||
| system_pids_to_move.emplace_back(std::to_string(getpid())); | ||
|
|
||
| for (const auto &pid : system_pids_to_move) { | ||
| RAY_CHECK_OK(cgroup_manager->AddProcessToSystemCgroup(pid)) | ||
| << absl::StrFormat("Failed to move process with pid %s into system cgroup.", pid); | ||
| } | ||
|
|
||
| return cgroup_manager; | ||
| } | ||
| } // namespace ray | ||
This file was deleted.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
yeah!