-
Notifications
You must be signed in to change notification settings - Fork 7k
[core] (cgroups 11/n) Raylet will move system processes into cgroup on startup #56522
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 181 commits
05c4dbc
645f9a0
2bb2c5b
4793094
85d0ebf
3a5a020
68b0c93
f52354b
148d04d
d3f8b79
d76ff15
2798ea5
3fda505
9e1e931
f066f34
e6b4926
f4e0cb2
544ba83
669ba99
2e341d6
9e46ce6
7c745c5
d7eb863
ff64534
da4b475
37e205f
7b83932
b911d25
63506dc
e6f1ae9
ead9de1
d0bcf4d
08c36d8
758955a
e59ac62
8866592
5364a1d
c77e1f8
fe54541
67b21d5
6e6bc32
c399d45
2cb4f6e
dd25a97
4a95598
cc51788
d31eb1a
d43a5d3
a458406
01023b9
bb5d866
f4a8553
17d1008
3423eab
5357ea3
1ecfdda
17c07da
e044fcd
b59dbc4
13eee38
f698183
3b37051
946ec90
ca63baa
0fe9113
398ef88
ca83426
f7f04db
dfd9b07
1884da5
e0bbac8
2457558
36101f4
a145a81
fc85704
b5f6c5e
03c731e
6fc9652
4aeabf4
44a5844
4de334c
70ed06d
89f49e5
dd0bf98
fd39d7e
4e8e20c
9304014
eee8982
20bf0dd
699f3ba
58e0c1a
e2e957d
9e7a2ef
fd9ef0d
f86a010
29a7ae4
1c17e3f
a843dd4
43a180e
4c3469b
b577600
7ca430b
7583fae
a3164a4
b1d8f39
299040a
b80da98
4a581d7
df8f925
0059039
9342ed5
5b89821
4af9d3f
e386cb5
74612b0
dcf558b
32a96f2
72a2796
1f67fd1
69125b9
247bc40
9b759cd
04cf359
1aa14ea
7b0518b
1e9b214
f847c13
e0bc7ae
98d6dcd
06a9e6d
cc3af83
161dd95
0db3fa5
a64f258
323d0c7
e5f77fe
ab86642
92bb7ce
d819ace
9533518
7e29e73
7581d1c
7dba5bf
67f025f
2d64808
2c94f96
06f3461
1329a4e
64d7421
e54dcb4
7591e52
9d828f4
7cadb90
512c278
67e45f1
1b52efa
c728e71
be297f4
7ac21fc
5ffa1f6
9831d88
134666b
1f5f75f
b61f2cf
4e79027
74e93b7
8638043
fe6cb92
c72414d
bdd2a12
06b466f
547210c
ac295a0
79fc924
fda4446
5142b7a
0144b38
d51242b
30326a8
d40a91a
7ef0cb6
93d0473
7738c28
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -21,6 +21,7 @@ | |
| #include <utility> | ||
| #include <vector> | ||
|
|
||
| #include "absl/strings/str_split.h" | ||
| #include "gflags/gflags.h" | ||
| #include "nlohmann/json.hpp" | ||
| #include "ray/common/asio/instrumented_io_context.h" | ||
|
|
@@ -143,6 +144,10 @@ DEFINE_int64(system_reserved_memory_bytes, | |
| "be applied as a memory.min constraint to the system cgroup. If " | ||
| "enable-resource-isolation is true, then this cannot be -1"); | ||
|
|
||
| DEFINE_string(system_pids, | ||
| "", | ||
| "A comma-separated list of pids to move into the system cgroup."); | ||
|
|
||
| absl::flat_hash_map<std::string, std::string> parse_node_labels( | ||
| const std::string &labels_json_str) { | ||
| absl::flat_hash_map<std::string, std::string> labels; | ||
|
|
@@ -253,6 +258,7 @@ int main(int argc, char *argv[]) { | |
| const std::string cgroup_path = FLAGS_cgroup_path; | ||
| const int64_t system_reserved_cpu_weight = FLAGS_system_reserved_cpu_weight; | ||
| const int64_t system_reserved_memory_bytes = FLAGS_system_reserved_memory_bytes; | ||
| const std::string system_pids = FLAGS_system_pids; | ||
|
|
||
| RAY_CHECK_NE(FLAGS_cluster_id, "") << "Expected cluster ID."; | ||
| ray::ClusterID cluster_id = ray::ClusterID::FromHex(FLAGS_cluster_id); | ||
|
|
@@ -271,10 +277,11 @@ int main(int argc, char *argv[]) { | |
| "system_reserved_cpu_weight must be set to a value between [1,10000]"; | ||
| RAY_CHECK_NE(system_reserved_memory_bytes, -1) | ||
| << "Failed to start up raylet. If enable_resource_isolation is set to true, " | ||
| "system_reserved_memory_byres must be set to a value > 0"; | ||
| "system_reserved_memory_bytes must be set to a value > 0"; | ||
|
|
||
| std::unique_ptr<ray::SysFsCgroupDriver> cgroup_driver = | ||
| std::make_unique<ray::SysFsCgroupDriver>(); | ||
|
|
||
| ray::StatusOr<std::unique_ptr<ray::CgroupManager>> cgroup_manager_s = | ||
| ray::CgroupManager::Create(std::move(cgroup_path), | ||
| node_id, | ||
|
|
@@ -294,6 +301,20 @@ int main(int argc, char *argv[]) { | |
| << "Resource isolation with cgroups is only supported in linux. Please set " | ||
| "enable_resource_isolation to false. This is likely a misconfiguration."; | ||
| #endif | ||
|
|
||
| // Move system processes into the system cgroup. | ||
| std::vector<std::string> system_pids_to_move = absl::StrSplit(system_pids, ","); | ||
| system_pids_to_move.emplace_back(std::to_string(ray::GetPID())); | ||
| for (const auto &pid : system_pids_to_move) { | ||
| ray::Status s = cgroup_manager->AddProcessToSystemCgroup(pid); | ||
| // TODO(#54703): This could be upgraded to a RAY_CHECK. | ||
|
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I'm not sure if this should be a RAY_CHECK or not yet. CgroupManager::AddProcessToSystemCgroup already fails a RAY_CHECK if the cgroup doesn't exist or permissions are wrong. The only other errors that are possible here are if the pid is invalid or the process doesn't exist. I think there's a very strong case to be made that if a system process does not exist when you try to move it into the cgroup, we should fail fast.
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Considering that pids are passed to the raylet by argument I think the case for ray_check is strong. Let's do it.
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. +1, |
||
| if (!s.ok()) { | ||
| RAY_LOG(WARNING) << absl::StrFormat( | ||
| "Failed to move process %s into system cgroup with error %s", | ||
| pid, | ||
| s.ToString()); | ||
| } | ||
| } | ||
cursor[bot] marked this conversation as resolved.
Show resolved
Hide resolved
|
||
| } | ||
|
|
||
| // Configuration for the node manager. | ||
|
|
||
Uh oh!
There was an error while loading. Please reload this page.