Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 5 additions & 5 deletions .buildkite/core.rayci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,7 @@ steps:
- bazel run //ci/ray_ci:test_in_docker -- //python/ray/tests/... //python/ray/_common/tests/... //python/ray/dag/... //python/ray/autoscaler/v2/... core
--install-mask all-ray-libraries
--workers "$${BUILDKITE_PARALLEL_JOB_COUNT}" --worker-id "$${BUILDKITE_PARALLEL_JOB}" --parallelism-per-worker 3
--except-tags custom_setup
--except-tags custom_setup,cgroup
--install-mask all-ray-libraries

- label: ":ray: core: python {{matrix.python}} tests ({{matrix.worker_id}})"
Expand All @@ -76,7 +76,7 @@ steps:
--install-mask all-ray-libraries
--workers 4 --worker-id "{{matrix.worker_id}}" --parallelism-per-worker 3
--python-version {{matrix.python}}
--except-tags custom_setup
--except-tags custom_setup,cgroup
depends_on: corebuild-multipy
matrix:
setup:
Expand Down Expand Up @@ -105,7 +105,7 @@ steps:
--install-mask all-ray-libraries
--workers "$${BUILDKITE_PARALLEL_JOB_COUNT}" --worker-id "$${BUILDKITE_PARALLEL_JOB}" --parallelism-per-worker 3
--test-env=TEST_EXTERNAL_REDIS=1
--except-tags custom_setup
--except-tags custom_setup,cgroup

- label: ":ray: core: memory pressure tests"
tags:
Expand Down Expand Up @@ -261,7 +261,7 @@ steps:
--test-env=RAY_MINIMAL=1
--test-env=EXPECTED_PYTHON_VERSION={{matrix}}
--only-tags minimal
--except-tags basic_test,manual
--except-tags basic_test,manual,cgroup
- bazel run //ci/ray_ci:test_in_docker -- //python/ray/tests/... //python/ray/dashboard/... core
--parallelism-per-worker 3
--python-version {{matrix}}
Expand Down Expand Up @@ -366,7 +366,7 @@ steps:
- bazel run //ci/ray_ci:test_in_docker -- //python/ray/... //doc/... core
--install-mask all-ray-libraries
--run-flaky-tests
--except-tags multi_gpu
--except-tags multi_gpu,cgroup
depends_on:
- corebuild

Expand Down
3 changes: 1 addition & 2 deletions python/ray/tests/resource_isolation/BUILD.bazel
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,6 @@ py_test(
tags = [
"cgroup",
"exclusive",
"manual",
"team:core",
],
target_compatible_with = [
Expand All @@ -33,8 +32,8 @@ py_test(
size = "medium",
srcs = ["test_resource_isolation_config.py"],
tags = [
"cgroup",
"exclusive",
"manual",
"team:core",
],
deps = [
Expand Down
1 change: 0 additions & 1 deletion src/ray/common/cgroup2/cgroup_driver_interface.h
Original file line number Diff line number Diff line change
Expand Up @@ -172,7 +172,6 @@ class CgroupDriverInterface {
supported or the value not correct.
*/
virtual Status AddConstraint(const std::string &cgroup,
const std::string &controller,
const std::string &constraint,
const std::string &value) = 0;
/**
Expand Down
14 changes: 4 additions & 10 deletions src/ray/common/cgroup2/cgroup_manager.cc
Original file line number Diff line number Diff line change
Expand Up @@ -173,10 +173,8 @@ void CgroupManager::RegisterRemoveConstraint(const std::string &cgroup,
cleanup_operations_.emplace_back(
[this, constrained_cgroup = cgroup, constraint_to_remove = constraint]() {
std::string default_value = std::to_string(constraint_to_remove.default_value_);
Status s = this->cgroup_driver_->AddConstraint(constrained_cgroup,
constraint_to_remove.controller_,
constraint_to_remove.name_,
default_value);
Status s = this->cgroup_driver_->AddConstraint(
constrained_cgroup, constraint_to_remove.name_, default_value);
if (!s.ok()) {
RAY_LOG(WARNING) << absl::StrFormat(
"Failed to set constraint %s=%s to default value for cgroup %s with error "
Expand Down Expand Up @@ -292,22 +290,18 @@ Status CgroupManager::Initialize(int64_t system_reserved_cpu_weight,

RAY_RETURN_NOT_OK(
cgroup_driver_->AddConstraint(system_cgroup_,
cpu_weight_constraint_.controller_,
cpu_weight_constraint_.name_,
std::to_string(system_reserved_cpu_weight)));
RegisterRemoveConstraint(system_cgroup_, cpu_weight_constraint_);

RAY_RETURN_NOT_OK(
cgroup_driver_->AddConstraint(system_cgroup_,
memory_min_constraint_.controller_,
memory_min_constraint_.name_,
std::to_string(system_reserved_memory_bytes)));
RegisterRemoveConstraint(system_cgroup_, memory_min_constraint_);

RAY_RETURN_NOT_OK(cgroup_driver_->AddConstraint(user_cgroup_,
cpu_weight_constraint_.controller_,
cpu_weight_constraint_.name_,
std::to_string(user_cpu_weight)));
RAY_RETURN_NOT_OK(cgroup_driver_->AddConstraint(
user_cgroup_, cpu_weight_constraint_.name_, std::to_string(user_cpu_weight)));
RegisterRemoveConstraint(user_cgroup_, cpu_weight_constraint_);

return Status::OK();
Expand Down
1 change: 0 additions & 1 deletion src/ray/common/cgroup2/fake_cgroup_driver.h
Original file line number Diff line number Diff line change
Expand Up @@ -194,7 +194,6 @@ class FakeCgroupDriver : public CgroupDriverInterface {
}

Status AddConstraint(const std::string &cgroup,
const std::string &controller,
const std::string &constraint,
const std::string &value) override {
if (!add_constraint_s_.ok()) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -574,7 +574,7 @@ TEST_F(SysFsCgroupDriverIntegrationTest, AddResourceConstraintFailsIfCgroupDoesn
std::string non_existent_path =
test_cgroup_path_ + std::filesystem::path::preferred_separator + "nope";
SysFsCgroupDriver driver;
Status s = driver.AddConstraint(non_existent_path, "memory", "memory.min", "1");
Status s = driver.AddConstraint(non_existent_path, "memory.min", "1");
ASSERT_TRUE(s.IsNotFound()) << s.ToString();
}

Expand All @@ -584,7 +584,7 @@ TEST_F(SysFsCgroupDriverIntegrationTest,
ASSERT_TRUE(cgroup_or_status.ok()) << cgroup_or_status.ToString();
auto cgroup = std::move(cgroup_or_status.value());
SysFsCgroupDriver driver;
Status s = driver.AddConstraint(cgroup->GetPath(), "memory", "memory.min", "1");
Status s = driver.AddConstraint(cgroup->GetPath(), "memory.min", "1");
ASSERT_TRUE(s.IsPermissionDenied()) << s.ToString();
}

Expand All @@ -595,21 +595,10 @@ TEST_F(SysFsCgroupDriverIntegrationTest,
ASSERT_TRUE(cgroup_or_status.ok()) << cgroup_or_status.ToString();
auto cgroup = std::move(cgroup_or_status.value());
SysFsCgroupDriver driver;
Status s = driver.AddConstraint(cgroup->GetPath(), "memory", "memory.min", "1");
Status s = driver.AddConstraint(cgroup->GetPath(), "memory.min", "1");
ASSERT_TRUE(s.IsPermissionDenied()) << s.ToString();
}

TEST_F(SysFsCgroupDriverIntegrationTest,
AddResourceConstraintFailsIfControllerNotEnabled) {
auto cgroup_or_status = TempCgroupDirectory::Create(test_cgroup_path_, S_IRWXU);
ASSERT_TRUE(cgroup_or_status.ok()) << cgroup_or_status.ToString();
auto cgroup = std::move(cgroup_or_status.value());
SysFsCgroupDriver driver;
// Memory controller is not enabled.
Status s = driver.AddConstraint(cgroup->GetPath(), "memory", "memory.min", "1");
ASSERT_TRUE(s.IsInvalidArgument()) << s.ToString();
}

TEST_F(SysFsCgroupDriverIntegrationTest, AddResourceConstraintSucceeds) {
auto cgroup_or_status = TempCgroupDirectory::Create(test_cgroup_path_, S_IRWXU);
ASSERT_TRUE(cgroup_or_status.ok()) << cgroup_or_status.ToString();
Expand All @@ -619,7 +608,7 @@ TEST_F(SysFsCgroupDriverIntegrationTest, AddResourceConstraintSucceeds) {
Status enable_controller_s = driver.EnableController(cgroup->GetPath(), "cpu");
ASSERT_TRUE(enable_controller_s.ok()) << enable_controller_s.ToString();
// cpu.weight must be between [1,10000]
Status s = driver.AddConstraint(cgroup->GetPath(), "cpu", "cpu.weight", "500");
Status s = driver.AddConstraint(cgroup->GetPath(), "cpu.weight", "500");
ASSERT_TRUE(s.ok()) << s.ToString();
}

Expand Down
16 changes: 2 additions & 14 deletions src/ray/common/cgroup2/sysfs_cgroup_driver.cc
Original file line number Diff line number Diff line change
Expand Up @@ -331,24 +331,12 @@ Status SysFsCgroupDriver::DisableController(const std::string &cgroup_path,
return Status::OK();
}

// What's the right thing here? If the controller is specified?
// The correct API would be specify where the controller should be enabled.
Status SysFsCgroupDriver::AddConstraint(const std::string &cgroup_path,
const std::string &controller,
const std::string &constraint,
const std::string &constraint_value) {
RAY_RETURN_NOT_OK(CheckCgroup(cgroup_path));
// Check if the required controller for the constraint is enabled.
StatusOr<std::unordered_set<std::string>> available_controllers_s =
GetEnabledControllers(cgroup_path);
RAY_RETURN_NOT_OK(available_controllers_s.status());
const auto &controllers = available_controllers_s.value();
if (controllers.find(controller) == controllers.end()) {
return Status::InvalidArgument(absl::StrFormat(
"Failed to apply %s to cgroup %s. To use %s, enable the %s controller.",
constraint,
cgroup_path,
constraint,
controller));
}
Comment on lines -339 to -351
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

When I wrote this, I assumed that the controller needs to be enabled on the cgroup to apply a constraint. This isn't always the case (e.g. the cpu controller needs to be enabled on a parent cgroup) and the constraint is applied to the child cgroup.

I could keep the check and complicate the API (e.g. pass in the path of the cgroup on which to check that the controller is enabled), but I think it's cleaner for the caller to check first and then enable the constraint. This is what CgroupManager does.


// Try to apply the constraint and propagate the appropriate failure error.
std::string file_path =
Expand Down
6 changes: 2 additions & 4 deletions src/ray/common/cgroup2/sysfs_cgroup_driver.h
Original file line number Diff line number Diff line change
Expand Up @@ -236,19 +236,17 @@ class SysFsCgroupDriver : public CgroupDriverInterface {
Adds a constraint to the respective cgroup file.

@param cgroup_path absolute path of the cgroup.
@param controller the name of the controller
@param constraint the name of the cgroup file to add the constraint to e.g. cpu.weight
@param constraint_value

@return Status::OK if no errors are encounted.
@return Status::NotFound if the cgroup does not exist.
@return Status::PermissionDenied if current user doesn't have read, write, and execute
permissions.
@return Status::InvalidArgument if the cgroup is not using cgroupv2, controller is not
enabled, or cannot write to the constraint file.
@return Status::InvalidArgument if the cgroup is not using cgroupv2, or cannot write
to the constraint file.
*/
Status AddConstraint(const std::string &cgroup,
const std::string &controller,
const std::string &constraint,
const std::string &constraint_value) override;

Expand Down
2 changes: 1 addition & 1 deletion src/ray/common/cgroup2/tests/sysfs_cgroup_driver_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -142,7 +142,7 @@ TEST(SysFsCgroupDriver, AddConstraintFailsIfNotCgroupv2Path) {
ASSERT_TRUE(temp_dir_or_status.ok()) << temp_dir_or_status.ToString();
std::unique_ptr<TempDirectory> temp_dir = std::move(temp_dir_or_status.value());
SysFsCgroupDriver driver;
Status s = driver.AddConstraint(temp_dir->GetPath(), "memory", "memory.min", "1");
Status s = driver.AddConstraint(temp_dir->GetPath(), "memory.min", "1");
ASSERT_TRUE(s.IsInvalidArgument()) << s.ToString();
}

Expand Down