Skip to content

Commit 45f4633

Browse files
kfstormRohan138
authored andcommitted
[Core] No RAY_LOG in the constructor of DelayManager (ray-project#26958)
We encountered SIGSEGV when running Python test `python/ray/tests/test_failure_2.py::test_list_named_actors_timeout`. The stack is: ``` #0 0x00007fffed30f393 in std::basic_string<char, std::char_traits<char>, std::allocator<char> >::basic_string(std::string const&) () from /lib64/libstdc++.so.6 ray-project#1 0x00007fffee707649 in ray::RayLog::GetLoggerName() () from /home/admin/dev/Arc/merge/ray/python/ray/_raylet.so ray-project#2 0x00007fffee70aa90 in ray::SpdLogMessage::Flush() () from /home/admin/dev/Arc/merge/ray/python/ray/_raylet.so ray-project#3 0x00007fffee70af28 in ray::RayLog::~RayLog() () from /home/admin/dev/Arc/merge/ray/python/ray/_raylet.so ray-project#4 0x00007fffee2b570d in ray::asio::testing::(anonymous namespace)::DelayManager::Init() [clone .constprop.0] () from /home/admin/dev/Arc/merge/ray/python/ray/_raylet.so ray-project#5 0x00007fffedd0d95a in _GLOBAL__sub_I_asio_chaos.cc () from /home/admin/dev/Arc/merge/ray/python/ray/_raylet.so ray-project#6 0x00007ffff7fe282a in call_init.part () from /lib64/ld-linux-x86-64.so.2 ray-project#7 0x00007ffff7fe2931 in _dl_init () from /lib64/ld-linux-x86-64.so.2 ray-project#8 0x00007ffff7fe674c in dl_open_worker () from /lib64/ld-linux-x86-64.so.2 ray-project#9 0x00007ffff7b82e79 in _dl_catch_exception () from /lib64/libc.so.6 ray-project#10 0x00007ffff7fe5ffe in _dl_open () from /lib64/ld-linux-x86-64.so.2 ray-project#11 0x00007ffff7d5f39c in dlopen_doit () from /lib64/libdl.so.2 ray-project#12 0x00007ffff7b82e79 in _dl_catch_exception () from /lib64/libc.so.6 ray-project#13 0x00007ffff7b82f13 in _dl_catch_error () from /lib64/libc.so.6 ray-project#14 0x00007ffff7d5fb09 in _dlerror_run () from /lib64/libdl.so.2 ray-project#15 0x00007ffff7d5f42a in dlopen@@GLIBC_2.2.5 () from /lib64/libdl.so.2 ray-project#16 0x00007fffef04d330 in py_dl_open (self=<optimized out>, args=<optimized out>) at /tmp/python-build.20220507135524.257789/Python-3.7.11/Modules/_ctypes/callproc.c:1369 ``` The root cause is that when loading `_raylet.so`, `static DelayManager _delay_manager` is initialized and `RAY_LOG(ERROR) << "RAY_testing_asio_delay_us is set to " << delay_env;` is executed. However, the static variables declared in `logging.cc` are not initialized yet (in this case, `std::string RayLog::logger_name_ = "ray_log_sink"`). It's better not to rely on the initialization order of static variables in different compilation units because it's not guaranteed. I propose to change all `RAY_LOG`s to `std::cerr` in `DelayManager::Init()`. The crash happens in Ant's internal codebase. Not sure why this test case passes in the community version though. BTW, I've tried different approaches: 1. Using a static local variable in `get_delay_us` and remove the global variable. This doesn't work because `init()` needs to access the variable as well. 2. Defining the global variable as type `std::unique_ptr<DelayManager>` and initialize it in `get_delay_us`. This works but it requires a lock to be thread-safe. Signed-off-by: Rohan138 <[email protected]>
1 parent 69e77b3 commit 45f4633

File tree

2 files changed

+18
-10
lines changed

2 files changed

+18
-10
lines changed

src/ray/common/asio/asio_chaos.cc

Lines changed: 13 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,6 @@
2121
#include "absl/strings/numbers.h"
2222
#include "absl/strings/str_split.h"
2323
#include "ray/common/ray_config.h"
24-
#include "ray/util/logging.h"
2524

2625
namespace ray {
2726
namespace asio {
@@ -65,7 +64,7 @@ class DelayManager {
6564
if (delay_env.empty()) {
6665
return;
6766
}
68-
RAY_LOG(ERROR) << "RAY_testing_asio_delay_us is set to " << delay_env;
67+
std::cerr << "RAY_testing_asio_delay_us is set to " << delay_env << std::endl;
6968
std::vector<std::string_view> items = absl::StrSplit(delay_env, ",");
7069
for (const auto &item : items) {
7170
ParseItem(item);
@@ -77,8 +76,9 @@ class DelayManager {
7776
void ParseItem(std::string_view val) {
7877
std::vector<std::string_view> item_val = absl::StrSplit(val, "=");
7978
if (item_val.size() != 2) {
80-
RAY_LOG(FATAL) << "Error in syntax: " << val
81-
<< ", expected method=min_us:max:ms. Skip this entry.";
79+
std::cerr << "Error in syntax: " << val
80+
<< ", expected method=min_us:max:ms. Skip this entry." << std::endl;
81+
_Exit(1);
8282
}
8383
auto delay_us = ParseVal(item_val[1]);
8484
if (item_val[0] == "*") {
@@ -91,18 +91,21 @@ class DelayManager {
9191
std::pair<int64_t, int64_t> ParseVal(std::string_view val) {
9292
std::vector<std::string_view> delay_str_us = absl::StrSplit(val, ":");
9393
if (delay_str_us.size() != 2) {
94-
RAY_LOG(FATAL) << "Error in syntax: " << val
95-
<< ", expected method=min_us:max:ms. Skip this entry";
94+
std::cerr << "Error in syntax: " << val
95+
<< ", expected method=min_us:max:ms. Skip this entry" << std::endl;
96+
_Exit(1);
9697
}
9798
std::pair<int64_t, int64_t> delay_us;
9899
if (!absl::SimpleAtoi(delay_str_us[0], &delay_us.first) ||
99100
!absl::SimpleAtoi(delay_str_us[1], &delay_us.second)) {
100-
RAY_LOG(FATAL) << "Error in syntax: " << val
101-
<< ", expected method=min_us:max:ms. Skip this entry";
101+
std::cerr << "Error in syntax: " << val
102+
<< ", expected method=min_us:max:ms. Skip this entry" << std::endl;
103+
_Exit(1);
102104
}
103105
if (delay_us.first > delay_us.second) {
104-
RAY_LOG(FATAL) << delay_us.first << " is bigger than " << delay_us.second
105-
<< ". Skip this entry.";
106+
std::cerr << delay_us.first << " is bigger than " << delay_us.second
107+
<< ". Skip this entry." << std::endl;
108+
_Exit(1);
106109
}
107110
return delay_us;
108111
}

src/ray/common/test/asio_defer_test.cc

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -43,3 +43,8 @@ TEST(AsioChaosTest, WithGlobal) {
4343
ASSERT_TRUE(EnsureBelow("method2", 20, 30));
4444
ASSERT_TRUE(EnsureBelow("others", 100, 200));
4545
}
46+
47+
int main(int argc, char **argv) {
48+
::testing::InitGoogleTest(&argc, argv);
49+
return RUN_ALL_TESTS();
50+
}

0 commit comments

Comments
 (0)