diff --git a/projects/hipblaslt/clients/tests/src/CMakeLists.txt b/projects/hipblaslt/clients/tests/src/CMakeLists.txt index c4d04cb099e5..1bab31d33009 100755 --- a/projects/hipblaslt/clients/tests/src/CMakeLists.txt +++ b/projects/hipblaslt/clients/tests/src/CMakeLists.txt @@ -4,6 +4,7 @@ target_sources(hipblaslt-test PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/hipblaslt_gtest_main.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/hipblaslt_parallel_test.cpp ${CMAKE_CURRENT_SOURCE_DIR}/hipblaslt_test.cpp ${CMAKE_CURRENT_SOURCE_DIR}/matmul_gtest.cpp ${CMAKE_CURRENT_SOURCE_DIR}/auxiliary_gtest.cpp diff --git a/projects/hipblaslt/clients/tests/src/hipblaslt_gtest_main.cpp b/projects/hipblaslt/clients/tests/src/hipblaslt_gtest_main.cpp index 716db24c1f3b..80a21cabddf0 100644 --- a/projects/hipblaslt/clients/tests/src/hipblaslt_gtest_main.cpp +++ b/projects/hipblaslt/clients/tests/src/hipblaslt_gtest_main.cpp @@ -29,7 +29,12 @@ #include "hipblaslt_test.hpp" #include "test_cleanup.hpp" #include "utility.hpp" +#ifndef _WIN32 +#include "hipblaslt_parallel_test.hpp" +#endif +#include #include +#include using namespace testing; @@ -37,6 +42,7 @@ class ConfigurableEventListener : public TestEventListener { TestEventListener* const eventListener; std::atomic_size_t skipped_tests{0}; // Number of skipped tests. + std::atomic_size_t current_test_number{0}; // Current test number (incremental counter). public: bool showTestCases = true; // Show the names of each test case. @@ -86,8 +92,14 @@ class ConfigurableEventListener : public TestEventListener void OnTestStart(const TestInfo& test_info) override { + ++current_test_number; if(showTestNames) + { + // Print test number and delegate to default listener + int total_tests = UnitTest::GetInstance()->test_to_run_count(); + hipblaslt_cout << "[Test #" << current_test_number << "/" << total_tests << "] " << std::flush; eventListener->OnTestStart(test_info); + } } void OnTestPartResult(const TestPartResult& result) override @@ -228,6 +240,118 @@ int main(int argc, char** argv) { std::string args = hipblaslt_capture_args(argc, argv); + // Check for --help to add our custom options + for(int i = 1; i < argc; i++) + { + std::string arg = argv[i]; + if(arg == "--help" || arg == "-h" || arg == "-?" || arg == "/?" || arg == "--help-all") + { + hipblaslt_cout << "\nhipBLASLt Test Options:\n"; + hipblaslt_cout << " --num_gpus=N\n"; + hipblaslt_cout << " --num_gpus N\n"; + hipblaslt_cout << " Run tests in parallel across N GPUs (Unix/Linux only).\n"; + hipblaslt_cout << " Tests are automatically split evenly across the specified\n"; + hipblaslt_cout << " number of GPUs. Each GPU runs its assigned tests independently.\n"; + hipblaslt_cout << " Example: ./hipblaslt-test --num_gpus 8 --gtest_filter=\"*smoke*\"\n"; + hipblaslt_cout << " Note: If --gtest_output=json:file.json is specified, per-GPU\n"; + hipblaslt_cout << " results are saved as file_gpu0.json, file_gpu1.json, etc.\n"; + hipblaslt_cout << "\n"; + break; + } + } + + // Parse and strip --num_gpus argument + int num_gpus = 0; + bool has_num_gpus_flag = false; + std::vector indices_to_remove; // Track all indices to remove + + for(int i = 1; i < argc; i++) + { + std::string arg = argv[i]; + if(arg.find("--num_gpus=") == 0) + { + num_gpus = std::atoi(arg.substr(11).c_str()); + has_num_gpus_flag = true; + indices_to_remove.push_back(i); + break; + } + else if(arg == "--num_gpus" && i + 1 < argc) + { + num_gpus = std::atoi(argv[i + 1]); + has_num_gpus_flag = true; + indices_to_remove.push_back(i); + indices_to_remove.push_back(i + 1); + break; + } + } + + // Parse and strip --gtest_output to extract base filename + std::string gtest_output_base; + for(int i = 1; i < argc; i++) + { + std::string arg = argv[i]; + if(arg.find("--gtest_output=") == 0) + { + size_t colon_pos = arg.find(":"); + if(colon_pos != std::string::npos) + { + // Format: --gtest_output=json:file.json + std::string format = arg.substr(15, colon_pos - 15); + std::string filename = arg.substr(colon_pos + 1); + gtest_output_base = format + ":" + filename; + } + else + { + // Format: --gtest_output=json (uses default filename) + std::string format = arg.substr(15); + if(format == "json") + { + gtest_output_base = "json:test_detail.json"; // GTest default + } + else + { + gtest_output_base = format; // Pass through other formats + } + } + indices_to_remove.push_back(i); + break; + } + } + +#ifdef _WIN32 + // On Windows, parallel GPU execution is not supported + if(has_num_gpus_flag) + { + hipblaslt_cerr << "Error: --num_gpus is not supported on Windows." << std::endl; + return 1; + } +#else + // Check for invalid --num_gpus values + if(has_num_gpus_flag && num_gpus <= 1) + { + hipblaslt_cerr << "Error: --num_gpus requires a value greater than 1." << std::endl; + return 1; + } + + // If parallel GPUs requested, use parallel execution + if(num_gpus > 1) + { + // Remove custom flags from argv before passing to parallel runner + // Sort indices in descending order to remove from back to front + std::sort(indices_to_remove.begin(), indices_to_remove.end(), std::greater()); + for(int idx : indices_to_remove) + { + for(int i = idx; i + 1 < argc; i++) + { + argv[i] = argv[i + 1]; + } + argc--; + } + + return run_tests_parallel_gpus(argc, argv, num_gpus, gtest_output_base); + } +#endif + // Set signal handler hipblaslt_test_sigaction(); @@ -256,7 +380,7 @@ int main(int argc, char** argv) // Failures printed at end for reporting so repeat version info hipblaslt_print_version(); - // end test results with command line + // Print command line at the end hipblaslt_print_args(args); //hipblaslt_shutdown(); diff --git a/projects/hipblaslt/clients/tests/src/hipblaslt_parallel_test.cpp b/projects/hipblaslt/clients/tests/src/hipblaslt_parallel_test.cpp new file mode 100644 index 000000000000..ab3a20105a55 --- /dev/null +++ b/projects/hipblaslt/clients/tests/src/hipblaslt_parallel_test.cpp @@ -0,0 +1,419 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2022-2025 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ + +#include "hipblaslt_parallel_test.hpp" +#include "utility.hpp" + +#ifndef _WIN32 +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +// Run a single GPU shard as a child process +// This function sets up env vars, redirects output, and execs the test binary +[[noreturn]] void run_child_shard(int argc, char** argv, int gpu, int num_gpus, + const std::string& log_file, + const std::string& gtest_output_base) +{ + // Set which GPU to use + std::string gpu_env = std::to_string(gpu); + setenv("HIP_VISIBLE_DEVICES", gpu_env.c_str(), 1); + + // Set optimal OpenMP threads per GPU process + const char* env_threads = getenv("OMP_NUM_THREADS"); + int current_threads = env_threads ? std::atoi(env_threads) : std::thread::hardware_concurrency(); + int threads_per_gpu = std::max(1, current_threads / num_gpus); + setenv("OMP_NUM_THREADS", std::to_string(threads_per_gpu).c_str(), 1); + + // Use Google Test's built-in sharding + setenv("GTEST_TOTAL_SHARDS", std::to_string(num_gpus).c_str(), 1); + setenv("GTEST_SHARD_INDEX", std::to_string(gpu).c_str(), 1); + + // Redirect output to log file to avoid interleaved output + int fd = open(log_file.c_str(), O_WRONLY | O_CREAT | O_TRUNC, 0644); + if(fd >= 0) + { + dup2(fd, STDOUT_FILENO); + dup2(fd, STDERR_FILENO); + close(fd); + } + + // Build argv - argv is already clean (custom flags removed by main) + std::vector new_argv; + std::vector arg_storage; // Store modified arguments + + new_argv.push_back(argv[0]); + + // Pass through all arguments from the clean argv + for(int i = 1; i < argc; i++) + { + new_argv.push_back(argv[i]); + } + + // Add per-GPU gtest_output if it was specified + if(!gtest_output_base.empty()) + { + size_t colon_pos = gtest_output_base.find(":"); + if(colon_pos != std::string::npos) + { + // Format: json:file.json + std::string format = gtest_output_base.substr(0, colon_pos); + std::string filename = gtest_output_base.substr(colon_pos + 1); + + // Insert GPU number before file extension + size_t dot_pos = filename.rfind("."); + std::string new_filename; + if(dot_pos != std::string::npos) + { + new_filename = filename.substr(0, dot_pos) + "_gpu" + + std::to_string(gpu) + filename.substr(dot_pos); + } + else + { + new_filename = filename + "_gpu" + std::to_string(gpu); + } + + std::string new_output_arg = "--gtest_output=" + format + ":" + new_filename; + arg_storage.push_back(new_output_arg); + new_argv.push_back(arg_storage.back().c_str()); + } + else + { + // Format without colon, just pass through with default filename + std::string new_output_arg = "--gtest_output=" + gtest_output_base; + arg_storage.push_back(new_output_arg); + new_argv.push_back(arg_storage.back().c_str()); + } + } + + new_argv.push_back(nullptr); + + // Execute the test binary + execvp(argv[0], const_cast(new_argv.data())); + + // If exec fails + hipblaslt_cerr << "Failed to exec for GPU " << gpu << std::endl; + exit(1); +} + +// Function to run tests in parallel across multiple GPUs +// argc/argv should already have --num_gpus and --gtest_output stripped +int run_tests_parallel_gpus(int argc, char** argv, int num_gpus, + const std::string& gtest_output_base) +{ + hipblaslt_cout << "\n========================================" << std::endl; + hipblaslt_cout << "Parallel GPU Execution Mode" << std::endl; + hipblaslt_cout << "Running tests across " << num_gpus << " GPUs" << std::endl; + hipblaslt_cout << "========================================\n" << std::endl; + + // Check available GPUs + int available_gpus = 0; + if(hipGetDeviceCount(&available_gpus) != hipSuccess || available_gpus < 1) + { + hipblaslt_cerr << "Error: No GPUs detected" << std::endl; + return 1; + } + + if(num_gpus > available_gpus) + { + hipblaslt_cerr << "Warning: Requested " << num_gpus << " GPUs but only " + << available_gpus << " available. Using " << available_gpus << " GPUs." + << std::endl; + num_gpus = available_gpus; + } + + // Display sharding information + hipblaslt_cout << "Tests will be sharded across " << num_gpus << " GPUs" << std::endl; + + // Calculate and display OpenMP thread distribution + const char* env_threads = getenv("OMP_NUM_THREADS"); + int current_threads = env_threads ? std::atoi(env_threads) : std::thread::hardware_concurrency(); + int threads_per_gpu = std::max(1, current_threads / num_gpus); + hipblaslt_cout << "OpenMP threads per GPU: " << threads_per_gpu + << " (total available: " << current_threads << ")" << std::endl; + hipblaslt_cout << std::endl; + + // Split tests across GPUs using Google Test's built-in sharding + std::vector child_pids; + std::vector output_files; + bool fork_failed = false; + + for(int gpu = 0; gpu < num_gpus; gpu++) + { + std::string output_file = "/tmp/hipblaslt_gpu" + std::to_string(gpu) + "_" + + std::to_string(getpid()) + ".log"; + output_files.push_back(output_file); + + hipblaslt_cout << "GPU " << gpu << ": Starting shard " << gpu << std::endl; + + pid_t pid = fork(); + if(pid == 0) + { + // Child process + run_child_shard(argc, argv, gpu, num_gpus, output_file, gtest_output_base); + } + else if(pid > 0) + { + // Parent process + child_pids.push_back(pid); + } + else + { + // Fork failed - need to clean up already-forked children + hipblaslt_cerr << "Error: Failed to fork for GPU " << gpu << std::endl; + fork_failed = true; + break; + } + } + + // If fork failed, terminate and wait for any already-started children + if(fork_failed) + { + hipblaslt_cerr << "Terminating " << child_pids.size() << " already-started child processes..." << std::endl; + + // Send SIGTERM to all children + for(pid_t pid : child_pids) + { + kill(pid, SIGTERM); + } + + // Wait for all children to terminate + for(pid_t pid : child_pids) + { + int status; + waitpid(pid, &status, 0); + } + + return 1; + } + + // Wait for all children and collect results (without printing yet) + hipblaslt_cout << "\nWaiting for all GPUs to complete..." << std::endl; + + int total_exit_code = 0; + int gpus_passed = 0; + int gpus_failed = 0; + int total_tests_ran = 0; + int total_tests_passed = 0; + int total_tests_failed = 0; + std::vector exit_codes(child_pids.size()); + std::vector normal_exit(child_pids.size()); + std::vector gpu_summaries(child_pids.size()); + std::vector gpu_tests_ran(child_pids.size(), 0); + std::vector gpu_tests_passed(child_pids.size(), 0); + std::vector gpu_tests_failed(child_pids.size(), 0); + std::vector gpu_time_ms(child_pids.size(), 0.0); + + // Wait for ALL children first before printing any results + for(size_t i = 0; i < child_pids.size(); i++) + { + int status; + waitpid(child_pids[i], &status, 0); + + if(WIFEXITED(status)) + { + normal_exit[i] = true; + exit_codes[i] = WEXITSTATUS(status); + if(exit_codes[i] == 0) + { + gpus_passed++; + } + else + { + gpus_failed++; + total_exit_code = 1; + } + } + else + { + normal_exit[i] = false; + exit_codes[i] = -1; + gpus_failed++; + total_exit_code = 1; + } + + // Extract summary from log file and parse test counts + std::ifstream log_file(output_files[i]); + if(log_file.is_open()) + { + std::string line; + std::vector summary_lines; + bool in_summary = false; + + while(std::getline(log_file, line)) + { + // Parse total tests ran: "[==========] 87 tests from 1 test suite ran." + if(line.find("[==========]") != std::string::npos && line.find("tests") != std::string::npos && line.find("ran.") != std::string::npos) + { + in_summary = true; + // Extract number of tests + size_t pos = line.find("]"); + if(pos != std::string::npos) + { + std::istringstream iss(line.substr(pos + 1)); + iss >> gpu_tests_ran[i]; + } + } + + // Parse passed tests: "[ PASSED ] 87 tests." + if(line.find("[ PASSED ]") != std::string::npos && line.find("tests") != std::string::npos) + { + size_t pos = line.find("]"); + if(pos != std::string::npos) + { + std::istringstream iss(line.substr(pos + 1)); + iss >> gpu_tests_passed[i]; + } + } + + // Parse failed tests: "[ FAILED ] 2 tests, listed below:" + if(line.find("[ FAILED ]") != std::string::npos && line.find("tests") != std::string::npos) + { + size_t pos = line.find("]"); + if(pos != std::string::npos) + { + std::istringstream iss(line.substr(pos + 1)); + iss >> gpu_tests_failed[i]; + } + } + + // Parse time: "[==========] 87 tests from 1 test suite ran. (1234 ms total)" + if(line.find("[==========]") != std::string::npos && line.find("ms total") != std::string::npos) + { + size_t pos = line.find("("); + if(pos != std::string::npos) + { + std::istringstream iss(line.substr(pos + 1)); + iss >> gpu_time_ms[i]; + } + } + + if(in_summary) + { + summary_lines.push_back(line); + } + } + log_file.close(); + + // Store the summary for this GPU + if(!summary_lines.empty()) + { + for(const auto& summary_line : summary_lines) + { + gpu_summaries[i] += summary_line + "\n"; + } + } + + // Accumulate totals + total_tests_ran += gpu_tests_ran[i]; + total_tests_passed += gpu_tests_passed[i]; + total_tests_failed += gpu_tests_failed[i]; + } + } + + // Now print all results together + hipblaslt_cout << "\n========================================" << std::endl; + hipblaslt_cout << "Parallel GPU Test Summary" << std::endl; + hipblaslt_cout << "========================================\n" << std::endl; + + // Print individual GPU results with their GTest summaries + for(size_t i = 0; i < child_pids.size(); i++) + { + hipblaslt_cout << "GPU " << i << ":" << std::endl; + + if(!gpu_summaries[i].empty()) + { + hipblaslt_cout << gpu_summaries[i]; + } + else if(normal_exit[i]) + { + hipblaslt_cout << " Exit code: " << exit_codes[i] << std::endl; + } + else + { + hipblaslt_cout << " Terminated abnormally" << std::endl; + } + hipblaslt_cout << std::endl; + } + + // Calculate average time + double total_time_ms = 0.0; + int valid_times = 0; + for(size_t i = 0; i < gpu_time_ms.size(); i++) + { + if(gpu_time_ms[i] > 0.0) + { + total_time_ms += gpu_time_ms[i]; + valid_times++; + } + } + double avg_time_ms = (valid_times > 0) ? (total_time_ms / valid_times) : 0.0; + + hipblaslt_cout << "----------------------------------------" << std::endl; + hipblaslt_cout << "OVERALL SUMMARY (across all GPUs):" << std::endl; + hipblaslt_cout << "Total tests run: " << total_tests_ran << std::endl; + hipblaslt_cout << "Total PASSED: " << total_tests_passed << std::endl; + hipblaslt_cout << "Total FAILED: " << total_tests_failed << std::endl; + if(avg_time_ms > 0.0) + { + hipblaslt_cout << "Average time: " << avg_time_ms << " ms" << std::endl; + } + hipblaslt_cout << "\nGPU Summary:" << std::endl; + hipblaslt_cout << "GPUs used: " << num_gpus << std::endl; + hipblaslt_cout << "GPUs passed: " << gpus_passed << std::endl; + hipblaslt_cout << "GPUs failed: " << gpus_failed << std::endl; + + // Clean up log files on success to avoid accumulation in /tmp on CI systems + if(total_exit_code == 0) + { + hipblaslt_cout << "\nAll tests passed - cleaning up log files" << std::endl; + for(size_t i = 0; i < output_files.size(); i++) + { + remove(output_files[i].c_str()); + } + } + else + { + hipblaslt_cout << "\nLog files saved in:" << std::endl; + for(size_t i = 0; i < output_files.size(); i++) + { + hipblaslt_cout << " GPU " << i << ": " << output_files[i] << std::endl; + } + } + hipblaslt_cout << "========================================\n" << std::endl; + + return total_exit_code; +} +#endif // _WIN32 diff --git a/projects/hipblaslt/clients/tests/src/hipblaslt_parallel_test.hpp b/projects/hipblaslt/clients/tests/src/hipblaslt_parallel_test.hpp new file mode 100644 index 000000000000..b96a8fb0f39b --- /dev/null +++ b/projects/hipblaslt/clients/tests/src/hipblaslt_parallel_test.hpp @@ -0,0 +1,36 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2022-2025 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ + +#pragma once + +#ifndef _WIN32 +// Run tests in parallel across multiple GPUs using process-level parallelism +// and Google Test's built-in sharding mechanism +// argc/argv should already have --num_gpus and --gtest_output stripped +// gtest_output_base is the parsed output filename (empty if not specified) +int run_tests_parallel_gpus(int argc, char** argv, int num_gpus, + const std::string& gtest_output_base); +#endif diff --git a/projects/hipblaslt/clients/tests/src/hipblaslt_test.cpp b/projects/hipblaslt/clients/tests/src/hipblaslt_test.cpp index a8538ea957f3..84522a9bbaf2 100644 --- a/projects/hipblaslt/clients/tests/src/hipblaslt_test.cpp +++ b/projects/hipblaslt/clients/tests/src/hipblaslt_test.cpp @@ -145,6 +145,22 @@ static thread_local struct volatile sig_atomic_t signal; } t_handler; +// Test timeout configuration - initialized from environment variables +static const struct +{ + unsigned seconds; // Timeout duration in seconds (from HIPBLASLT_TEST_TIMEOUT) + bool continue_tests; // Continue to next test on timeout vs abort (from HIPBLASLT_TEST_TIMEOUT_CONTINUE) +} test_timeout_config = [] { + constexpr unsigned DEFAULT_TIMEOUT = 600; + const char* timeout_env = getenv("HIPBLASLT_TEST_TIMEOUT"); + unsigned timeout_val; + + return decltype(test_timeout_config){ + (timeout_env && sscanf(timeout_env, "%u", &timeout_val) == 1) ? timeout_val + : DEFAULT_TIMEOUT, + getenv("HIPBLASLT_TEST_TIMEOUT_CONTINUE") != nullptr}; +}(); + // Signal handler (must have external "C" linkage) extern "C" void hipblaslt_test_signal_handler(int sig) { @@ -162,16 +178,22 @@ extern "C" void hipblaslt_test_signal_handler(int sig) } #ifndef _WIN32 - // If this is an alarm timeout, we abort + // If this is an alarm timeout, check if we should abort or continue if(sig == SIGALRM) { - static constexpr char msg[] - = "\nAborting tests due to an alarm timeout.\n\n" - "This could be due to a deadlock caused by mutexes being left locked\n" - "after a previous test's signal was caught and partially recovered from.\n"; - // We must use write() because it's async-signal-safe and other IO might be blocked - write(STDERR_FILENO, msg, sizeof(msg) - 1); - hipblaslt_abort(); + if(!test_timeout_config.continue_tests) + { + // Original behavior: abort entire test run on timeout + static constexpr char msg[] + = "\nAborting tests due to an alarm timeout.\n\n" + "This could be due to a deadlock caused by mutexes being left locked\n" + "after a previous test's signal was caught and partially recovered from.\n" + "Set HIPBLASLT_TEST_TIMEOUT_CONTINUE=1 to continue with remaining tests instead.\n"; + // We must use write() because it's async-signal-safe and other IO might be blocked + write(STDERR_FILENO, msg, sizeof(msg) - 1); + hipblaslt_abort(); + } + // If continue_tests is true, fall through to treat like other signals } #endif @@ -205,14 +227,6 @@ void hipblaslt_test_sigaction() #endif } -static const unsigned test_timeout = [] { - // Number of seconds each test is allowed to take before all testing is killed. - constexpr unsigned TEST_TIMEOUT = 600; - unsigned timeout; - const char* env = getenv("HIPBLASLT_TEST_TIMEOUT"); - return env && sscanf(env, "%u", &timeout) == 1 ? timeout : TEST_TIMEOUT; -}(); - // Lambda wrapper which detects signals and exceptions in an invokable function void catch_signals_and_exceptions_as_failures(std::function test, bool set_alarm) { @@ -223,11 +237,19 @@ void catch_signals_and_exceptions_as_failures(std::function test, bool s // Set up the return point, and handle siglongjmp returning back to here if(sigsetjmp(t_handler.sigjmp_buf_, true)) { + // Provide clear message for timeout vs other signals + if(t_handler.signal == SIGALRM) + { + FAIL() << "Test exceeded timeout of " << test_timeout_config.seconds << " seconds"; + } + else + { #if (__GLIBC__ < 2) || (__GLIBC__ == 2 && __GLIBC_MINOR__ < 32) - FAIL() << "Received " << sys_siglist[t_handler.signal] << " signal"; + FAIL() << "Received " << sys_siglist[t_handler.signal] << " signal"; #else - FAIL() << "Received " << sigdescr_np(t_handler.signal) << " signal"; + FAIL() << "Received " << sigdescr_np(t_handler.signal) << " signal"; #endif + } } #else if(setjmp(t_handler.sigjmp_buf_)) @@ -240,7 +262,7 @@ void catch_signals_and_exceptions_as_failures(std::function test, bool s #ifndef _WIN32 // Alarm to detect deadlocks or hangs if(set_alarm) - alarm(test_timeout); + alarm(test_timeout_config.seconds); #endif // Enable the signal handler t_handler.enabled = true;