diff --git a/projects/hipblaslt/clients/tests/src/CMakeLists.txt b/projects/hipblaslt/clients/tests/src/CMakeLists.txt
index c4d04cb099e5..1bab31d33009 100755
--- a/projects/hipblaslt/clients/tests/src/CMakeLists.txt
+++ b/projects/hipblaslt/clients/tests/src/CMakeLists.txt
@@ -4,6 +4,7 @@
 target_sources(hipblaslt-test
     PRIVATE
         ${CMAKE_CURRENT_SOURCE_DIR}/hipblaslt_gtest_main.cpp
+        ${CMAKE_CURRENT_SOURCE_DIR}/hipblaslt_parallel_test.cpp
         ${CMAKE_CURRENT_SOURCE_DIR}/hipblaslt_test.cpp
         ${CMAKE_CURRENT_SOURCE_DIR}/matmul_gtest.cpp
         ${CMAKE_CURRENT_SOURCE_DIR}/auxiliary_gtest.cpp
diff --git a/projects/hipblaslt/clients/tests/src/hipblaslt_gtest_main.cpp b/projects/hipblaslt/clients/tests/src/hipblaslt_gtest_main.cpp
index 716db24c1f3b..80a21cabddf0 100644
--- a/projects/hipblaslt/clients/tests/src/hipblaslt_gtest_main.cpp
+++ b/projects/hipblaslt/clients/tests/src/hipblaslt_gtest_main.cpp
@@ -29,7 +29,12 @@
 #include "hipblaslt_test.hpp"
 #include "test_cleanup.hpp"
 #include "utility.hpp"
+#ifndef _WIN32
+#include "hipblaslt_parallel_test.hpp"
+#endif
+#include <algorithm>
 #include <string>
+#include <vector>
 
 using namespace testing;
 
@@ -37,6 +42,7 @@ class ConfigurableEventListener : public TestEventListener
 {
     TestEventListener* const eventListener;
     std::atomic_size_t       skipped_tests{0}; // Number of skipped tests.
+    std::atomic_size_t       current_test_number{0}; // Current test number (incremental counter).
 
 public:
     bool showTestCases      = true; // Show the names of each test case.
@@ -86,8 +92,14 @@ class ConfigurableEventListener : public TestEventListener
 
     void OnTestStart(const TestInfo& test_info) override
     {
+        ++current_test_number; // Counted for every started test, even when test names are not shown
         if(showTestNames)
+        {
+            // Prefix the wrapped listener's start line with "[Test #n/total] ", where total is gtest's count of tests selected to run
+            int total_tests = UnitTest::GetInstance()->test_to_run_count();
+            hipblaslt_cout << "[Test #" << current_test_number << "/" << total_tests << "] " << std::flush;
             eventListener->OnTestStart(test_info);
+        }
     }
 
     void OnTestPartResult(const TestPartResult& result) override
@@ -228,6 +240,118 @@ int main(int argc, char** argv)
 {
     std::string args = hipblaslt_capture_args(argc, argv);
 
+    // Check for --help to add our custom options
+    for(int i = 1; i < argc; i++)
+    {
+        std::string arg = argv[i];
+        if(arg == "--help" || arg == "-h" || arg == "-?" || arg == "/?" || arg == "--help-all")
+        {
+            hipblaslt_cout << "\nhipBLASLt Test Options:\n";
+            hipblaslt_cout << "  --num_gpus=N\n";
+            hipblaslt_cout << "  --num_gpus N\n";
+            hipblaslt_cout << "      Run tests in parallel across N GPUs (Unix/Linux only).\n";
+            hipblaslt_cout << "      Tests are automatically split evenly across the specified\n";
+            hipblaslt_cout << "      number of GPUs. Each GPU runs its assigned tests independently.\n";
+            hipblaslt_cout << "      Example: ./hipblaslt-test --num_gpus 8 --gtest_filter=\"*smoke*\"\n";
+            hipblaslt_cout << "      Note: If --gtest_output=json:file.json is specified, per-GPU\n";
+            hipblaslt_cout << "            results are saved as file_gpu0.json, file_gpu1.json, etc.\n";
+            hipblaslt_cout << "\n";
+            break;
+        }
+    }
+
+    // Parse and strip --num_gpus argument
+    int num_gpus = 0;
+    bool has_num_gpus_flag = false;
+    std::vector<int> indices_to_remove;  // Track all indices to remove
+
+    for(int i = 1; i < argc; i++)
+    {
+        std::string arg = argv[i];
+        if(arg.find("--num_gpus=") == 0)
+        {
+            num_gpus = std::atoi(arg.substr(11).c_str());
+            has_num_gpus_flag = true;
+            indices_to_remove.push_back(i);
+            break;
+        }
+        else if(arg == "--num_gpus" && i + 1 < argc)
+        {
+            num_gpus = std::atoi(argv[i + 1]);
+            has_num_gpus_flag = true;
+            indices_to_remove.push_back(i);
+            indices_to_remove.push_back(i + 1);
+            break;
+        }
+    }
+
+    // Parse and strip --gtest_output to extract base filename
+    std::string gtest_output_base;
+    for(int i = 1; i < argc; i++)
+    {
+        std::string arg = argv[i];
+        if(arg.find("--gtest_output=") == 0)
+        {
+            size_t colon_pos = arg.find(":");
+            if(colon_pos != std::string::npos)
+            {
+                // Format: --gtest_output=json:file.json
+                std::string format = arg.substr(15, colon_pos - 15);
+                std::string filename = arg.substr(colon_pos + 1);
+                gtest_output_base = format + ":" + filename;
+            }
+            else
+            {
+                // Format: --gtest_output=json (uses default filename)
+                std::string format = arg.substr(15);
+                if(format == "json")
+                {
+                    gtest_output_base = "json:test_detail.json"; // GTest default
+                }
+                else
+                {
+                    gtest_output_base = format; // Pass through other formats
+                }
+            }
+            indices_to_remove.push_back(i);
+            break;
+        }
+    }
+
+#ifdef _WIN32
+    // On Windows, parallel GPU execution is not supported
+    if(has_num_gpus_flag)
+    {
+        hipblaslt_cerr << "Error: --num_gpus is not supported on Windows." << std::endl;
+        return 1;
+    }
+#else
+    // Check for invalid --num_gpus values
+    if(has_num_gpus_flag && num_gpus <= 1)
+    {
+        hipblaslt_cerr << "Error: --num_gpus requires a value greater than 1." << std::endl;
+        return 1;
+    }
+
+    // If parallel GPUs requested, use parallel execution
+    if(num_gpus > 1)
+    {
+        // Remove custom flags from argv before passing to parallel runner
+        // Sort indices in descending order to remove from back to front
+        std::sort(indices_to_remove.begin(), indices_to_remove.end(), std::greater<int>());
+        for(int idx : indices_to_remove)
+        {
+            for(int i = idx; i + 1 < argc; i++)
+            {
+                argv[i] = argv[i + 1];
+            }
+            argc--;
+        }
+
+        return run_tests_parallel_gpus(argc, argv, num_gpus, gtest_output_base);
+    }
+#endif
+
     // Set signal handler
     hipblaslt_test_sigaction();
 
@@ -256,7 +380,7 @@ int main(int argc, char** argv)
     // Failures printed at end for reporting so repeat version info
     hipblaslt_print_version();
 
-    // end test results with command line
+    // Print command line at the end
     hipblaslt_print_args(args);
 
     //hipblaslt_shutdown();
diff --git a/projects/hipblaslt/clients/tests/src/hipblaslt_parallel_test.cpp b/projects/hipblaslt/clients/tests/src/hipblaslt_parallel_test.cpp
new file mode 100644
index 000000000000..ab3a20105a55
--- /dev/null
+++ b/projects/hipblaslt/clients/tests/src/hipblaslt_parallel_test.cpp
@@ -0,0 +1,419 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (C) 2022-2025 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ *******************************************************************************/
+
+#include "hipblaslt_parallel_test.hpp"
+#include "utility.hpp"
+
+#ifndef _WIN32
+#include <vector>
+#include <string>
+#include <sstream>
+#include <fstream>
+#include <thread>
+#include <cstdio>
+#include <signal.h>
+#include <sys/wait.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <unistd.h>
+#include <hip/hip_runtime.h>
+
+// Child-side half of the parallel runner: configure the environment for shard `gpu` of `num_gpus`, redirect
+// stdout/stderr to log_file, then exec the test binary with a per-GPU --gtest_output. Never returns.
+[[noreturn]] void run_child_shard(int argc, char** argv, int gpu, int num_gpus,
+                                  const std::string& log_file,
+                                  const std::string& gtest_output_base)
+{
+    // Pin this shard to one GPU. NOTE(review): overwrites any HIP_VISIBLE_DEVICES the parent was started with - confirm intended
+    std::string gpu_env = std::to_string(gpu);
+    setenv("HIP_VISIBLE_DEVICES", gpu_env.c_str(), 1);
+
+    // Give each shard an equal share (at least 1) of OMP_NUM_THREADS, or of the hardware threads if it is unset
+    const char* env_threads = getenv("OMP_NUM_THREADS");
+    int current_threads = env_threads ? std::atoi(env_threads) : std::thread::hardware_concurrency();
+    int threads_per_gpu = std::max(1, current_threads / num_gpus);
+    setenv("OMP_NUM_THREADS", std::to_string(threads_per_gpu).c_str(), 1);
+
+    // Use Google Test's built-in sharding: shard index == GPU index
+    setenv("GTEST_TOTAL_SHARDS", std::to_string(num_gpus).c_str(), 1);
+    setenv("GTEST_SHARD_INDEX", std::to_string(gpu).c_str(), 1);
+
+    // Redirect output to the log file to avoid interleaved output (best effort: on failure the inherited streams are kept)
+    int fd = open(log_file.c_str(), O_WRONLY | O_CREAT | O_TRUNC, 0644);
+    if(fd >= 0)
+    {
+        dup2(fd, STDOUT_FILENO);
+        dup2(fd, STDERR_FILENO);
+        close(fd);
+    }
+
+    // Build argv - argv is already clean (custom flags removed by main)
+    std::vector<const char*> new_argv;
+    std::vector<std::string> arg_storage; // Owns the rewritten --gtest_output string that new_argv points into
+
+    new_argv.push_back(argv[0]);
+
+    // Pass through all arguments from the clean argv
+    for(int i = 1; i < argc; i++)
+    {
+        new_argv.push_back(argv[i]);
+    }
+
+    // Add per-GPU gtest_output if it was specified
+    if(!gtest_output_base.empty())
+    {
+        size_t colon_pos = gtest_output_base.find(":");
+        if(colon_pos != std::string::npos)
+        {
+            // Format: json:file.json
+            std::string format = gtest_output_base.substr(0, colon_pos);
+            std::string filename = gtest_output_base.substr(colon_pos + 1);
+
+            // Insert GPU number before the file extension; a '.' inside a directory component is not an extension
+            size_t dot_pos = filename.rfind(".");
+            std::string new_filename;
+            if(dot_pos != std::string::npos && filename.find('/', dot_pos) == std::string::npos)
+            {
+                new_filename = filename.substr(0, dot_pos) + "_gpu" +
+                             std::to_string(gpu) + filename.substr(dot_pos);
+            }
+            else
+            {
+                new_filename = filename + "_gpu" + std::to_string(gpu);
+            }
+
+            std::string new_output_arg = "--gtest_output=" + format + ":" + new_filename;
+            arg_storage.push_back(new_output_arg);
+            new_argv.push_back(arg_storage.back().c_str());
+        }
+        else
+        {
+            // Bare format: name the file per GPU so shards do not all overwrite gtest's default test_detail.<format>
+            std::string new_output_arg = "--gtest_output=" + gtest_output_base + ":test_detail_gpu" + std::to_string(gpu) + "." + gtest_output_base;
+            arg_storage.push_back(new_output_arg);
+            new_argv.push_back(arg_storage.back().c_str());
+        }
+    }
+
+    new_argv.push_back(nullptr);
+
+    // Execute the test binary (execvp only returns on failure)
+    execvp(argv[0], const_cast<char* const*>(new_argv.data()));
+
+    // If exec fails (this message goes to the log file when the redirect above succeeded)
+    hipblaslt_cerr << "Failed to exec for GPU " << gpu << std::endl;
+    _exit(1); // not exit(): a forked child must not run the parent's atexit handlers or static destructors
+}
+
+// Run the test binary as one child process per GPU, wait for all of them, and print a combined summary.
+// argc/argv must already have --num_gpus and --gtest_output stripped. Returns 0 only if every shard exits with 0.
+int run_tests_parallel_gpus(int argc, char** argv, int num_gpus,
+                            const std::string& gtest_output_base)
+{
+    hipblaslt_cout << "\n========================================" << std::endl;
+    hipblaslt_cout << "Parallel GPU Execution Mode" << std::endl;
+    hipblaslt_cout << "Running tests across " << num_gpus << " GPUs" << std::endl;
+    hipblaslt_cout << "========================================\n" << std::endl;
+
+    // Clamp the request to the number of GPUs HIP reports
+    int available_gpus = 0;
+    if(hipGetDeviceCount(&available_gpus) != hipSuccess || available_gpus < 1)
+    {
+        hipblaslt_cerr << "Error: No GPUs detected" << std::endl;
+        return 1;
+    }
+
+    if(num_gpus > available_gpus)
+    {
+        hipblaslt_cerr << "Warning: Requested " << num_gpus << " GPUs but only "
+                       << available_gpus << " available. Using " << available_gpus << " GPUs."
+                       << std::endl;
+        num_gpus = available_gpus;
+    }
+
+    // Display sharding information
+    hipblaslt_cout << "Tests will be sharded across " << num_gpus << " GPUs" << std::endl;
+
+    // Display the OpenMP thread split (same computation each child performs in run_child_shard)
+    const char* env_threads = getenv("OMP_NUM_THREADS");
+    int current_threads = env_threads ? std::atoi(env_threads) : std::thread::hardware_concurrency();
+    int threads_per_gpu = std::max(1, current_threads / num_gpus);
+    hipblaslt_cout << "OpenMP threads per GPU: " << threads_per_gpu
+                   << " (total available: " << current_threads << ")" << std::endl;
+    hipblaslt_cout << std::endl;
+
+    // Split tests across GPUs using Google Test's built-in sharding
+    std::vector<pid_t> child_pids;
+    std::vector<std::string> output_files;
+    bool fork_failed = false;
+
+    for(int gpu = 0; gpu < num_gpus; gpu++)
+    {
+        std::string output_file = "/tmp/hipblaslt_gpu" + std::to_string(gpu) + "_" +
+                                  std::to_string(getpid()) + ".log";
+        output_files.push_back(output_file);
+
+        hipblaslt_cout << "GPU " << gpu << ": Starting shard " << gpu << std::endl;
+
+        pid_t pid = fork();
+        if(pid == 0)
+        {
+            // Child process (run_child_shard never returns)
+            run_child_shard(argc, argv, gpu, num_gpus, output_file, gtest_output_base);
+        }
+        else if(pid > 0)
+        {
+            // Parent process
+            child_pids.push_back(pid);
+        }
+        else
+        {
+            // Fork failed - need to clean up already-forked children
+            hipblaslt_cerr << "Error: Failed to fork for GPU " << gpu << std::endl;
+            fork_failed = true;
+            break;
+        }
+    }
+
+    // If fork failed, terminate and wait for any already-started children
+    if(fork_failed)
+    {
+        hipblaslt_cerr << "Terminating " << child_pids.size() << " already-started child processes..." << std::endl;
+
+        // Send SIGTERM to all children
+        for(pid_t pid : child_pids)
+        {
+            kill(pid, SIGTERM);
+        }
+
+        // Wait for all children to terminate (reaps them; the status itself is not used)
+        for(pid_t pid : child_pids)
+        {
+            int status = 0;
+            waitpid(pid, &status, 0);
+        }
+
+        return 1;
+    }
+
+    // Wait for all children and collect results (without printing yet)
+    hipblaslt_cout << "\nWaiting for all GPUs to complete..." << std::endl;
+
+    int total_exit_code = 0;
+    int gpus_passed     = 0;
+    int gpus_failed     = 0;
+    int total_tests_ran    = 0;
+    int total_tests_passed = 0;
+    int total_tests_failed = 0;
+    std::vector<int> exit_codes(child_pids.size());
+    std::vector<bool> normal_exit(child_pids.size());
+    std::vector<std::string> gpu_summaries(child_pids.size());
+    std::vector<int> gpu_tests_ran(child_pids.size(), 0);
+    std::vector<int> gpu_tests_passed(child_pids.size(), 0);
+    std::vector<int> gpu_tests_failed(child_pids.size(), 0);
+    std::vector<double> gpu_time_ms(child_pids.size(), 0.0);
+
+    // Wait for ALL children first before printing any results
+    for(size_t i = 0; i < child_pids.size(); i++)
+    {
+        int status = 0;
+        const bool reaped = waitpid(child_pids[i], &status, 0) == child_pids[i];
+
+        if(reaped && WIFEXITED(status))
+        {
+            normal_exit[i] = true;
+            exit_codes[i]  = WEXITSTATUS(status);
+            if(exit_codes[i] == 0)
+            {
+                gpus_passed++;
+            }
+            else
+            {
+                gpus_failed++;
+                total_exit_code = 1;
+            }
+        }
+        else
+        {
+            normal_exit[i] = false;
+            exit_codes[i]  = -1;
+            gpus_failed++;
+            total_exit_code = 1;
+        }
+
+        // Extract summary from log file and parse test counts
+        std::ifstream log_file(output_files[i]);
+        if(log_file.is_open())
+        {
+            std::string line;
+            std::vector<std::string> summary_lines;
+            bool in_summary = false;
+
+            while(std::getline(log_file, line))
+            {
+                // Parse total tests ran: "[==========] 87 tests from 1 test suite ran." (gtest prints "1 test" in the singular)
+                if(line.find("[==========]") != std::string::npos && line.find(" test") != std::string::npos && line.find("ran.") != std::string::npos)
+                {
+                    in_summary = true;
+                    // Extract number of tests
+                    size_t pos = line.find("]");
+                    if(pos != std::string::npos)
+                    {
+                        std::istringstream iss(line.substr(pos + 1));
+                        iss >> gpu_tests_ran[i];
+                    }
+                }
+
+                // Parse passed tests: "[  PASSED  ] 87 tests." or "[  PASSED  ] 1 test."
+                if(line.find("[  PASSED  ]") != std::string::npos && line.find(" test") != std::string::npos)
+                {
+                    size_t pos = line.find("]");
+                    if(pos != std::string::npos)
+                    {
+                        std::istringstream iss(line.substr(pos + 1));
+                        iss >> gpu_tests_passed[i];
+                    }
+                }
+
+                // Parse failed tests: only the "[  FAILED  ] N test(s), listed below:" count line, never per-test "[  FAILED  ] name" lines
+                if(line.find("[  FAILED  ]") != std::string::npos && (line.find(" test, listed below:") != std::string::npos || line.find(" tests, listed below:") != std::string::npos))
+                {
+                    size_t pos = line.find("]");
+                    if(pos != std::string::npos)
+                    {
+                        std::istringstream iss(line.substr(pos + 1));
+                        iss >> gpu_tests_failed[i];
+                    }
+                }
+
+                // Parse time: "[==========] 87 tests from 1 test suite ran. (1234 ms total)"
+                if(line.find("[==========]") != std::string::npos && line.find("ms total") != std::string::npos)
+                {
+                    size_t pos = line.find("(");
+                    if(pos != std::string::npos)
+                    {
+                        std::istringstream iss(line.substr(pos + 1));
+                        iss >> gpu_time_ms[i];
+                    }
+                }
+
+                if(in_summary)
+                {
+                    summary_lines.push_back(line);
+                }
+            }
+            log_file.close();
+
+            // Store the summary for this GPU
+            if(!summary_lines.empty())
+            {
+                for(const auto& summary_line : summary_lines)
+                {
+                    gpu_summaries[i] += summary_line + "\n";
+                }
+            }
+
+            // Accumulate totals
+            total_tests_ran += gpu_tests_ran[i];
+            total_tests_passed += gpu_tests_passed[i];
+            total_tests_failed += gpu_tests_failed[i];
+        }
+    }
+
+    // Now print all results together
+    hipblaslt_cout << "\n========================================" << std::endl;
+    hipblaslt_cout << "Parallel GPU Test Summary" << std::endl;
+    hipblaslt_cout << "========================================\n" << std::endl;
+
+    // Print individual GPU results with their GTest summaries
+    for(size_t i = 0; i < child_pids.size(); i++)
+    {
+        hipblaslt_cout << "GPU " << i << ":" << std::endl;
+
+        if(!gpu_summaries[i].empty())
+        {
+            hipblaslt_cout << gpu_summaries[i];
+        }
+        else if(normal_exit[i])
+        {
+            hipblaslt_cout << "  Exit code: " << exit_codes[i] << std::endl;
+        }
+        else
+        {
+            hipblaslt_cout << "  Terminated abnormally" << std::endl;
+        }
+        hipblaslt_cout << std::endl;
+    }
+
+    // Calculate average time over the shards whose log contained a "(N ms total)" line
+    double total_time_ms = 0.0;
+    int valid_times = 0;
+    for(size_t i = 0; i < gpu_time_ms.size(); i++)
+    {
+        if(gpu_time_ms[i] > 0.0)
+        {
+            total_time_ms += gpu_time_ms[i];
+            valid_times++;
+        }
+    }
+    double avg_time_ms = (valid_times > 0) ? (total_time_ms / valid_times) : 0.0;
+
+    hipblaslt_cout << "----------------------------------------" << std::endl;
+    hipblaslt_cout << "OVERALL SUMMARY (across all GPUs):" << std::endl;
+    hipblaslt_cout << "Total tests run:  " << total_tests_ran << std::endl;
+    hipblaslt_cout << "Total PASSED:     " << total_tests_passed << std::endl;
+    hipblaslt_cout << "Total FAILED:     " << total_tests_failed << std::endl;
+    if(avg_time_ms > 0.0)
+    {
+        hipblaslt_cout << "Average time:     " << avg_time_ms << " ms" << std::endl;
+    }
+    hipblaslt_cout << "\nGPU Summary:" << std::endl;
+    hipblaslt_cout << "GPUs used:        " << num_gpus << std::endl;
+    hipblaslt_cout << "GPUs passed:      " << gpus_passed << std::endl;
+    hipblaslt_cout << "GPUs failed:      " << gpus_failed << std::endl;
+
+    // Clean up log files on success to avoid accumulation in /tmp on CI systems
+    if(total_exit_code == 0)
+    {
+        hipblaslt_cout << "\nAll tests passed - cleaning up log files" << std::endl;
+        for(size_t i = 0; i < output_files.size(); i++)
+        {
+            remove(output_files[i].c_str());
+        }
+    }
+    else
+    {
+        hipblaslt_cout << "\nLog files saved in:" << std::endl;
+        for(size_t i = 0; i < output_files.size(); i++)
+        {
+            hipblaslt_cout << "  GPU " << i << ": " << output_files[i] << std::endl;
+        }
+    }
+    hipblaslt_cout << "========================================\n" << std::endl;
+
+    return total_exit_code;
+}
+#endif // _WIN32
diff --git a/projects/hipblaslt/clients/tests/src/hipblaslt_parallel_test.hpp b/projects/hipblaslt/clients/tests/src/hipblaslt_parallel_test.hpp
new file mode 100644
index 000000000000..b96a8fb0f39b
--- /dev/null
+++ b/projects/hipblaslt/clients/tests/src/hipblaslt_parallel_test.hpp
@@ -0,0 +1,36 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (C) 2022-2025 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ *******************************************************************************/
+
+#pragma once
+
+#ifndef _WIN32
+#include <string>
+// Run the tests as num_gpus concurrent child processes (one per GPU), split with Google Test's built-in sharding.
+// argc/argv should already have --num_gpus and --gtest_output stripped; gtest_output_base is the parsed
+// --gtest_output value (empty if not specified). Returns 0 only if every shard exits with status 0, else 1.
+int run_tests_parallel_gpus(int argc, char** argv, int num_gpus,
+                            const std::string& gtest_output_base);
+#endif
diff --git a/projects/hipblaslt/clients/tests/src/hipblaslt_test.cpp b/projects/hipblaslt/clients/tests/src/hipblaslt_test.cpp
index a8538ea957f3..84522a9bbaf2 100644
--- a/projects/hipblaslt/clients/tests/src/hipblaslt_test.cpp
+++ b/projects/hipblaslt/clients/tests/src/hipblaslt_test.cpp
@@ -145,6 +145,22 @@ static thread_local struct
     volatile sig_atomic_t signal;
 } t_handler;
 
+// Per-test timeout settings, read from environment variables once when this object is initialized
+static const struct
+{
+    unsigned seconds;        // Timeout in seconds from HIPBLASLT_TEST_TIMEOUT; 600 if the variable is unset or does not parse as unsigned
+    bool     continue_tests; // True when HIPBLASLT_TEST_TIMEOUT_CONTINUE is set to ANY value (even empty or "0"); the SIGALRM handler then skips its abort path
+} test_timeout_config = [] {
+    constexpr unsigned DEFAULT_TIMEOUT = 600; // seconds
+    const char*        timeout_env      = getenv("HIPBLASLT_TEST_TIMEOUT");
+    unsigned           timeout_val; // only read below after sscanf reports one successful conversion
+
+    return decltype(test_timeout_config){
+        (timeout_env && sscanf(timeout_env, "%u", &timeout_val) == 1) ? timeout_val
+                                                                        : DEFAULT_TIMEOUT,
+        getenv("HIPBLASLT_TEST_TIMEOUT_CONTINUE") != nullptr};
+}();
+
 // Signal handler (must have external "C" linkage)
 extern "C" void hipblaslt_test_signal_handler(int sig)
 {
@@ -162,16 +178,22 @@ extern "C" void hipblaslt_test_signal_handler(int sig)
     }
 
 #ifndef _WIN32
-    // If this is an alarm timeout, we abort
+    // If this is an alarm timeout, check if we should abort or continue
     if(sig == SIGALRM)
     {
-        static constexpr char msg[]
-            = "\nAborting tests due to an alarm timeout.\n\n"
-              "This could be due to a deadlock caused by mutexes being left locked\n"
-              "after a previous test's signal was caught and partially recovered from.\n";
-        // We must use write() because it's async-signal-safe and other IO might be blocked
-        write(STDERR_FILENO, msg, sizeof(msg) - 1);
-        hipblaslt_abort();
+        if(!test_timeout_config.continue_tests)
+        {
+            // Original behavior: abort entire test run on timeout
+            static constexpr char msg[]
+                = "\nAborting tests due to an alarm timeout.\n\n"
+                  "This could be due to a deadlock caused by mutexes being left locked\n"
+                  "after a previous test's signal was caught and partially recovered from.\n"
+                  "Set HIPBLASLT_TEST_TIMEOUT_CONTINUE=1 to continue with remaining tests instead.\n";
+            // We must use write() because it's async-signal-safe and other IO might be blocked
+            write(STDERR_FILENO, msg, sizeof(msg) - 1);
+            hipblaslt_abort();
+        }
+        // If continue_tests is true, fall through to treat like other signals
     }
 #endif
 
@@ -205,14 +227,6 @@ void hipblaslt_test_sigaction()
 #endif
 }
 
-static const unsigned test_timeout = [] {
-    // Number of seconds each test is allowed to take before all testing is killed.
-    constexpr unsigned TEST_TIMEOUT = 600;
-    unsigned           timeout;
-    const char*        env = getenv("HIPBLASLT_TEST_TIMEOUT");
-    return env && sscanf(env, "%u", &timeout) == 1 ? timeout : TEST_TIMEOUT;
-}();
-
 // Lambda wrapper which detects signals and exceptions in an invokable function
 void catch_signals_and_exceptions_as_failures(std::function<void()> test, bool set_alarm)
 {
@@ -223,11 +237,19 @@ void catch_signals_and_exceptions_as_failures(std::function<void()> test, bool s
     // Set up the return point, and handle siglongjmp returning back to here
     if(sigsetjmp(t_handler.sigjmp_buf_, true))
     {
+        // Provide clear message for timeout vs other signals
+        if(t_handler.signal == SIGALRM)
+        {
+            FAIL() << "Test exceeded timeout of " << test_timeout_config.seconds << " seconds";
+        }
+        else
+        {
 #if (__GLIBC__ < 2) || (__GLIBC__ == 2 && __GLIBC_MINOR__ < 32)
-        FAIL() << "Received " << sys_siglist[t_handler.signal] << " signal";
+            FAIL() << "Received " << sys_siglist[t_handler.signal] << " signal";
 #else
-        FAIL() << "Received " << sigdescr_np(t_handler.signal) << " signal";
+            FAIL() << "Received " << sigdescr_np(t_handler.signal) << " signal";
 #endif
+        }
     }
 #else
     if(setjmp(t_handler.sigjmp_buf_))
@@ -240,7 +262,7 @@ void catch_signals_and_exceptions_as_failures(std::function<void()> test, bool s
 #ifndef _WIN32
         // Alarm to detect deadlocks or hangs
         if(set_alarm)
-            alarm(test_timeout);
+            alarm(test_timeout_config.seconds);
 #endif
         // Enable the signal handler
         t_handler.enabled = true;