diff --git a/onnxruntime/test/common/cuda_op_test_utils.cc b/onnxruntime/test/common/cuda_op_test_utils.cc
index bab4e9a60e2ed..fbd9b0a33c7c0 100644
--- a/onnxruntime/test/common/cuda_op_test_utils.cc
+++ b/onnxruntime/test/common/cuda_op_test_utils.cc
@@ -1,7 +1,9 @@
 // Copyright (c) Microsoft Corporation. All rights reserved.
 // Licensed under the MIT License.
 
-#ifdef USE_CUDA
+#include <iostream>
+
+#if defined(USE_CUDA) || defined(USE_NV)
 #include "cuda_runtime_api.h"
 #endif
 
@@ -13,7 +15,7 @@ int GetCudaArchitecture() {
   // Usually, we test on a single GPU or multiple GPUs of same architecture, so it's fine to cache the result.
   static int cuda_arch = -1;
 
-#ifdef USE_CUDA
+#if defined(USE_CUDA) || defined(USE_NV)
   if (cuda_arch == -1) {
     int current_device_id = 0;
     cudaGetDevice(&current_device_id);
@@ -26,6 +28,15 @@ int GetCudaArchitecture() {
     if (cudaSuccess == cudaGetDeviceProperties(&prop, current_device_id)) {
       cuda_arch = prop.major * 100 + prop.minor * 10;
     }
+
+    // Log GPU compute capability
+    if (cuda_arch == -1) {
+      std::cout << "WARNING: CUDA is not available or failed to initialize" << std::endl;
+    } else {
+      std::cout << "GPU Compute Capability: SM "
+                << cuda_arch / 100 << "." << (cuda_arch % 100) / 10
+                << " (value: " << cuda_arch << ")" << std::endl;
+    }
   }
 #endif
 
diff --git a/onnxruntime/test/providers/nv_tensorrt_rtx/nv_basic_test.cc b/onnxruntime/test/providers/nv_tensorrt_rtx/nv_basic_test.cc
index 5ae610a842679..1a987ab4f411a 100644
--- a/onnxruntime/test/providers/nv_tensorrt_rtx/nv_basic_test.cc
+++ b/onnxruntime/test/providers/nv_tensorrt_rtx/nv_basic_test.cc
@@ -9,6 +9,7 @@
 #include "test/util/include/scoped_env_vars.h"
 #include "test/common/trt_op_test_utils.h"
 #include "test/common/random_generator.h"
+#include "test/common/cuda_op_test_utils.h"
 #include "test/providers/nv_tensorrt_rtx/test_nv_trt_rtx_ep_util.h"
 
 #include
@@ -22,6 +23,21 @@ namespace onnxruntime {
 
 namespace test {
 
+// Helper function to check if GPU is Blackwell (SM 12.0+) or above.
+// Returns true when GetCudaArchitecture() reports a capability of at least 1200 (SM 12.0).
+// Returns false if CUDA is unavailable (GetCudaArchitecture() == -1) or the GPU is below SM 12.0.
+static bool IsBlackwellOrAbove() {
+  constexpr int kBlackwellMinCapability = 1200;  // SM 12.0 = 12 * 100 + 0 * 10
+  int cuda_arch = GetCudaArchitecture();
+
+  // Check if CUDA is available
+  if (cuda_arch == -1) {
+    return false;
+  }
+
+  return cuda_arch >= kBlackwellMinCapability;
+}
+
 TEST(NvExecutionProviderTest, ContextEmbedAndReload) {
   PathString model_name = ORT_TSTR("nv_execution_provider_test.onnx");
   PathString model_name_ctx = ORT_TSTR("nv_execution_provider_test_ctx.onnx");
@@ -442,6 +458,10 @@ TEST(NvExecutionProviderTest, DataTransfer) {
 }
 
 TEST(NvExecutionProviderTest, FP8CustomOpModel) {
+  if (!IsBlackwellOrAbove()) {
+    GTEST_SKIP() << "Test requires SM 12.0+ GPU (Blackwell+)";
+  }
+
   PathString model_name = ORT_TSTR("nv_execution_provider_fp8_quantize_dequantize_test.onnx");
   clearFileIfExists(model_name);
   std::string graph_name = "nv_execution_provider_fp8_quantize_dequantize_graph";
@@ -509,6 +529,10 @@ TEST(NvExecutionProviderTest, FP8CustomOpModel) {
 }
 
 TEST(NvExecutionProviderTest, FP4CustomOpModel) {
+  if (!IsBlackwellOrAbove()) {
+    GTEST_SKIP() << "Test requires SM 12.0+ GPU (Blackwell+)";
+  }
+
   PathString model_name = ORT_TSTR("nv_execution_provider_fp4_dynamic_quantize_test.onnx");
   clearFileIfExists(model_name);
   std::string graph_name = "nv_execution_provider_fp4_dynamic_quantize_graph";