diff --git a/tests/integration/defs/conftest.py b/tests/integration/defs/conftest.py
index b14832795ac..da050518ac8 100644
--- a/tests/integration/defs/conftest.py
+++ b/tests/integration/defs/conftest.py
@@ -667,9 +667,11 @@ def trt_gpu_clock_lock(request):
     gpu_list = get_gpu_device_list()
     gpu_ids = [gpu.split()[1][:-1] for gpu in gpu_list]  # Extract GPU IDs
     gpu_ids_str = ",".join(gpu_ids)
+    enable_clock_locking = request.config.getoption("--enable-gpu-clock-lock")
     gpu_clock_lock = GPUClockLock(
         gpu_id=gpu_ids_str,
         interval_ms=1000.0,
+        enable_clock_locking=enable_clock_locking,
     )
 
     yield gpu_clock_lock
@@ -2138,6 +2140,13 @@ def pytest_addoption(parser):
         help="Path to the output XML file for periodic JUnit XML reporter. "
         "Only used with --periodic-junit.",
     )
+    parser.addoption(
+        "--enable-gpu-clock-lock",
+        action="store_true",
+        default=False,
+        help="Enable GPU clock locking during tests. "
+        "By default, GPU clock locking is disabled.",
+    )
 
 
 @pytest.hookimpl(trylast=True)
diff --git a/tests/integration/defs/perf/gpu_clock_lock.py b/tests/integration/defs/perf/gpu_clock_lock.py
index 33aa34b0d70..5be10d24674 100644
--- a/tests/integration/defs/perf/gpu_clock_lock.py
+++ b/tests/integration/defs/perf/gpu_clock_lock.py
@@ -67,7 +67,7 @@ def __init__(self, gpu_id, gpu_clock, mem_clock, timestamp, graphics_clk,
 
 class GPUClockLock:
 
-    def __init__(self, gpu_id, interval_ms):
+    def __init__(self, gpu_id, interval_ms, enable_clock_locking=False):
         """
         Sets up clock values and tears down every run. At the end of the session call
         teardown to complete session and reset GPU clocks.
@@ -75,6 +75,7 @@ def __init__(self, gpu_id, interval_ms):
         Args:
             gpu_id (str): GPU identifier, either comma-separated UUIDs or comma-separated indices in string.
             interval_ms (float): Interval duration between monitoring samples.
+            enable_clock_locking (bool): If True, enable GPU clock locking. Default is False.
         """
         # Initialize pynvml
         self._nvml_initialized = False
@@ -84,6 +85,7 @@ def __init__(self, gpu_id, interval_ms):
         self._gpu_id = gpu_id
         self._gpu_id_list = [int(id) for id in gpu_id.split(",")]
         self._mobile_disable_clock_locking = False
+        self._enable_clock_locking = enable_clock_locking
 
         # Create GPU handles, one per GPU.
         try:
@@ -207,6 +209,10 @@ def _lock_gpu_clocks(self):
         Implements fail-fast semantics: if any GPU fails to lock, all operations
         are rolled back and an exception is raised.
         """
+        if not self._enable_clock_locking:
+            print_warning("Clock locking disabled; set --enable-gpu-clock-lock")
+            return
+
         if self._mobile_disable_clock_locking:
             print_info("Clock locking disabled for mobile/Jetson devices")
             return
@@ -256,12 +262,20 @@ def _lock_gpu_clocks(self):
                     f"GPU {gpu_idx}: Locked clocks to SM={target_sm_clk}MHz, MEM={target_mem_clk}MHz"
                 )
             except pynvml.NVMLError as e:
-                print_error(f"Failed to lock clocks for GPU {gpu_idx}: {e}")
 
                 # Rollback any GPUs that were successfully locked
                 self._rollback_locked_gpus(locked_gpus, original_clocks_backup)
-                raise GPUClockLockFailFastError(
-                    f"Failed to lock clocks for GPU {gpu_idx}: {e}")
+
+                # Only raise GPUClockLockFailFastError for non-permission errors
+                if isinstance(e, pynvml.NVMLError_NoPermission):
+                    print_warning(
+                        f"Permission denied while locking GPU {gpu_idx}, continuing: {e}"
+                    )
+                else:
+                    print_error(
+                        f"Failed to lock clocks for GPU {gpu_idx}: {e}")
+                    raise GPUClockLockFailFastError(
+                        f"Failed to lock clocks for GPU {gpu_idx}: {e}")
 
         # Phase 3: Only mark as locked if all GPUs succeeded
         self._original_clocks = original_clocks_backup
@@ -421,6 +435,11 @@ def validate_gpu_monitoring_data(self, deviation_perc=0.07, num_entries=3):
         before considering the entire dataset as invalid
 
         """
+        if not self._enable_clock_locking:
+            print_info(
+                "Skipped gpu monitoring validation (clock locking not enabled)")
+            return
+
         if self._mobile_disable_clock_locking:
             print_info("Skipped gpu monitoring validation for mobile board")
             return