# NOTE(review): this chunk arrived as a collapsed git diff for
# tests/integration/defs/perf/gpu_clock_lock.py. Below is the patched
# clock-locking logic reconstructed as conventionally formatted Python.
# These are methods of the GPUClockLock class (class header not visible in
# this chunk). The patch also adds to __init__:
#     self._original_clocks = {}   # gpu_idx -> (sm_clk, mem_clk) backup
#     self._clocks_locked = False  # True only after ALL GPUs locked
# and wires _lock_gpu_clocks() into __enter__ (before start_monitor()),
# _unlock_gpu_clocks() into __exit__ (after stop_monitor()) and into
# teardown() (before nvmlShutdown()).


def get_target_gpu_clocks(self):
    """
    Return the target (maximum supported) clocks for the first GPU.

    Returns:
        Tuple ``(max_sm_clk, max_mem_clk)`` in MHz, or ``None`` when no GPU
        handles are available or NVML cannot report the maximum clocks.
    """
    # Guard clause replaces the original's nested if + duplicated
    # `return None` (the outer one was redundant).
    if not self._gpu_handles:
        return None
    try:
        handle = self._gpu_handles[0]
        max_sm_clk = pynvml.nvmlDeviceGetMaxClockInfo(
            handle, pynvml.NVML_CLOCK_SM)
        max_mem_clk = pynvml.nvmlDeviceGetMaxClockInfo(
            handle, pynvml.NVML_CLOCK_MEM)
        return (max_sm_clk, max_mem_clk)
    except pynvml.NVMLError as e:
        print_warning(f"Failed to get max clock info: {e}")
        return None


def _lock_gpu_clocks(self):
    """
    Lock GPU clocks to maximum supported frequencies for consistent
    performance.

    Fail-fast semantics: if any GPU fails to lock, every GPU that was
    already locked is rolled back to its original clocks and a
    GPUClockLockFailFastError is raised.

    Raises:
        GPUClockLockFailFastError: when target clocks cannot be determined,
            an original clock value cannot be read, or locking fails on any
            GPU.
    """
    # NOTE(review): _mobile_disable_clock_locking is set elsewhere in the
    # class (not visible in this chunk) — presumably in __init__.
    if self._mobile_disable_clock_locking:
        print_info("Clock locking disabled for mobile/Jetson devices")
        return

    if not self._gpu_handles:
        print_warning("No GPU handles available for clock locking")
        return

    target_clocks = self.get_target_gpu_clocks()
    if not target_clocks:
        print_warning("Could not determine target GPU clocks")
        raise GPUClockLockFailFastError(
            "Could not determine target GPU clocks")

    target_sm_clk, target_mem_clk = target_clocks

    # Phase 1: back up original clocks for all GPUs (fail-fast if any
    # read fails — nothing has been modified yet, so no rollback needed).
    original_clocks_backup = {}
    for gpu_idx, handle in enumerate(self._gpu_handles):
        try:
            original_sm_clk = pynvml.nvmlDeviceGetApplicationsClock(
                handle, pynvml.NVML_CLOCK_SM)
            original_mem_clk = pynvml.nvmlDeviceGetApplicationsClock(
                handle, pynvml.NVML_CLOCK_MEM)
            original_clocks_backup[gpu_idx] = (original_sm_clk,
                                               original_mem_clk)
            print_info(
                f"GPU {gpu_idx}: Retrieved original clocks SM={original_sm_clk}MHz, MEM={original_mem_clk}MHz"
            )
        except pynvml.NVMLError as e:
            print_error(
                f"Failed to retrieve original clocks for GPU {gpu_idx}: {e}"
            )
            raise GPUClockLockFailFastError(
                f"Failed to retrieve original clocks for GPU {gpu_idx}: {e}"
            )

    # Phase 2: apply clock locks to all GPUs. Any failure (NVML or
    # otherwise) rolls back the GPUs locked so far — exactly once.
    # BUGFIX: the original code rolled back twice on an NVML failure: the
    # inner handler called _rollback_locked_gpus and then raised, and the
    # outer `except Exception` caught that raise and rolled back again.
    # The single rollback point is now the outer handler.
    locked_gpus = []
    try:
        for gpu_idx, handle in enumerate(self._gpu_handles):
            try:
                # NVML argument order is (handle, memClock, smClock).
                pynvml.nvmlDeviceSetApplicationsClocks(
                    handle, target_mem_clk, target_sm_clk)
                locked_gpus.append(gpu_idx)
                print_info(
                    f"GPU {gpu_idx}: Locked clocks to SM={target_sm_clk}MHz, MEM={target_mem_clk}MHz"
                )
            except pynvml.NVMLError as e:
                print_error(f"Failed to lock clocks for GPU {gpu_idx}: {e}")
                raise GPUClockLockFailFastError(
                    f"Failed to lock clocks for GPU {gpu_idx}: {e}")

        # Phase 3: only mark as locked once every GPU succeeded.
        self._original_clocks = original_clocks_backup
        self._clocks_locked = True
        print_info(
            f"Successfully locked clocks on {len(locked_gpus)} GPU(s)")
    except Exception:
        # Ensure we never leave a subset of GPUs in a locked state.
        if locked_gpus:
            self._rollback_locked_gpus(locked_gpus, original_clocks_backup)
        raise


def _rollback_locked_gpus(self, locked_gpu_indices, original_clocks_backup):
    """
    Restore clock locks for specific GPUs to their original values.

    Best-effort: a failure to restore one GPU is logged as a warning and
    does not prevent restoring the others.

    Args:
        locked_gpu_indices: List of GPU indices that were successfully
            locked.
        original_clocks_backup: Mapping of gpu_idx -> (sm_clk, mem_clk)
            original clock values.
    """
    for gpu_idx in locked_gpu_indices:
        # Defensive bounds/membership check: only restore GPUs we both
        # hold a handle for and have a backup of.
        if gpu_idx < len(
                self._gpu_handles) and gpu_idx in original_clocks_backup:
            try:
                handle = self._gpu_handles[gpu_idx]
                original_sm_clk, original_mem_clk = original_clocks_backup[
                    gpu_idx]
                # NVML argument order is (handle, memClock, smClock).
                pynvml.nvmlDeviceSetApplicationsClocks(
                    handle, original_mem_clk, original_sm_clk)
                print_info(
                    f"GPU {gpu_idx}: Rolled back clocks to SM={original_sm_clk}MHz, MEM={original_mem_clk}MHz"
                )
            except pynvml.NVMLError as e:
                print_warning(
                    f"Failed to rollback clocks for GPU {gpu_idx}: {e}")


def _unlock_gpu_clocks(self):
    """
    Restore GPU clocks to their original values.

    No-op unless _lock_gpu_clocks previously succeeded. GPUs without a
    recorded backup are reset to driver-default application clocks.
    Best-effort: per-GPU restore failures are logged as warnings. Always
    clears the locked state afterwards.
    """
    if not self._clocks_locked or not self._gpu_handles:
        return

    for gpu_idx, handle in enumerate(self._gpu_handles):
        try:
            if gpu_idx in self._original_clocks:
                original_sm_clk, original_mem_clk = self._original_clocks[
                    gpu_idx]
                pynvml.nvmlDeviceSetApplicationsClocks(
                    handle, original_mem_clk, original_sm_clk)
                print_info(
                    f"GPU {gpu_idx}: Restored clocks to SM={original_sm_clk}MHz, MEM={original_mem_clk}MHz"
                )
            else:
                # No backup recorded for this GPU — fall back to the
                # driver's default application clocks.
                pynvml.nvmlDeviceResetApplicationsClocks(handle)
                print_info(f"GPU {gpu_idx}: Reset clocks to default")

        except pynvml.NVMLError as e:
            print_warning(
                f"Failed to restore clocks for GPU {gpu_idx}: {e}")

    self._clocks_locked = False
    self._original_clocks = {}