From afbecdb4c7d13ad9fa9e7e73dc3d3ecdc8464041 Mon Sep 17 00:00:00 2001
From: Eran Geva <19514940+MrGeva@users.noreply.github.com>
Date: Tue, 30 Sep 2025 11:37:44 +0000
Subject: [PATCH 1/2] lock gpu clock

Signed-off-by: Eran Geva <19514940+MrGeva@users.noreply.github.com>
---
 tests/integration/defs/perf/gpu_clock_lock.py | 101 +++++++++++++++++-
 1 file changed, 98 insertions(+), 3 deletions(-)

diff --git a/tests/integration/defs/perf/gpu_clock_lock.py b/tests/integration/defs/perf/gpu_clock_lock.py
index 56873565848..a39a3095c39 100644
--- a/tests/integration/defs/perf/gpu_clock_lock.py
+++ b/tests/integration/defs/perf/gpu_clock_lock.py
@@ -111,6 +111,10 @@ def __init__(self, gpu_id, interval_ms):
         self._is_monitoring = False
         self._state_data = []
 
+        # Fields for clock locking
+        self._original_clocks = {}
+        self._clocks_locked = False
+
     def get_os_properties(self):
         return self._os_properties
 
@@ -136,9 +140,92 @@ def get_target_gpu_clocks(self):
         """
         Get the target GPU clocks (sm_clk and mem_clk) for the first GPU in the list.
         """
-        # We don't set gpu clock currently, so let it return None.
+        if self._gpu_handles and len(self._gpu_handles) > 0:
+            try:
+                # Get maximum supported clocks for the first GPU
+                handle = self._gpu_handles[0]
+                max_sm_clk = pynvml.nvmlDeviceGetMaxClockInfo(
+                    handle, pynvml.NVML_CLOCK_SM)
+                max_mem_clk = pynvml.nvmlDeviceGetMaxClockInfo(
+                    handle, pynvml.NVML_CLOCK_MEM)
+                return (max_sm_clk, max_mem_clk)
+            except pynvml.NVMLError as e:
+                print_warning(f"Failed to get max clock info: {e}")
+                return None
         return None
 
+    def _lock_gpu_clocks(self):
+        """
+        Lock GPU clocks to maximum supported frequencies for consistent performance.
+        """
+        if self._mobile_disable_clock_locking:
+            print_info("Clock locking disabled for mobile/Jetson devices")
+            return
+
+        if not self._gpu_handles:
+            print_warning("No GPU handles available for clock locking")
+            return
+
+        target_clocks = self.get_target_gpu_clocks()
+        if not target_clocks:
+            print_warning("Could not determine target GPU clocks")
+            return
+
+        target_sm_clk, target_mem_clk = target_clocks
+
+        for gpu_idx, handle in enumerate(self._gpu_handles):
+            try:
+                # Store original clocks for restoration later
+                original_sm_clk = pynvml.nvmlDeviceGetApplicationsClock(
+                    handle, pynvml.NVML_CLOCK_SM)
+                original_mem_clk = pynvml.nvmlDeviceGetApplicationsClock(
+                    handle, pynvml.NVML_CLOCK_MEM)
+                self._original_clocks[gpu_idx] = (original_sm_clk,
+                                                  original_mem_clk)
+
+                # Set application clocks to maximum supported values
+                pynvml.nvmlDeviceSetApplicationsClocks(handle, target_mem_clk,
+                                                       target_sm_clk)
+                print_info(
+                    f"GPU {gpu_idx}: Locked clocks to SM={target_sm_clk}MHz, MEM={target_mem_clk}MHz"
+                )
+
+            except pynvml.NVMLError as e:
+                print_warning(f"Failed to lock clocks for GPU {gpu_idx}: {e}")
+                # Try to continue with other GPUs
+                continue
+
+        self._clocks_locked = True
+
+    def _unlock_gpu_clocks(self):
+        """
+        Restore GPU clocks to their original values.
+        """
+        if not self._clocks_locked or not self._gpu_handles:
+            return
+
+        for gpu_idx, handle in enumerate(self._gpu_handles):
+            try:
+                if gpu_idx in self._original_clocks:
+                    original_sm_clk, original_mem_clk = self._original_clocks[
+                        gpu_idx]
+                    pynvml.nvmlDeviceSetApplicationsClocks(
+                        handle, original_mem_clk, original_sm_clk)
+                    print_info(
+                        f"GPU {gpu_idx}: Restored clocks to SM={original_sm_clk}MHz, MEM={original_mem_clk}MHz"
+                    )
+                else:
+                    # Reset to default clocks if we don't have original values
+                    pynvml.nvmlDeviceResetApplicationsClocks(handle)
+                    print_info(f"GPU {gpu_idx}: Reset clocks to default")
+
+            except pynvml.NVMLError as e:
+                print_warning(
+                    f"Failed to restore clocks for GPU {gpu_idx}: {e}")
+
+        self._clocks_locked = False
+        self._original_clocks = {}
+
     def __enter__(self):
         """
         Do all the steps needed at the start of a test case:
@@ -154,6 +241,10 @@ def __enter__(self):
             for gpu_id in self._gpu_id_list
         ]
         print_info(f"Reinitialized GPU handles: {self._gpu_handles}")
+
+        # Lock GPU clocks for consistent performance
+        self._lock_gpu_clocks()
+
         self.start_monitor()
         return self
 
@@ -165,6 +256,10 @@ def __exit__(self, *args):
         - Validate gpu monitoring result.
         """
         self.stop_monitor()
+
+        # Restore original GPU clocks
+        self._unlock_gpu_clocks()
+
         self.validate_gpu_monitoring_data()
 
         print_info("gpu clock lock exit!!!")
@@ -233,8 +328,8 @@ def teardown(self):
         Call when the session finishes. Reset GPU clocks back to its original state.
         """
         # Revert clocks back to normal if all tests have finished.
-        # Set current clock value back to session entry clock.
-        #self.release_clock()
+        self._unlock_gpu_clocks()
+
         if self._nvml_initialized:
             pynvml.nvmlShutdown()
             self._nvml_initialized = False

From fa7abefe3b2a3e87b57b71e2667174a7b8646bf5 Mon Sep 17 00:00:00 2001
From: Eran Geva <19514940+MrGeva@users.noreply.github.com>
Date: Tue, 30 Sep 2025 15:11:55 +0000
Subject: [PATCH 2/2] rollback on exception

Signed-off-by: Eran Geva <19514940+MrGeva@users.noreply.github.com>
---
 tests/integration/defs/perf/gpu_clock_lock.py | 84 +++++++++++++++----
 1 file changed, 70 insertions(+), 14 deletions(-)

diff --git a/tests/integration/defs/perf/gpu_clock_lock.py b/tests/integration/defs/perf/gpu_clock_lock.py
index a39a3095c39..99f19a4eb95 100644
--- a/tests/integration/defs/perf/gpu_clock_lock.py
+++ b/tests/integration/defs/perf/gpu_clock_lock.py
@@ -157,6 +157,9 @@ def get_target_gpu_clocks(self):
     def _lock_gpu_clocks(self):
         """
         Lock GPU clocks to maximum supported frequencies for consistent performance.
+
+        Implements fail-fast semantics: if any GPU fails to lock, all operations
+        are rolled back and an exception is raised.
         """
         if self._mobile_disable_clock_locking:
             print_info("Clock locking disabled for mobile/Jetson devices")
             return
@@ -169,33 +172,86 @@ def _lock_gpu_clocks(self):
         target_clocks = self.get_target_gpu_clocks()
         if not target_clocks:
             print_warning("Could not determine target GPU clocks")
-            return
+            raise GPUClockLockFailFastError(
+                "Could not determine target GPU clocks")
 
         target_sm_clk, target_mem_clk = target_clocks
 
+        # Phase 1: Retrieve original clocks for all GPUs (fail-fast if any fails)
+        original_clocks_backup = {}
         for gpu_idx, handle in enumerate(self._gpu_handles):
             try:
-                # Store original clocks for restoration later
                 original_sm_clk = pynvml.nvmlDeviceGetApplicationsClock(
                     handle, pynvml.NVML_CLOCK_SM)
                 original_mem_clk = pynvml.nvmlDeviceGetApplicationsClock(
                     handle, pynvml.NVML_CLOCK_MEM)
-                self._original_clocks[gpu_idx] = (original_sm_clk,
-                                                  original_mem_clk)
-
-                # Set application clocks to maximum supported values
-                pynvml.nvmlDeviceSetApplicationsClocks(handle, target_mem_clk,
-                                                       target_sm_clk)
+                original_clocks_backup[gpu_idx] = (original_sm_clk,
+                                                   original_mem_clk)
                 print_info(
-                    f"GPU {gpu_idx}: Locked clocks to SM={target_sm_clk}MHz, MEM={target_mem_clk}MHz"
+                    f"GPU {gpu_idx}: Retrieved original clocks SM={original_sm_clk}MHz, MEM={original_mem_clk}MHz"
                 )
-
             except pynvml.NVMLError as e:
-                print_warning(f"Failed to lock clocks for GPU {gpu_idx}: {e}")
-                # Try to continue with other GPUs
-                continue
+                print_error(
+                    f"Failed to retrieve original clocks for GPU {gpu_idx}: {e}"
+                )
+                raise GPUClockLockFailFastError(
+                    f"Failed to retrieve original clocks for GPU {gpu_idx}: {e}"
+                )
 
-        self._clocks_locked = True
+        # Phase 2: Apply clock locks to all GPUs (fail-fast if any fails)
+        locked_gpus = []
+        try:
+            for gpu_idx, handle in enumerate(self._gpu_handles):
+                try:
+                    pynvml.nvmlDeviceSetApplicationsClocks(
+                        handle, target_mem_clk, target_sm_clk)
+                    locked_gpus.append(gpu_idx)
+                    print_info(
+                        f"GPU {gpu_idx}: Locked clocks to SM={target_sm_clk}MHz, MEM={target_mem_clk}MHz"
+                    )
+                except pynvml.NVMLError as e:
+                    print_error(f"Failed to lock clocks for GPU {gpu_idx}: {e}")
+                    # Rollback any GPUs that were successfully locked
+                    self._rollback_locked_gpus(locked_gpus,
+                                               original_clocks_backup)
+                    raise GPUClockLockFailFastError(
+                        f"Failed to lock clocks for GPU {gpu_idx}: {e}")
+
+            # Phase 3: Only mark as locked if all GPUs succeeded
+            self._original_clocks = original_clocks_backup
+            self._clocks_locked = True
+            print_info(
+                f"Successfully locked clocks on {len(locked_gpus)} GPU(s)")
+
+        except Exception:
+            # Ensure we don't leave any GPUs in a locked state
+            if locked_gpus:
+                self._rollback_locked_gpus(locked_gpus, original_clocks_backup)
+            raise
+
+    def _rollback_locked_gpus(self, locked_gpu_indices, original_clocks_backup):
+        """
+        Rollback clock locks for specific GPUs to their original values.
+
+        Args:
+            locked_gpu_indices: List of GPU indices that were successfully locked
+            original_clocks_backup: Dictionary of original clock values for each GPU
+        """
+        for gpu_idx in locked_gpu_indices:
+            if gpu_idx < len(
+                    self._gpu_handles) and gpu_idx in original_clocks_backup:
+                try:
+                    handle = self._gpu_handles[gpu_idx]
+                    original_sm_clk, original_mem_clk = original_clocks_backup[
+                        gpu_idx]
+                    pynvml.nvmlDeviceSetApplicationsClocks(
+                        handle, original_mem_clk, original_sm_clk)
+                    print_info(
+                        f"GPU {gpu_idx}: Rolled back clocks to SM={original_sm_clk}MHz, MEM={original_mem_clk}MHz"
+                    )
+                except pynvml.NVMLError as e:
+                    print_warning(
+                        f"Failed to rollback clocks for GPU {gpu_idx}: {e}")
 
     def _unlock_gpu_clocks(self):
         """