157 changes: 154 additions & 3 deletions tests/integration/defs/perf/gpu_clock_lock.py
@@ -111,6 +111,10 @@ def __init__(self, gpu_id, interval_ms):
self._is_monitoring = False
self._state_data = []

# Fields for clock locking
self._original_clocks = {}
self._clocks_locked = False

def get_os_properties(self):
return self._os_properties

@@ -136,9 +140,148 @@ def get_target_gpu_clocks(self):
"""
Get the target GPU clocks (sm_clk and mem_clk) for the first GPU in the list.
"""
# We don't set gpu clock currently, so let it return None.
if self._gpu_handles and len(self._gpu_handles) > 0:
try:
# Get maximum supported clocks for the first GPU
handle = self._gpu_handles[0]
max_sm_clk = pynvml.nvmlDeviceGetMaxClockInfo(
handle, pynvml.NVML_CLOCK_SM)
max_mem_clk = pynvml.nvmlDeviceGetMaxClockInfo(
handle, pynvml.NVML_CLOCK_MEM)
return (max_sm_clk, max_mem_clk)
except pynvml.NVMLError as e:
print_warning(f"Failed to get max clock info: {e}")
return None
return None

def _lock_gpu_clocks(self):
"""
Lock GPU clocks to maximum supported frequencies for consistent performance.

Implements fail-fast semantics: if any GPU fails to lock, all operations
are rolled back and an exception is raised.
"""
if self._mobile_disable_clock_locking:
print_info("Clock locking disabled for mobile/Jetson devices")
return

if not self._gpu_handles:
print_warning("No GPU handles available for clock locking")
return

target_clocks = self.get_target_gpu_clocks()
if not target_clocks:
print_warning("Could not determine target GPU clocks")
raise GPUClockLockFailFastError(
"Could not determine target GPU clocks")

target_sm_clk, target_mem_clk = target_clocks

# Phase 1: Retrieve original clocks for all GPUs (fail-fast if any fails)
original_clocks_backup = {}
for gpu_idx, handle in enumerate(self._gpu_handles):
try:
original_sm_clk = pynvml.nvmlDeviceGetApplicationsClock(
handle, pynvml.NVML_CLOCK_SM)
original_mem_clk = pynvml.nvmlDeviceGetApplicationsClock(
handle, pynvml.NVML_CLOCK_MEM)
original_clocks_backup[gpu_idx] = (original_sm_clk,
original_mem_clk)
print_info(
f"GPU {gpu_idx}: Retrieved original clocks SM={original_sm_clk}MHz, MEM={original_mem_clk}MHz"
)
except pynvml.NVMLError as e:
print_error(
f"Failed to retrieve original clocks for GPU {gpu_idx}: {e}"
)
raise GPUClockLockFailFastError(
f"Failed to retrieve original clocks for GPU {gpu_idx}: {e}"
)

# Phase 2: Apply clock locks to all GPUs (fail-fast if any fails)
locked_gpus = []
try:
for gpu_idx, handle in enumerate(self._gpu_handles):
try:
pynvml.nvmlDeviceSetApplicationsClocks(
handle, target_mem_clk, target_sm_clk)
locked_gpus.append(gpu_idx)
print_info(
f"GPU {gpu_idx}: Locked clocks to SM={target_sm_clk}MHz, MEM={target_mem_clk}MHz"
)
except pynvml.NVMLError as e:
print_error(f"Failed to lock clocks for GPU {gpu_idx}: {e}")
# Rollback any GPUs that were successfully locked
self._rollback_locked_gpus(locked_gpus,
original_clocks_backup)
raise GPUClockLockFailFastError(
f"Failed to lock clocks for GPU {gpu_idx}: {e}")

# Phase 3: Only mark as locked if all GPUs succeeded
self._original_clocks = original_clocks_backup
self._clocks_locked = True
print_info(
f"Successfully locked clocks on {len(locked_gpus)} GPU(s)")

except Exception:
# Ensure we don't leave any GPUs in a locked state
if locked_gpus:
self._rollback_locked_gpus(locked_gpus, original_clocks_backup)
raise

def _rollback_locked_gpus(self, locked_gpu_indices, original_clocks_backup):
"""
Rollback clock locks for specific GPUs to their original values.

Args:
locked_gpu_indices: List of GPU indices that were successfully locked
original_clocks_backup: Dictionary of original clock values for each GPU
"""
for gpu_idx in locked_gpu_indices:
if gpu_idx < len(
self._gpu_handles) and gpu_idx in original_clocks_backup:
try:
handle = self._gpu_handles[gpu_idx]
original_sm_clk, original_mem_clk = original_clocks_backup[
gpu_idx]
pynvml.nvmlDeviceSetApplicationsClocks(
handle, original_mem_clk, original_sm_clk)
print_info(
f"GPU {gpu_idx}: Rolled back clocks to SM={original_sm_clk}MHz, MEM={original_mem_clk}MHz"
)
except pynvml.NVMLError as e:
print_warning(
f"Failed to rollback clocks for GPU {gpu_idx}: {e}")

def _unlock_gpu_clocks(self):
"""
Restore GPU clocks to their original values.
"""
if not self._clocks_locked or not self._gpu_handles:
return

for gpu_idx, handle in enumerate(self._gpu_handles):
try:
if gpu_idx in self._original_clocks:
original_sm_clk, original_mem_clk = self._original_clocks[
gpu_idx]
pynvml.nvmlDeviceSetApplicationsClocks(
handle, original_mem_clk, original_sm_clk)
print_info(
f"GPU {gpu_idx}: Restored clocks to SM={original_sm_clk}MHz, MEM={original_mem_clk}MHz"
)
else:
# Reset to default clocks if we don't have original values
pynvml.nvmlDeviceResetApplicationsClocks(handle)
print_info(f"GPU {gpu_idx}: Reset clocks to default")

except pynvml.NVMLError as e:
print_warning(
f"Failed to restore clocks for GPU {gpu_idx}: {e}")

self._clocks_locked = False
self._original_clocks = {}

def __enter__(self):
"""
Do all the steps needed at the start of a test case:
@@ -154,6 +297,10 @@ def __enter__(self):
for gpu_id in self._gpu_id_list
]
print_info(f"Reinitialized GPU handles: {self._gpu_handles}")

# Lock GPU clocks for consistent performance
self._lock_gpu_clocks()

self.start_monitor()
return self

@@ -165,6 +312,10 @@ def __exit__(self, *args):
- Validate gpu monitoring result.
"""
self.stop_monitor()

# Restore original GPU clocks
self._unlock_gpu_clocks()

self.validate_gpu_monitoring_data()
print_info("gpu clock lock exit!!!")

@@ -233,8 +384,8 @@ def teardown(self):
Call when the session finishes. Reset GPU clocks back to their original state.
"""
# Revert clocks back to normal if all tests have finished.
# Set current clock value back to session entry clock.
#self.release_clock()
self._unlock_gpu_clocks()

if self._nvml_initialized:
pynvml.nvmlShutdown()
self._nvml_initialized = False
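
For reference, the NVML application-clock calls that the new `_lock_gpu_clocks()` and `_unlock_gpu_clocks()` helpers build on can be exercised directly. Below is a minimal sketch, assuming a single visible GPU and a process privileged to change applications clocks; the function name and structure here are illustrative, not part of the PR:

```python
import pynvml


def lock_then_restore_clocks(gpu_index=0):
    """Illustrative sketch: pin one GPU to its max SM/MEM clocks, then restore."""
    pynvml.nvmlInit()
    try:
        handle = pynvml.nvmlDeviceGetHandleByIndex(gpu_index)

        # Target clocks: the maximum supported SM and memory frequencies.
        max_sm = pynvml.nvmlDeviceGetMaxClockInfo(handle, pynvml.NVML_CLOCK_SM)
        max_mem = pynvml.nvmlDeviceGetMaxClockInfo(handle, pynvml.NVML_CLOCK_MEM)

        # Back up the current applications clocks so they can be restored later.
        orig_sm = pynvml.nvmlDeviceGetApplicationsClock(handle, pynvml.NVML_CLOCK_SM)
        orig_mem = pynvml.nvmlDeviceGetApplicationsClock(handle, pynvml.NVML_CLOCK_MEM)

        # nvmlDeviceSetApplicationsClocks takes (handle, mem_clk_mhz, sm_clk_mhz).
        pynvml.nvmlDeviceSetApplicationsClocks(handle, max_mem, max_sm)
        try:
            pass  # ... run the performance test while clocks are pinned ...
        finally:
            # Restore the backed-up clocks, or reset to driver defaults instead:
            # pynvml.nvmlDeviceResetApplicationsClocks(handle)
            pynvml.nvmlDeviceSetApplicationsClocks(handle, orig_mem, orig_sm)
    finally:
        pynvml.nvmlShutdown()
```

The PR applies the same sequence per GPU with fail-fast semantics: original clocks are captured for every handle first, locks are applied one by one, and any already-locked devices are rolled back before the error is re-raised.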