diff --git a/tensorrt_llm/bench/dataclasses/reporting.py b/tensorrt_llm/bench/dataclasses/reporting.py
index 0abe02000b4..2006f65e81b 100755
--- a/tensorrt_llm/bench/dataclasses/reporting.py
+++ b/tensorrt_llm/bench/dataclasses/reporting.py
@@ -206,8 +206,7 @@ def __init__(self,
                          self.get_max_draft_len())
         self.streaming = streaming
 
-    @staticmethod
-    def _query_gpu_info() -> Dict[str, Any]:
+    def _query_gpu_info(self) -> Dict[str, Any]:
         """Query first GPU info (all GPUs must be identical for TRT-LLM)."""
         if not torch.cuda.is_available():
             return None
@@ -227,13 +226,22 @@ def _query_gpu_info() -> Dict[str, Any]:
                 None,
             }
             if pynvml:
-                # Memory clock information is not reported by torch, using NVML instead
-                handle = pynvml.nvmlDeviceGetHandleByIndex(physical_idx)
-                gpu_info["clocks.mem"] = pynvml.nvmlDeviceGetMaxClockInfo(
-                    handle, pynvml.NVML_CLOCK_MEM) / 1000.0
-            return gpu_info
-        except (RuntimeError, AssertionError):
+                try:
+                    # Memory clock information is not reported by torch, using NVML instead
+                    pynvml.nvmlInit()
+                    handle = pynvml.nvmlDeviceGetHandleByIndex(physical_idx)
+                    clocks_mem = pynvml.nvmlDeviceGetMaxClockInfo(
+                        handle, pynvml.NVML_CLOCK_MEM) / 1000.0
+                    gpu_info["clocks.mem"] = clocks_mem
+                except pynvml.NVMLError as e:
+                    self.logger.info(
+                        f"Error querying GPU clock info with NVML: {e}")
+                    gpu_info["clocks.mem"] = None
+        except Exception as e:
+            # broad catch for any other errors, since this is a non-critical operation
+            self.logger.warning(f"Error querying GPU info: {e}")
             return None
+        return gpu_info
 
     @staticmethod
     def convert_to_ms(ns: float) -> float:
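
For reference, here is a minimal standalone sketch of the NVML query pattern the patch adopts: initialize NVML, read the max memory clock, and degrade gracefully on `pynvml.NVMLError` rather than failing the run. The function name `query_mem_clock_ghz`, the `device_index` parameter, and the `nvmlShutdown` cleanup are illustrative additions, not part of `reporting.py`; only the pynvml calls themselves (`nvmlInit`, `nvmlDeviceGetHandleByIndex`, `nvmlDeviceGetMaxClockInfo`) mirror what the patch uses.

```python
"""Standalone sketch of the NVML query pattern used in the patch above."""
from typing import Optional

import pynvml


def query_mem_clock_ghz(device_index: int = 0) -> Optional[float]:
    """Return the device's max memory clock in GHz, or None on failure."""
    try:
        pynvml.nvmlInit()
    except pynvml.NVMLError:
        # NVML unavailable (e.g. no driver); treat as non-critical.
        return None
    try:
        handle = pynvml.nvmlDeviceGetHandleByIndex(device_index)
        # NVML reports clocks in MHz; divide by 1000 for GHz,
        # matching the units used in the patch.
        return pynvml.nvmlDeviceGetMaxClockInfo(
            handle, pynvml.NVML_CLOCK_MEM) / 1000.0
    except pynvml.NVMLError:
        return None
    finally:
        pynvml.nvmlShutdown()


if __name__ == "__main__":
    print(f"Max memory clock: {query_mem_clock_ghz()} GHz")
```

The behavioral point of the patch is the same split shown here: an NVML failure only blanks the `clocks.mem` field (logged at info level), while the outer broad catch preserves the old fallback of returning `None` for the whole query, since GPU info is non-critical to the benchmark.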