2 changes: 1 addition & 1 deletion tests/integration/defs/accuracy/README.md
@@ -125,7 +125,7 @@ meta-llama/Llama-3.1-8B-Instruct:

The first item is the default accuracy specification (i.e., using the original Hugging Face model data type and no quantization), and its reference accuracy is 68.17. The second item is an accuracy specification with FP8 GEMM quantization, with a slightly lower reference accuracy of 67.93. The third item is a specification with both FP8 GEMM and KV cache quantization, with a further slightly lower reference accuracy of 67.87.
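To make this concrete, here is a hedged sketch of the same three entries as the Python structure that `yaml.safe_load` would return (the field names follow the lookups in `accuracy_core.py`; the exact quantization spellings such as "FP8" are assumptions):

reference = {
    "meta-llama/Llama-3.1-8B-Instruct": [
        {"accuracy": 68.17},  # default: original HF dtype, no quantization
        {"quant_algo": "FP8", "accuracy": 67.93},  # FP8 GEMM
        {"quant_algo": "FP8", "kv_cache_quant_algo": "FP8",
         "accuracy": 67.87},  # FP8 GEMM + FP8 KV cache
    ]
}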

Model data type and quantization determine the precision of model computation, so accuracy differences can be *justified* if different data types or quantizations are used. Hence, they are the most typical components of accuracy specifications. Please see the other categories of accuracy specifications documented in `AccuracyTask.get_num_samples_and_threshold` in [accuracy_core.py](./accuracy_core.py). Note that we exclude most inference features, such as parallelism, because in theory they should not affect model accuracy. Viewed from the opposite perspective: if enabling tensor parallelism results in a statistically significant accuracy loss, we may need to check whether an accuracy bug exists.
Model data type and quantization determine the precision of model computation, so accuracy differences can be *justified* if different data types or quantizations are used. Hence, they are the most typical components of accuracy specifications. Please see the other categories of accuracy specifications documented in `AccuracyTask.get_hypothesis_testing_params` in [accuracy_core.py](./accuracy_core.py). Note that we exclude most inference features, such as parallelism, because in theory they should not affect model accuracy. Viewed from the opposite perspective: if enabling tensor parallelism results in a statistically significant accuracy loss, we may need to check whether an accuracy bug exists.

A direct implication is that multiple test cases with different features may share the same accuracy reference. This is by design. For example, we should expect a test case with tensor parallelism to have very similar accuracy to its single-GPU counterpart.

139 changes: 95 additions & 44 deletions tests/integration/defs/accuracy/accuracy_core.py
@@ -16,6 +16,7 @@
import math
import os
import tempfile
from dataclasses import dataclass, field
from typing import Dict, List, Optional, Union

import pytest
@@ -66,6 +67,57 @@ def compute_threshold(num_samples: int,
    return ref_accuracy - z_alpha * scale
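For readers tracing the statistics, the following self-contained sketch shows the one-sided z-test arithmetic these helpers implement; the standard-error form `scale = sigma / sqrt(num_samples)` and the theta formula are assumptions inferred from the return statement above and the "minimum detectable effect" label in the report below:

import math
from statistics import NormalDist

def sketch_threshold(num_samples, ref_accuracy, sigma=50.0, alpha=0.05):
    z_alpha = NormalDist().inv_cdf(1 - alpha)  # one-sided critical value
    scale = sigma / math.sqrt(num_samples)  # standard error of the mean (assumed)
    return ref_accuracy - z_alpha * scale  # higher_is_better case

def sketch_theta(num_samples, sigma=50.0, alpha=0.05, beta=0.2):
    # minimum detectable effect at significance alpha and power 1 - beta (assumed form)
    z = NormalDist().inv_cdf
    return (z(1 - alpha) + z(1 - beta)) * sigma / math.sqrt(num_samples)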


@dataclass(slots=True)
class HypothesisTestingParams:
    ref_accuracy: float
    num_samples: int
    alpha: float = 0.05
    beta: float = 0.2
    sigma: float = 50.0
    higher_is_better: bool = True
    theta: float = field(init=False)
    threshold: float = field(init=False)

    def __post_init__(self) -> None:
        self.theta = compute_theta(self.num_samples,
                                   sigma=self.sigma,
                                   alpha=self.alpha,
                                   beta=self.beta)
        self.threshold = compute_threshold(
            self.num_samples,
            self.ref_accuracy,
            sigma=self.sigma,
            alpha=self.alpha,
            higher_is_better=self.higher_is_better)

    def report(self, accuracy: Optional[float] = None) -> str:
        report = f"""===========================================================
= ACCURACY HYPOTHESIS TESTING
===========================================================
Alpha (Type I: False Positive): {self.alpha:.3f}
Beta (Type II: False Negative): {self.beta:.3f}
Sigma (Standard deviation): {self.sigma:.3f}
#Samples: {self.num_samples}
Higher is better: {self.higher_is_better}
Theta (Minimum detectable effect): {self.theta:.3f}
Reference accuracy: {self.ref_accuracy:.3f}
Threshold: {self.threshold:.3f}
==========================================================="""
        if accuracy is not None:
            report = f"""{report}
Evaluated accuracy: {accuracy:.3f}
==========================================================="""
        return report

    def assert_passing(self, accuracy: float) -> None:
        compare_op = ">=" if self.higher_is_better else "<="
        err_msg = f"Reference accuracy is {self.ref_accuracy:.3f}, threshold is {self.threshold:.3f}. Expected accuracy {compare_op} threshold, but got {accuracy:.3f}. Please see hypothesis testing report:\n{self.report(accuracy)}"
        if self.higher_is_better:
            assert accuracy >= self.threshold, err_msg
        else:
            assert accuracy <= self.threshold, err_msg

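A quick usage sketch of the dataclass above (values are illustrative; `theta` and `threshold` are derived in `__post_init__`, so they are available immediately after construction):

params = HypothesisTestingParams(ref_accuracy=68.17, num_samples=512)
print(params.theta, params.threshold)  # computed on construction
print(params.report(accuracy=68.02))  # report including the evaluated accuracy
params.assert_passing(68.02)  # raises AssertionError if the threshold is violated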

class AccuracyTask:
    REFERENCE_DIR = f"{os.path.dirname(__file__)}/references"

@@ -93,8 +145,9 @@ def __init__(self, model_name: str):
        with open(f"{self.REFERENCE_DIR}/{self.DATASET}.yaml") as f:
            self.reference: List[dict] = yaml.safe_load(f).get(model_name, [])

    def get_num_samples_and_threshold(self, **acc_specs):
        """Get num_samples and threshold via accuracy specifications.
    def get_hypothesis_testing_params(self,
                                      **acc_specs) -> HypothesisTestingParams:
        """Get hypothesis testing parameters via accuracy specifications.

        Args:
            acc_specs: Accuracy specifications, currently including:
@@ -119,30 +172,14 @@ def get_num_samples_and_threshold(self, **acc_specs):
        else:
            raise ValueError(f"Not registered specs: {acc_specs}.")

        accuracy = entry.get("accuracy")
        alpha = entry.get("alpha", self.ALPHA)
        beta = entry.get("beta", self.BETA)
        sigma = entry.get("sigma", self.SIGMA)
        num_samples = entry.get("num_samples", self.NUM_SAMPLES)
        higher_is_better = entry.get("higher_is_better", self.HIGHER_IS_BETTER)
        theta = compute_theta(num_samples, sigma=sigma, alpha=alpha, beta=beta)
        threshold = compute_threshold(num_samples,
                                      accuracy,
                                      sigma=sigma,
                                      alpha=alpha,
                                      higher_is_better=higher_is_better)
        print("===========================================================\n"
              "= ACCURACY HYPOTHESIS TESTING\n"
              "===========================================================\n"
              f"Alpha (Type I: False Positive): {alpha:.3f}\n"
              f"Beta (Type II: False Negative): {beta:.3f}\n"
              f"Sigma (Standard deviation): {sigma:.3f}\n"
              f"#Samples: {num_samples}\n"
              f"Theta (Minimum detectable effect): {theta:.3f}\n"
              f"Reference accuracy: {accuracy:.3f}\n"
              f"Threshold: {threshold:.3f}\n"
              "===========================================================\n")
        return num_samples, threshold
        return HypothesisTestingParams(
            ref_accuracy=entry.get("accuracy"),
            alpha=entry.get("alpha", self.ALPHA),
            beta=entry.get("beta", self.BETA),
            sigma=entry.get("sigma", self.SIGMA),
            num_samples=entry.get("num_samples", self.NUM_SAMPLES),
            higher_is_better=entry.get("higher_is_better",
                                       self.HIGHER_IS_BETTER))
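For context, a hedged sketch of how a test might resolve its parameters (the spec keys mirror the call sites below; the concrete model key and spec values are illustrative):

task = CnnDailymail("meta-llama/Llama-3.1-8B-Instruct")
params = task.get_hypothesis_testing_params(dtype="auto",
                                            quant_algo="FP8",
                                            kv_cache_quant_algo=None,
                                            spec_dec_algo=None,
                                            extra_acc_spec=None)
print(params.report())  # report banner without an evaluated accuracy line yet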

    def evaluate(self,
                 llm: Union[LLM, PyTorchLLM, AutoDeployLLM],
@@ -165,13 +202,13 @@ def evaluate(self,
        is_integration_test = os.getenv('INTEGRATION_TEST', '0') == '1'

        if is_integration_test:
            num_samples = 1
            logger.info(
                "Running in INTEGRATION_TEST mode: using only 1 sample and skipping accuracy verification"
            )
            threshold = 0
            hypothesis_testing_params = HypothesisTestingParams(ref_accuracy=0,
                                                                num_samples=1)
        else:
            num_samples, threshold = self.get_num_samples_and_threshold(
            hypothesis_testing_params = self.get_hypothesis_testing_params(
                dtype=llm.args.dtype,
                quant_algo=llm.args.quant_config.quant_algo,
                kv_cache_quant_algo=llm.args.quant_config.kv_cache_quant_algo,
@@ -193,17 +230,19 @@ def evaluate(self,
            evaluator_kwargs.update(self.EVALUATOR_KWARGS)
        if extra_evaluator_kwargs is not None:
            evaluator_kwargs.update(extra_evaluator_kwargs)
        evaluator = self.EVALUATOR_CLS(num_samples=num_samples,
                                       **evaluator_kwargs)
        evaluator = self.EVALUATOR_CLS(
            num_samples=hypothesis_testing_params.num_samples,
            **evaluator_kwargs)
        evaluate_kwargs = {}
        if hasattr(self, 'EVALUATE_KWARGS'):
            evaluate_kwargs.update(self.EVALUATE_KWARGS)
        accuracy = evaluator.evaluate(llm, sampling_params, streaming,
                                      **evaluate_kwargs)
        if self.HIGHER_IS_BETTER:
            assert accuracy >= threshold, f"Expected accuracy >= {threshold}, but got {accuracy}."
        else:
            assert accuracy <= threshold, f"Expected accuracy <= {threshold}, but got {accuracy}."

        logger.info(
            f"Hypothesis testing report:\n{hypothesis_testing_params.report(accuracy)}"
        )
        hypothesis_testing_params.assert_passing(accuracy)


class CnnDailymail(AccuracyTask):
@@ -457,7 +496,7 @@ def initialize_case(self,
        self.env = env

    def convert(self):
        print("Converting model to TensorRT-LLM checkpoint...")
        logger.info("Converting model to TensorRT-LLM checkpoint...")

        is_prequantized = False
        for quant_config_file in [
@@ -559,7 +598,7 @@ def convert(self):
        venv_check_call(self.llm_venv, convert_cmd)

    def build(self):
        print("Building engines...")
        logger.info("Building engines...")
        max_batch_size = max(task.MAX_BATCH_SIZE for task in self.tasks)
        max_input_len = max(task.MAX_INPUT_LEN for task in self.tasks)
        max_seq_len = max(task.MAX_INPUT_LEN + task.MAX_OUTPUT_LEN
@@ -578,7 +617,7 @@ def build(self):
        check_call(" ".join(build_cmd), shell=True, env=self.llm_venv._new_env)

    def summarize(self, task: AccuracyTask):
        print("Running summarize...")
        logger.info("Running summarize...")
        summarize_cmd = [
            f"{self.llm_root}/examples/summarize.py",
            f"--engine_dir={self.engine_dir}",
@@ -595,12 +634,16 @@ def summarize(self, task: AccuracyTask):
            "--no_add_special_tokens"
        ])

        num_samples, threshold = task.get_num_samples_and_threshold(
        hypothesis_testing_params = task.get_hypothesis_testing_params(
            dtype=self.dtype,
            quant_algo=self.quant_algo,
            kv_cache_quant_algo=self.kv_cache_quant_algo,
            spec_dec_algo=self.spec_dec_algo,
            extra_acc_spec=self.extra_acc_spec)
        logger.info(
            f"Hypothesis testing report:\n{hypothesis_testing_params.report()}")
        num_samples = hypothesis_testing_params.num_samples
        threshold = hypothesis_testing_params.threshold

        if num_samples < task.MAX_BATCH_SIZE:
            max_ite = 1
@@ -642,13 +685,17 @@ def summarize(self, task: AccuracyTask):
            str(world_size), "--allow-run-as-root"], summarize_cmd)

    def mmlu(self, task: AccuracyTask):
        print("Running mmlu...")
        num_samples, threshold = task.get_num_samples_and_threshold(
        logger.info("Running mmlu...")
        hypothesis_testing_params = task.get_hypothesis_testing_params(
            dtype=self.dtype,
            quant_algo=self.quant_algo,
            kv_cache_quant_algo=self.kv_cache_quant_algo,
            spec_dec_algo=self.spec_dec_algo,
            extra_acc_spec=self.extra_acc_spec)
        logger.info(
            f"Hypothesis testing report:\n{hypothesis_testing_params.report()}")
        num_samples = hypothesis_testing_params.num_samples
        threshold = hypothesis_testing_params.threshold

        mmlu_cmd = [
            "trtllm-eval",
@@ -669,27 +716,31 @@ def mmlu(self, task: AccuracyTask):
check_call(" ".join(mmlu_cmd), shell=True, env=self.llm_venv._new_env)

    def eval_long_context(self, task: AccuracyTask):
        print("Running construct_synthetic_dataset...")
        logger.info("Running construct_synthetic_dataset...")
        data_gen_cmd = [
            f"{self.llm_root}/examples/infinitebench/construct_synthetic_dataset.py",
            "--test_case=build_passkey", f"--test_level={task.LEVEL}"
        ]
        venv_check_call(self.llm_venv, data_gen_cmd)

        print("Running eval_long_context...")
        logger.info("Running eval_long_context...")
        eval_cmd = [
            f"{self.llm_root}/examples/eval_long_context.py", "--task=passkey",
            f"--engine_dir={self.engine_dir}",
            f"--tokenizer_dir={self.MODEL_PATH}",
            f"--max_input_length={task.MAX_INPUT_LEN}",
            "--enable_chunked_context"
        ]
        num_samples, threshold = task.get_num_samples_and_threshold(
        hypothesis_testing_params = task.get_hypothesis_testing_params(
            dtype=self.dtype,
            quant_algo=self.quant_algo,
            kv_cache_quant_algo=self.kv_cache_quant_algo,
            spec_dec_algo=self.spec_dec_algo,
            extra_acc_spec=self.extra_acc_spec)
        logger.info(
            f"Hypothesis testing report:\n{hypothesis_testing_params.report()}")
        num_samples = hypothesis_testing_params.num_samples
        threshold = hypothesis_testing_params.threshold

        batch_size = min(task.MAX_BATCH_SIZE, num_samples)
        eval_cmd.extend([