2 changes: 1 addition & 1 deletion tests/integration/defs/accuracy/README.md
@@ -125,7 +125,7 @@ meta-llama/Llama-3.1-8B-Instruct:

The first item is the default accuracy specification (i.e., using the original Hugging Face model data type and no quantization), and its reference accuracy is 68.17. The second item is an accuracy specification with FP8 GEMM quantization, with a slightly lower reference accuracy of 67.93. The third item is a specification with both FP8 GEMM and KV cache quantization, with a further slightly lower reference accuracy of 67.87.
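To make this concrete, here is a hedged sketch of the same three entries as the Python structure that `yaml.safe_load` would return (the field names follow the lookups in `accuracy_core.py`; the exact quantization spellings such as "FP8" are assumptions):

reference = {
    "meta-llama/Llama-3.1-8B-Instruct": [
        {"accuracy": 68.17},  # default: original HF dtype, no quantization
        {"quant_algo": "FP8", "accuracy": 67.93},  # FP8 GEMM
        {"quant_algo": "FP8", "kv_cache_quant_algo": "FP8",
         "accuracy": 67.87},  # FP8 GEMM + FP8 KV cache
    ]
}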

Model data type and quantization determine the precision of model computation, so accuracy differences can be *justified* if different data types or quantizations are used. Hence, they are the most typical components of accuracy specifications. Please see the other categories of accuracy specifications documented in `AccuracyTask.get_num_samples_and_threshold` in [accuracy_core.py](./accuracy_core.py). Note that we exclude most inference features, such as parallelism, because in theory they should not affect model accuracy. Viewed from the opposite perspective: if enabling tensor parallelism results in a statistically significant accuracy loss, we may need to check whether an accuracy bug exists.
Model data type and quantization determine the precision of model computation, so accuracy differences can be *justified* if different data types or quantizations are used. Hence, they are the most typical components of accuracy specifications. Please see the other categories of accuracy specifications documented in `AccuracyTask.get_hypothesis_testing_params` in [accuracy_core.py](./accuracy_core.py). Note that we exclude most inference features, such as parallelism, because in theory they should not affect model accuracy. Viewed from the opposite perspective: if enabling tensor parallelism results in a statistically significant accuracy loss, we may need to check whether an accuracy bug exists.

A direct implication is that multiple test cases with different features may share the same accuracy reference. This is by design. For example, we should expect a test case with tensor parallelism to have very similar accuracy to its single-GPU counterpart.

139 changes: 95 additions & 44 deletions tests/integration/defs/accuracy/accuracy_core.py
@@ -16,6 +16,7 @@
import math
import os
import tempfile
from dataclasses import dataclass, field
from typing import Dict, List, Optional, Union

import pytest
@@ -66,6 +67,57 @@ def compute_threshold(num_samples: int,
    return ref_accuracy - z_alpha * scale
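For readers tracing the statistics, the following self-contained sketch shows the one-sided z-test arithmetic these helpers implement; the standard-error form `scale = sigma / sqrt(num_samples)` and the theta formula are assumptions inferred from the return statement above and the "minimum detectable effect" label in the report below:

import math
from statistics import NormalDist

def sketch_threshold(num_samples, ref_accuracy, sigma=50.0, alpha=0.05):
    z_alpha = NormalDist().inv_cdf(1 - alpha)  # one-sided critical value
    scale = sigma / math.sqrt(num_samples)  # standard error of the mean (assumed)
    return ref_accuracy - z_alpha * scale  # higher_is_better case

def sketch_theta(num_samples, sigma=50.0, alpha=0.05, beta=0.2):
    # minimum detectable effect at significance alpha and power 1 - beta (assumed form)
    z = NormalDist().inv_cdf
    return (z(1 - alpha) + z(1 - beta)) * sigma / math.sqrt(num_samples)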


@dataclass(slots=True)
class HypothesisTestingParams:
    ref_accuracy: float
    num_samples: int
    alpha: float = 0.05
    beta: float = 0.2
    sigma: float = 50.0
    higher_is_better: bool = True
    theta: float = field(init=False)
    threshold: float = field(init=False)

    def __post_init__(self) -> None:
        self.theta = compute_theta(self.num_samples,
                                   sigma=self.sigma,
                                   alpha=self.alpha,
                                   beta=self.beta)
        self.threshold = compute_threshold(
            self.num_samples,
            self.ref_accuracy,
            sigma=self.sigma,
            alpha=self.alpha,
            higher_is_better=self.higher_is_better)

    def report(self, accuracy: Optional[float] = None) -> str:
        report = f"""===========================================================
= ACCURACY HYPOTHESIS TESTING
===========================================================
Alpha (Type I: False Positive): {self.alpha:.3f}
Beta (Type II: False Negative): {self.beta:.3f}
Sigma (Standard deviation): {self.sigma:.3f}
#Samples: {self.num_samples}
Higher is better: {self.higher_is_better}
Theta (Minimum detectable effect): {self.theta:.3f}
Reference accuracy: {self.ref_accuracy:.3f}
Threshold: {self.threshold:.3f}
==========================================================="""
        if accuracy is not None:
            report = f"""{report}
Evaluated accuracy: {accuracy:.3f}
==========================================================="""
        return report

    def assert_passing(self, accuracy: float) -> None:
        compare_op = ">=" if self.higher_is_better else "<="
        err_msg = f"Reference accuracy is {self.ref_accuracy:.3f}, threshold is {self.threshold:.3f}. Expected accuracy {compare_op} threshold, but got {accuracy:.3f}. Please see hypothesis testing report:\n{self.report(accuracy)}"
        if self.higher_is_better:
            assert accuracy >= self.threshold, err_msg
        else:
            assert accuracy <= self.threshold, err_msg

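A quick usage sketch of the dataclass above (values are illustrative; `theta` and `threshold` are derived in `__post_init__`, so they are available immediately after construction):

params = HypothesisTestingParams(ref_accuracy=68.17, num_samples=512)
print(params.theta, params.threshold)  # computed on construction
print(params.report(accuracy=68.02))  # report including the evaluated accuracy
params.assert_passing(68.02)  # raises AssertionError if the threshold is violated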

class AccuracyTask:
    REFERENCE_DIR = f"{os.path.dirname(__file__)}/references"

@@ -93,8 +145,9 @@ def __init__(self, model_name: str):
        with open(f"{self.REFERENCE_DIR}/{self.DATASET}.yaml") as f:
            self.reference: List[dict] = yaml.safe_load(f).get(model_name, [])

    def get_num_samples_and_threshold(self, **acc_specs):
        """Get num_samples and threshold via accuracy specifications.
    def get_hypothesis_testing_params(self,
                                      **acc_specs) -> HypothesisTestingParams:
        """Get hypothesis testing parameters via accuracy specifications.

        Args:
            acc_specs: Accuracy specifications, currently including:
@@ -119,30 +172,14 @@ def get_num_samples_and_threshold(self, **acc_specs):
        else:
            raise ValueError(f"Not registered specs: {acc_specs}.")

        accuracy = entry.get("accuracy")
        alpha = entry.get("alpha", self.ALPHA)
        beta = entry.get("beta", self.BETA)
        sigma = entry.get("sigma", self.SIGMA)
        num_samples = entry.get("num_samples", self.NUM_SAMPLES)
        higher_is_better = entry.get("higher_is_better", self.HIGHER_IS_BETTER)
        theta = compute_theta(num_samples, sigma=sigma, alpha=alpha, beta=beta)
        threshold = compute_threshold(num_samples,
                                      accuracy,
                                      sigma=sigma,
                                      alpha=alpha,
                                      higher_is_better=higher_is_better)
        print("===========================================================\n"
              "= ACCURACY HYPOTHESIS TESTING\n"
              "===========================================================\n"
              f"Alpha (Type I: False Positive): {alpha:.3f}\n"
              f"Beta (Type II: False Negative): {beta:.3f}\n"
              f"Sigma (Standard deviation): {sigma:.3f}\n"
              f"#Samples: {num_samples}\n"
              f"Theta (Minimum detectable effect): {theta:.3f}\n"
              f"Reference accuracy: {accuracy:.3f}\n"
              f"Threshold: {threshold:.3f}\n"
              "===========================================================\n")
        return num_samples, threshold
        return HypothesisTestingParams(
            ref_accuracy=entry.get("accuracy"),
            alpha=entry.get("alpha", self.ALPHA),
            beta=entry.get("beta", self.BETA),
            sigma=entry.get("sigma", self.SIGMA),
            num_samples=entry.get("num_samples", self.NUM_SAMPLES),
            higher_is_better=entry.get("higher_is_better",
                                       self.HIGHER_IS_BETTER))
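For context, a hedged sketch of how a test might resolve its parameters (the spec keys mirror the call sites below; the concrete model key and spec values are illustrative):

task = CnnDailymail("meta-llama/Llama-3.1-8B-Instruct")
params = task.get_hypothesis_testing_params(dtype="auto",
                                            quant_algo="FP8",
                                            kv_cache_quant_algo=None,
                                            spec_dec_algo=None,
                                            extra_acc_spec=None)
print(params.report())  # report banner without an evaluated accuracy line yet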

    def evaluate(self,
                 llm: Union[LLM, PyTorchLLM, AutoDeployLLM],
@@ -165,13 +202,13 @@ def evaluate(self,
        is_integration_test = os.getenv('INTEGRATION_TEST', '0') == '1'

        if is_integration_test:
            num_samples = 1
            logger.info(
                "Running in INTEGRATION_TEST mode: using only 1 sample and skipping accuracy verification"
            )
            threshold = 0
            hypothesis_testing_params = HypothesisTestingParams(ref_accuracy=0,
                                                                num_samples=1)
        else:
            num_samples, threshold = self.get_num_samples_and_threshold(
            hypothesis_testing_params = self.get_hypothesis_testing_params(
                dtype=llm.args.dtype,
                quant_algo=llm.args.quant_config.quant_algo,
                kv_cache_quant_algo=llm.args.quant_config.kv_cache_quant_algo,
@@ -193,17 +230,19 @@ def evaluate(self,
            evaluator_kwargs.update(self.EVALUATOR_KWARGS)
        if extra_evaluator_kwargs is not None:
            evaluator_kwargs.update(extra_evaluator_kwargs)
        evaluator = self.EVALUATOR_CLS(num_samples=num_samples,
                                       **evaluator_kwargs)
        evaluator = self.EVALUATOR_CLS(
            num_samples=hypothesis_testing_params.num_samples,
            **evaluator_kwargs)
        evaluate_kwargs = {}
        if hasattr(self, 'EVALUATE_KWARGS'):
            evaluate_kwargs.update(self.EVALUATE_KWARGS)
        accuracy = evaluator.evaluate(llm, sampling_params, streaming,
                                      **evaluate_kwargs)
        if self.HIGHER_IS_BETTER:
            assert accuracy >= threshold, f"Expected accuracy >= {threshold}, but got {accuracy}."
        else:
            assert accuracy <= threshold, f"Expected accuracy <= {threshold}, but got {accuracy}."

        logger.info(
            f"Hypothesis testing report:\n{hypothesis_testing_params.report(accuracy)}"
        )
        hypothesis_testing_params.assert_passing(accuracy)


class CnnDailymail(AccuracyTask):
@@ -457,7 +496,7 @@ def initialize_case(self,
        self.env = env

    def convert(self):
        print("Converting model to TensorRT-LLM checkpoint...")
        logger.info("Converting model to TensorRT-LLM checkpoint...")

        is_prequantized = False
        for quant_config_file in [
@@ -559,7 +598,7 @@ def convert(self):
        venv_check_call(self.llm_venv, convert_cmd)

    def build(self):
        print("Building engines...")
        logger.info("Building engines...")
        max_batch_size = max(task.MAX_BATCH_SIZE for task in self.tasks)
        max_input_len = max(task.MAX_INPUT_LEN for task in self.tasks)
        max_seq_len = max(task.MAX_INPUT_LEN + task.MAX_OUTPUT_LEN
@@ -578,7 +617,7 @@ def build(self):
        check_call(" ".join(build_cmd), shell=True, env=self.llm_venv._new_env)

    def summarize(self, task: AccuracyTask):
        print("Running summarize...")
        logger.info("Running summarize...")
        summarize_cmd = [
            f"{self.llm_root}/examples/summarize.py",
            f"--engine_dir={self.engine_dir}",
@@ -595,12 +634,16 @@ def summarize(self, task: AccuracyTask):
            "--no_add_special_tokens"
        ])

        num_samples, threshold = task.get_num_samples_and_threshold(
        hypothesis_testing_params = task.get_hypothesis_testing_params(
            dtype=self.dtype,
            quant_algo=self.quant_algo,
            kv_cache_quant_algo=self.kv_cache_quant_algo,
            spec_dec_algo=self.spec_dec_algo,
            extra_acc_spec=self.extra_acc_spec)
        logger.info(
            f"Hypothesis testing report:\n{hypothesis_testing_params.report()}")
        num_samples = hypothesis_testing_params.num_samples
        threshold = hypothesis_testing_params.threshold

        if num_samples < task.MAX_BATCH_SIZE:
            max_ite = 1
@@ -642,13 +685,17 @@ def summarize(self, task: AccuracyTask):
            str(world_size), "--allow-run-as-root"], summarize_cmd)

    def mmlu(self, task: AccuracyTask):
        print("Running mmlu...")
        num_samples, threshold = task.get_num_samples_and_threshold(
        logger.info("Running mmlu...")
        hypothesis_testing_params = task.get_hypothesis_testing_params(
            dtype=self.dtype,
            quant_algo=self.quant_algo,
            kv_cache_quant_algo=self.kv_cache_quant_algo,
            spec_dec_algo=self.spec_dec_algo,
            extra_acc_spec=self.extra_acc_spec)
        logger.info(
            f"Hypothesis testing report:\n{hypothesis_testing_params.report()}")
        num_samples = hypothesis_testing_params.num_samples
        threshold = hypothesis_testing_params.threshold

        mmlu_cmd = [
            "trtllm-eval",
@@ -669,27 +716,31 @@ def mmlu(self, task: AccuracyTask):
check_call(" ".join(mmlu_cmd), shell=True, env=self.llm_venv._new_env)

    def eval_long_context(self, task: AccuracyTask):
        print("Running construct_synthetic_dataset...")
        logger.info("Running construct_synthetic_dataset...")
        data_gen_cmd = [
            f"{self.llm_root}/examples/infinitebench/construct_synthetic_dataset.py",
            "--test_case=build_passkey", f"--test_level={task.LEVEL}"
        ]
        venv_check_call(self.llm_venv, data_gen_cmd)

        print("Running eval_long_context...")
        logger.info("Running eval_long_context...")
        eval_cmd = [
            f"{self.llm_root}/examples/eval_long_context.py", "--task=passkey",
            f"--engine_dir={self.engine_dir}",
            f"--tokenizer_dir={self.MODEL_PATH}",
            f"--max_input_length={task.MAX_INPUT_LEN}",
            "--enable_chunked_context"
        ]
        num_samples, threshold = task.get_num_samples_and_threshold(
        hypothesis_testing_params = task.get_hypothesis_testing_params(
            dtype=self.dtype,
            quant_algo=self.quant_algo,
            kv_cache_quant_algo=self.kv_cache_quant_algo,
            spec_dec_algo=self.spec_dec_algo,
            extra_acc_spec=self.extra_acc_spec)
        logger.info(
            f"Hypothesis testing report:\n{hypothesis_testing_params.report()}")
        num_samples = hypothesis_testing_params.num_samples
        threshold = hypothesis_testing_params.threshold

        batch_size = min(task.MAX_BATCH_SIZE, num_samples)
        eval_cmd.extend([