diff --git a/benchmarks/accuracy/text_to_image/gbench.py b/benchmarks/accuracy/text_to_image/gbench.py
index 2ea02130d6b..aa26b1fcc0b 100644
--- a/benchmarks/accuracy/text_to_image/gbench.py
+++ b/benchmarks/accuracy/text_to_image/gbench.py
@@ -116,10 +116,10 @@ def collect_gebench_generation_summary(output_root: Path) -> dict[str, Any]:
         for lang_dir in sorted(path for path in type_root.iterdir() if path.is_dir()):
             for sample_dir in sorted(path for path in lang_dir.iterdir() if path.is_dir()):
                 expected = sample_dir / "frame5.png" if data_type in {"type2", "type3", "type4"} else None
-                if expected is None:
+                if expected is None or not expected.exists():
+                    # t2i-only runs emit frame0 for type3/type4 instead of the
+                    # six-frame trajectory output, so summarize any image found.
                     expected = find_first_image(sample_dir)
-                elif not expected.exists():
-                    expected = None
                 if expected is None:
                     continue
                 records.append(
@@ -437,6 +437,32 @@ def _build_scoring_prompt(self, task_prompt: str) -> str:
             f"{task_prompt}"
         )
 
+    def _build_t2i_scoring_prompt(self, task_prompt: str) -> str:
+        return (
+            "You are an expert evaluator for text-to-image generation quality.\n"
+            "Evaluate the single generated image against the given instruction.\n\n"
+            "Score these five dimensions from 0 to 5:\n"
+            "- goal: whether the image content matches the instruction accurately\n"
+            "- logic: whether objects, relationships and composition are correct\n"
+            "- cons: whether colors, style and lighting are internally consistent\n"
+            "- ui: whether the overall visual layout and structure looks realistic\n"
+            "- qual: whether the image is visually sharp and artifact-free\n\n"
+            "Be strict: only give 5 if the image is excellent in that dimension. "
+            "Give 3 for acceptable, 1-2 for poor, 0 for completely wrong.\n\n"
+            "Return JSON only. Do not add any prose outside JSON.\n"
+            "Use exactly this schema:\n"
+            "{\n"
+            ' "goal": 0,\n'
+            ' "logic": 0,\n'
+            ' "cons": 0,\n'
+            ' "ui": 0,\n'
+            ' "qual": 0,\n'
+            ' "reasoning": "short explanation"\n'
+            "}\n\n"
+            "Scoring task:\n"
+            f"{task_prompt}"
+        )
+
     def _request_text(self, prompt: str, images: list[Image.Image]) -> str:
         content: list[dict[str, Any]] = [{"type": "text", "text": prompt}]
         for image in images:
@@ -461,14 +487,15 @@ def _request_text(self, prompt: str, images: list[Image.Image]) -> str:
             return "\n".join(part.get("text", "") for part in message_content if part.get("type") == "text")
         return str(message_content)
 
-    def evaluate(self, *, prompt: str, images: list[Image.Image]) -> dict[str, Any]:
-        primary_prompt = self._build_scoring_prompt(prompt)
+    def evaluate(self, *, prompt: str, images: list[Image.Image], t2i_mode: bool = False) -> dict[str, Any]:
+        build = self._build_t2i_scoring_prompt if t2i_mode else self._build_scoring_prompt
+        primary_prompt = build(prompt)
         text = self._request_text(primary_prompt, images)
         try:
             return extract_json_object(text)
         except ValueError:
             retry_prompt = (
-                self._build_scoring_prompt(prompt) + "\n\nYour previous response was not valid JSON. "
+                build(prompt) + "\n\nYour previous response was not valid JSON. "
                 "Return only the JSON object with integer scores."
             )
             retry_text = self._request_text(retry_prompt, images)
@@ -496,10 +523,11 @@ def __init__(
         api_key: str = "EMPTY",
         width: int = 768,
         height: int = 576,
-        num_inference_steps: int = 8,
+        num_inference_steps: int = 50,
         output_compression: int | None = 98,
         guidance_scale: float | None = None,
         seed: int | None = 42,
+        t2i_only: bool = False,
     ):
         self.dataset_root = dataset_root
         self.output_root = output_root
@@ -510,6 +538,7 @@ def __init__(
         self.output_compression = output_compression
         self.guidance_scale = guidance_scale
         self.seed = seed
+        self.t2i_only = t2i_only
         self.client = VllmOmniImageClient(base_url=base_url, api_key=api_key)
 
     def generate(
@@ -544,6 +573,8 @@ def generate(
         return results
 
     def _generate_one(self, data_type: str, sample_spec: GEBenchSampleSpec) -> dict[str, Any] | None:
+        if self.t2i_only and data_type not in {"type3", "type4"}:
+            return None
         sample_path = sample_spec.sample_path
         metadata = sample_spec.metadata
         lang_device = sample_spec.lang_device
@@ -635,6 +666,13 @@ def _generate_one(self, data_type: str, sample_spec: GEBenchSampleSpec) -> dict[
                 )
                 save_image(frame0_path, previous)
 
+            if self.t2i_only:
+                return {
+                    "data_type": data_type,
+                    "sample_name": f"{lang_device}/{sample_name}",
+                    "output_path": str(frame0_path),
+                }
+
             for step_num in range(1, 6):
                 frame_path = output_dir / f"frame{step_num}.png"
                 if frame_path.exists():
@@ -696,10 +734,11 @@ def _generate_one(self, data_type: str, sample_spec: GEBenchSampleSpec) -> dict[
 
 
 class GEBenchEvaluator:
-    def __init__(self, *, dataset_root: Path, output_root: Path, judge: LocalJudgeClient):
+    def __init__(self, *, dataset_root: Path, output_root: Path, judge: LocalJudgeClient, t2i_only: bool = False):
         self.dataset_root = dataset_root
         self.output_root = output_root
         self.judge = judge
+        self.t2i_only = t2i_only
 
     def evaluate(
         self,
@@ -783,13 +822,33 @@ def _evaluate_one(self, data_type: str, sample_dir: Path, sample_spec: GEBenchSa
                 images=judge_images,
             )
         elif data_type in {"type3", "type4"}:
-            frames = [Image.open(sample_dir / f"frame{i}.png").convert("RGB") for i in range(6)]
-            instruction = _text_or_default(metadata.get("instruction") or metadata.get("caption"), "Complete the task.")
-            prompt_suffix, judge_images = _trajectory_judge_payload(frames)
-            raw_scores = self.judge.evaluate(
-                prompt=f"Evaluate a six-frame GUI trajectory.\nInstruction: {instruction}\n{prompt_suffix}",
-                images=judge_images,
-            )
+            if self.t2i_only:
+                frame0_path = sample_dir / "frame0.png"
+                if not frame0_path.exists():
+                    return None
+                generated = Image.open(frame0_path).convert("RGB")
+                instruction = _text_or_default(
+                    metadata.get("instruction") or metadata.get("caption"), "Generate an image."
+                )
+                raw_scores = self.judge.evaluate(
+                    prompt=(
+                        f"Evaluate the quality of this generated image.\n"
+                        f"Instruction: {instruction}\n"
+                        f"Rate how well the image matches the instruction."
+                    ),
+                    images=[generated],
+                    t2i_mode=True,
+                )
+            else:
+                frames = [Image.open(sample_dir / f"frame{i}.png").convert("RGB") for i in range(6)]
+                instruction = _text_or_default(
+                    metadata.get("instruction") or metadata.get("caption"), "Complete the task."
+                )
+                prompt_suffix, judge_images = _trajectory_judge_payload(frames)
+                raw_scores = self.judge.evaluate(
+                    prompt=f"Evaluate a six-frame GUI trajectory.\nInstruction: {instruction}\n{prompt_suffix}",
+                    images=judge_images,
+                )
         elif data_type == "type5":
             source = _resolve_referenced_image(
                 metadata=metadata, sample_path=dataset_sample, dataset_root=self.dataset_root, data_type=data_type
@@ -829,13 +888,19 @@ def build_parser() -> argparse.ArgumentParser:
     generate.add_argument("--api-key", type=str, default="EMPTY")
     generate.add_argument("--width", type=int, default=768)
     generate.add_argument("--height", type=int, default=576)
-    generate.add_argument("--num-inference-steps", type=int, default=8)
+    generate.add_argument("--num-inference-steps", type=int, default=50)
     generate.add_argument("--output-compression", type=int, default=98)
     generate.add_argument("--guidance-scale", type=float, default=None)
     generate.add_argument("--seed", type=int, default=42)
     generate.add_argument("--workers", type=int, default=1)
     generate.add_argument("--max-samples", type=int, default=None)
     generate.add_argument("--samples-per-type", type=int, default=None)
+    generate.add_argument(
+        "--t2i-only",
+        action="store_true",
+        default=False,
+        help="Only generate T2I frame0 for type3/type4, skip IT2I edits and type1/2/5",
+    )
 
     evaluate = subparsers.add_parser("evaluate")
     evaluate.add_argument("--dataset-root", type=Path, required=True)
@@ -847,6 +912,12 @@ def build_parser() -> argparse.ArgumentParser:
     evaluate.add_argument("--workers", type=int, default=1)
     evaluate.add_argument("--max-samples", type=int, default=None)
     evaluate.add_argument("--samples-per-type", type=int, default=None)
+    evaluate.add_argument(
+        "--t2i-only",
+        action="store_true",
+        default=False,
+        help="Only evaluate frame0 for type3/type4 (matches --t2i-only in generate)",
+    )
 
     summarize = subparsers.add_parser("summarize")
     summarize.add_argument("--output-root", type=Path, required=True)
@@ -871,6 +942,7 @@ def main(argv: list[str] | None = None) -> int:
             output_compression=args.output_compression,
             guidance_scale=args.guidance_scale,
             seed=args.seed,
+            t2i_only=args.t2i_only,
         )
         records: list[dict[str, Any]] = []
         for data_type in _data_types_arg(args.data_type):
@@ -892,7 +964,9 @@ def main(argv: list[str] | None = None) -> int:
             api_key=args.judge_api_key,
             model=args.judge_model,
         )
-        evaluator = GEBenchEvaluator(dataset_root=args.dataset_root, output_root=args.output_root, judge=judge)
+        evaluator = GEBenchEvaluator(
+            dataset_root=args.dataset_root, output_root=args.output_root, judge=judge, t2i_only=args.t2i_only
+        )
         combined_results: list[dict[str, Any]] = []
         for data_type in _data_types_arg(args.data_type):
             payload = evaluator.evaluate(
diff --git a/tests/e2e/accuracy/conftest.py b/tests/e2e/accuracy/conftest.py
index a4804fc1980..3328995faf3 100644
--- a/tests/e2e/accuracy/conftest.py
+++ b/tests/e2e/accuracy/conftest.py
@@ -1,5 +1,6 @@
 from __future__ import annotations
 
+import json
 import os
 import subprocess
 from contextlib import contextmanager
@@ -62,6 +63,58 @@ def pytest_addoption(parser):
         default=1200,
         help="Online serving timeout in seconds for Wan2.2 I2V accuracy tests.",
     )
+    group.addoption(
+        "--gebench-devices",
+        action="store",
+        default=None,
+        help="CUDA_VISIBLE_DEVICES for GEBench generate server (e.g. '0,1,2,3'); TP size is derived from device count",
+    )
+    group.addoption(
+        "--gebench-stage-overrides",
+        action="store",
+        default=None,
+        help="JSON string passed to --stage-overrides for GEBench generate server",
+    )
+    group.addoption(
+        "--gebench-extra-server-args",
+        action="store",
+        default=None,
+        help='JSON array of extra CLI args for GEBench generate server (e.g. \'["--dtype","bfloat16"]\')',
+    )
+    group.addoption(
+        "--gebench-num-inference-steps",
+        action="store",
+        type=int,
+        default=50,
+        help="Number of diffusion inference steps for GEBench generate",
+    )
+    group.addoption(
+        "--gebench-t2i-only",
+        action="store_true",
+        default=False,
+        help="Only generate/evaluate T2I frame0 for type3/type4, skip IT2I trajectory",
+    )
+    group.addoption(
+        "--gebench-min-overall",
+        action="store",
+        type=float,
+        default=0.45,
+        help="Minimum overall GEBench mean score for the smoke test",
+    )
+    group.addoption(
+        "--gebench-min-type3",
+        action="store",
+        type=float,
+        default=0.45,
+        help="Minimum GEBench type3 mean score for the smoke test",
+    )
+    group.addoption(
+        "--gebench-min-type4",
+        action="store",
+        type=float,
+        default=0.45,
+        help="Minimum GEBench type4 mean score for the smoke test",
+    )
 
 
 def _hf_cache_root() -> Path:
@@ -173,6 +226,25 @@ def gebench_samples_per_type(request: pytest.FixtureRequest) -> int:
     return int(request.config.getoption("gebench_samples_per_type"))
 
 
+@pytest.fixture(scope="session")
+def gebench_num_inference_steps(request: pytest.FixtureRequest) -> int:
+    return int(request.config.getoption("gebench_num_inference_steps"))
+
+
+@pytest.fixture(scope="session")
+def gebench_t2i_only(request: pytest.FixtureRequest) -> bool:
+    return bool(request.config.getoption("gebench_t2i_only"))
+
+
+@pytest.fixture(scope="session")
+def gebench_min_scores(request: pytest.FixtureRequest) -> dict[str, float]:
+    return {
+        "overall": float(request.config.getoption("gebench_min_overall")),
+        "type3": float(request.config.getoption("gebench_min_type3")),
+        "type4": float(request.config.getoption("gebench_min_type4")),
+    }
+
+
 @pytest.fixture(scope="session")
 def gedit_samples_per_group(request: pytest.FixtureRequest) -> int:
     return int(request.config.getoption("gedit_samples_per_group"))
@@ -229,36 +301,58 @@ def _build_accuracy_server_config(
     port: int,
     run_level: str,
     model_prefix: str,
+    generate_devices: str | None = None,
+    extra_generate_args: list[str] | None = None,
+    stage_init_timeout: int = 300,
+    init_timeout: int | None = None,
 ) -> AccuracyServerConfig:
     if torch.accelerator.device_count() < 1:
         pytest.skip("Need at least 1 CUDA GPU for accuracy benchmark smoke tests.")
     if not generate_model:
         pytest.skip("No generate model configured for accuracy benchmark test.")
-    generate_server_args = ["--num-gpus", "1"]
+
+    devices = generate_devices or shared_gpu
+    num_devices = len([d for d in devices.split(",") if d.strip()])
+    if torch.accelerator.device_count() < num_devices:
+        pytest.skip(f"Need at least {num_devices} CUDA GPUs for this accuracy benchmark.")
+
+    if extra_generate_args is not None:
+        has_gpu_allocation_arg = any(
+            arg in {"--tensor-parallel-size", "--num-gpus"}
+            or arg.startswith("--tensor-parallel-size=")
+            or arg.startswith("--num-gpus=")
+            for arg in extra_generate_args
+        )
+        if not has_gpu_allocation_arg:
+            raise ValueError("extra_generate_args must include --tensor-parallel-size or --num-gpus")
+    generate_server_args = extra_generate_args if extra_generate_args is not None else ["--num-gpus", "1"]
 
     judge_server_args = [
         "--max-model-len",
         "32768",
         "--gpu-memory-utilization",
         "0.8",
+        "--enforce-eager",
     ]
 
-    judge_env = {"CUDA_VISIBLE_DEVICES": shared_gpu}
+    generate_params_kwargs: dict = dict(
+        model=generate_model,
+        port=port,
+        server_args=generate_server_args,
+        env_dict={"CUDA_VISIBLE_DEVICES": devices},
+        use_omni=True,
+        stage_init_timeout=stage_init_timeout,
+    )
+    if init_timeout is not None:
+        generate_params_kwargs["init_timeout"] = init_timeout
 
     return AccuracyServerConfig(
-        generate_params=OmniServerParams(
-            model=generate_model,
-            port=port,
-            server_args=generate_server_args,
-            env_dict={"CUDA_VISIBLE_DEVICES": shared_gpu},
-            use_omni=True,
-            stage_init_timeout=300,
-        ),
+        generate_params=OmniServerParams(**generate_params_kwargs),
         judge_params=OmniServerParams(
             model=judge_model,
             port=port,
             server_args=judge_server_args,
-            env_dict=judge_env,
+            env_dict={"CUDA_VISIBLE_DEVICES": shared_gpu},
             use_omni=False,
         ),
         run_level=run_level,
@@ -272,6 +366,30 @@ def gebench_accuracy_servers(
     run_level: str,
     model_prefix: str,
 ) -> AccuracyServerConfig:
+    """Assemble the GEBench accuracy server config from pytest command-line options."""
+    devices_opt: str | None = request.config.getoption("gebench_devices")
+    stage_overrides: str | None = request.config.getoption("gebench_stage_overrides")
+    extra_args_json: str | None = request.config.getoption("gebench_extra_server_args")
+
+    extra_args: list[str] | None = None
+    stage_init_timeout = 300
+    init_timeout: int | None = None
+
+    # Stage overrides and extra server args are only applied when --gebench-devices is set.
+    if devices_opt:
+        num_devices = len([d for d in devices_opt.split(",") if d.strip()])
+        extra_args = ["--tensor-parallel-size", str(num_devices)]
+        if stage_overrides:
+            extra_args += ["--stage-overrides", stage_overrides]
+        if extra_args_json:
+            parsed = json.loads(extra_args_json)
+            # `+=` would splice a JSON string in character by character, so insist on an array of strings.
+            if not isinstance(parsed, list) or not all(isinstance(arg, str) for arg in parsed):
+                raise ValueError("--gebench-extra-server-args must be a JSON array of strings")
+            extra_args += parsed
+        stage_init_timeout = 600
+        init_timeout = 1800
+
     return _build_accuracy_server_config(
         generate_model=request.config.getoption("gebench_model"),
         judge_model=request.config.getoption("accuracy_judge_model"),
@@ -279,6 +397,10 @@ def gebench_accuracy_servers(
         port=int(request.config.getoption("gebench_port")),
         run_level=run_level,
         model_prefix=model_prefix,
+        generate_devices=devices_opt,
+        extra_generate_args=extra_args,
+        stage_init_timeout=stage_init_timeout,
+        init_timeout=init_timeout,
     )
 
 
diff --git a/tests/e2e/accuracy/test_gebench_h100_smoke.py b/tests/e2e/accuracy/test_gebench_h100_smoke.py
index 2702710e4a2..6bb8f2c3bc2 100644
--- a/tests/e2e/accuracy/test_gebench_h100_smoke.py
+++ b/tests/e2e/accuracy/test_gebench_h100_smoke.py
@@ -18,11 +18,17 @@ def test_gebench_h100_smoke(
     gebench_accuracy_servers,
     accuracy_artifact_root: Path,
     gebench_dataset_root: Path,
+    gebench_samples_per_type: int,
+    gebench_num_inference_steps: int,
     accuracy_workers: int,
+    gebench_t2i_only: bool,
+    gebench_min_scores: dict[str, float],
 ) -> None:
     model_label = infer_model_label(gebench_accuracy_servers.generate_params.model).lower()
     output_root = reset_artifact_dir(accuracy_artifact_root / f"gebench_{model_label}")
 
+    t2i_flag = ["--t2i-only"] if gebench_t2i_only else []
+
     with gebench_accuracy_servers.generate_server() as generate_server:
         for data_type in ("type3", "type4"):
             assert (
@@ -46,9 +52,12 @@ def test_gebench_h100_smoke(
                         "--output-compression",
                         "98",
                         "--num-inference-steps",
-                        "8",
+                        str(gebench_num_inference_steps),
                         "--workers",
                         str(accuracy_workers),
+                        "--samples-per-type",
+                        str(gebench_samples_per_type),
+                        *t2i_flag,
                     ]
                 )
                 == 0
@@ -74,6 +83,7 @@ def test_gebench_h100_smoke(
                     "EMPTY",
                     "--workers",
                     str(accuracy_workers),
+                    *t2i_flag,
                 ]
             )
             == 0
@@ -91,6 +101,6 @@ def test_gebench_h100_smoke(
         assert data_type in summary["evaluation"]["by_type"]
         assert summary["evaluation"]["by_type"][data_type]["count"] > 0
 
-    assert summary["evaluation"]["overall_mean"] >= 0.45
-    assert summary["evaluation"]["by_type"]["type3"]["overall_mean"] >= 0.45
-    assert summary["evaluation"]["by_type"]["type4"]["overall_mean"] >= 0.45
+    assert summary["evaluation"]["overall_mean"] >= gebench_min_scores["overall"]
+    assert summary["evaluation"]["by_type"]["type3"]["overall_mean"] >= gebench_min_scores["type3"]
+    assert summary["evaluation"]["by_type"]["type4"]["overall_mean"] >= gebench_min_scores["type4"]