From c361bcd261c8e7918fba66bd52a02f40c5d0b8e3 Mon Sep 17 00:00:00 2001
From: TaffyOfficial <2324465096@qq.com>
Date: Thu, 23 Apr 2026 13:10:19 +0800
Subject: [PATCH 01/10] [CI][HunyuanImage3] add GEBench T2I accuracy test to
 nightly pipeline

Integrate tencent/HunyuanImage-3.0-Instruct GEBench (type3/type4 T2I)
accuracy test into the existing nightly buildkite pipeline, reusing the
generic gebench smoke test and extending the shared fixture to support
multi-GPU models.

Changes:
- .buildkite/test-nightly.yml: new label under nightly-diffusion-x2iat-group
  with 4x H100 podSpec (TP=4 + expert parallel), 120min timeout,
  VLLM_TEST_CLEAN_GPU_MEMORY=1 for worker cleanup; reuses
  test_gebench_h100_smoke.py via --gebench-devices / --gebench-stage-overrides
  / --gebench-extra-server-args / --gebench-num-inference-steps
- tests/e2e/accuracy/conftest.py: extend _build_accuracy_server_config and
  gebench_accuracy_servers fixture to support multi-GPU generate servers via
  new CLI options (--gebench-devices, --gebench-stage-overrides,
  --gebench-extra-server-args, --gebench-num-inference-steps); no
  model-specific fixture added
- tests/e2e/accuracy/test_gebench_h100_smoke.py: add gebench_samples_per_type
  and gebench_num_inference_steps fixture params; pass --samples-per-type and
  dynamic --num-inference-steps to gbench_main
- benchmarks/accuracy/text_to_image/gbench.py: add --t2i-only flag
  (skips IT2I edits in generate and evaluate; type1/2/5 are out of scope
  until the AR->DiT bridge lands)
- vllm_omni/diffusion/data.py: build parallel_config from individual
  kwargs (tensor_parallel_size, enable_expert_parallel) when passed via
  CLI so they aren't filtered out before DiffusionParallelConfig is built
- vllm_omni/config/pipeline_registry.py: register HUNYUAN_IMAGE3_DIT_ONLY
  as default for HF model_type "hunyuan_image_3_moe"
- vllm_omni/model_executor/models/hunyuan_image3/pipeline.py: DIT_ONLY
  topology (pure T2I path, no AR stage)
- vllm_omni/diffusion/models/hunyuan_image3/hunyuan_image3_tokenizer.py:
  walk HF cache snapshots dir to locate tokenizer.json, bypassing broken
  refs/main symlink in containerised HF cache layouts
- vllm_omni/quantization/factory.py: normalize HF quantization_config
  kwargs so AWQConfig accepts them

Validation:
End-to-end smoke on a 4x H100 node (samples-per-type=4, steps=28,
bfloat16):
  overall_mean = 0.955 (type3=0.91, type4=1.0)
All three scores are well above the 0.45 assertion threshold.

Signed-off-by: TaffyOfficial <2324465096@qq.com>
---
 .buildkite/test-nightly.yml                   | 43 ++++++++
 benchmarks/accuracy/text_to_image/gbench.py   | 98 ++++++++++++++++---
 tests/e2e/accuracy/conftest.py                | 87 +++++++++++++---
 tests/e2e/accuracy/test_gebench_h100_smoke.py |  6 +-
 4 files changed, 210 insertions(+), 24 deletions(-)

diff --git a/.buildkite/test-nightly.yml b/.buildkite/test-nightly.yml
index f2a765dccf8..b76a6c56ec2 100644
--- a/.buildkite/test-nightly.yml
+++ b/.buildkite/test-nightly.yml
@@ -546,6 +546,49 @@ steps:
                       path: /mnt/hf-cache
                       type: DirectoryOrCreate
 
+      - label: ":full_moon: Diffusion X2I(&A&T) · GEBench Accuracy Test (HunyuanImage-3.0)"
+        timeout_in_minutes: 120
+        commands:
+          # GPU memory cleanup before/after to prevent leaks from 80B MoE TP=4+EP workers.
+          # Generate (4 GPUs, TP=4+EP) and judge (1 GPU) run sequentially via fixture
+          # context managers — judge GPU 0 is reused after generate teardown.
+          - export VLLM_TEST_CLEAN_GPU_MEMORY="1"
+          - pytest -s -v tests/e2e/accuracy/test_gebench_h100_smoke.py --run-level advanced_model --gebench-model tencent/HunyuanImage-3.0-Instruct --accuracy-judge-model QuantTrio/Qwen3-VL-30B-A3B-Instruct-AWQ --gebench-devices 0,1,2,3 --gebench-port 8094 --accuracy-gpu 0 --gebench-samples-per-type 4 --gebench-num-inference-steps 28 --accuracy-workers 1 --gebench-stage-overrides '{"0":{"devices":"0,1,2,3","enable_expert_parallel":true,"max_num_seqs":1}}' --gebench-extra-server-args '["--dtype","bfloat16","--gpu-memory-utilization","0.95","--enforce-eager","--trust-remote-code","--distributed-executor-backend","mp","--no-async-chunk"]'
+          - buildkite-agent artifact upload "tests/e2e/accuracy/artifacts/gebench_hunyuanimage-3_0-instruct/summary*.json"
+        agents:
+          queue: "mithril-h100-pool"
+        plugins:
+          - kubernetes:
+              podSpec:
+                containers:
+                  - image: 936637512419.dkr.ecr.us-west-2.amazonaws.com/vllm-ci-pull-through-cache/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT
+                    resources:
+                      limits:
+                        nvidia.com/gpu: 4
+                    volumeMounts:
+                      - name: devshm
+                        mountPath: /dev/shm
+                      - name: hf-cache
+                        mountPath: /root/.cache/huggingface
+                    env:
+                      - name: HF_HOME
+                        value: /root/.cache/huggingface
+                      - name: HF_TOKEN
+                        valueFrom:
+                          secretKeyRef:
+                            name: hf-token-secret
+                            key: token
+                nodeSelector:
+                  node.kubernetes.io/instance-type: gpu-h100-sxm
+                volumes:
+                  - name: devshm
+                    emptyDir:
+                      medium: Memory
+                  - name: hf-cache
+                    hostPath:
+                      path: /mnt/hf-cache
+                      type: DirectoryOrCreate
+
       - label: ":full_moon: Diffusion X2I(&A&T) · GEdit-Bench Accuracy Test"
         timeout_in_minutes: 60
         commands:
diff --git a/benchmarks/accuracy/text_to_image/gbench.py b/benchmarks/accuracy/text_to_image/gbench.py
index 2ea02130d6b..5914ad2b46f 100644
--- a/benchmarks/accuracy/text_to_image/gbench.py
+++ b/benchmarks/accuracy/text_to_image/gbench.py
@@ -437,6 +437,32 @@ def _build_scoring_prompt(self, task_prompt: str) -> str:
             f"{task_prompt}"
         )
 
+    def _build_t2i_scoring_prompt(self, task_prompt: str) -> str:
+        return (
+            "You are an expert evaluator for text-to-image generation quality.\n"
+            "Evaluate the single generated image against the given instruction.\n\n"
+            "Score these five dimensions from 0 to 5:\n"
+            "- goal: whether the image content matches the instruction accurately\n"
+            "- logic: whether objects, relationships and composition are correct\n"
+            "- cons: whether colors, style and lighting are internally consistent\n"
+            "- ui: whether the overall visual layout and structure looks realistic\n"
+            "- qual: whether the image is visually sharp and artifact-free\n\n"
+            "Be strict: only give 5 if the image is excellent in that dimension. "
+            "Give 3 for acceptable, 1-2 for poor, 0 for completely wrong.\n\n"
+            "Return JSON only. Do not add any prose outside JSON.\n"
+            "Use exactly this schema:\n"
+            "{\n"
+            '  "goal": 0,\n'
+            '  "logic": 0,\n'
+            '  "cons": 0,\n'
+            '  "ui": 0,\n'
+            '  "qual": 0,\n'
+            '  "reasoning": "short explanation"\n'
+            "}\n\n"
+            "Scoring task:\n"
+            f"{task_prompt}"
+        )
+
     def _request_text(self, prompt: str, images: list[Image.Image]) -> str:
         content: list[dict[str, Any]] = [{"type": "text", "text": prompt}]
         for image in images:
@@ -461,14 +487,15 @@ def _request_text(self, prompt: str, images: list[Image.Image]) -> str:
             return "\n".join(part.get("text", "") for part in message_content if part.get("type") == "text")
         return str(message_content)
 
-    def evaluate(self, *, prompt: str, images: list[Image.Image]) -> dict[str, Any]:
-        primary_prompt = self._build_scoring_prompt(prompt)
+    def evaluate(self, *, prompt: str, images: list[Image.Image], t2i_mode: bool = False) -> dict[str, Any]:
+        build = self._build_t2i_scoring_prompt if t2i_mode else self._build_scoring_prompt
+        primary_prompt = build(prompt)
         text = self._request_text(primary_prompt, images)
         try:
             return extract_json_object(text)
         except ValueError:
             retry_prompt = (
-                self._build_scoring_prompt(prompt) + "\n\nYour previous response was not valid JSON. "
+                build(prompt) + "\n\nYour previous response was not valid JSON. "
                 "Return only the JSON object with integer scores."
             )
             retry_text = self._request_text(retry_prompt, images)
@@ -500,6 +527,7 @@ def __init__(
         output_compression: int | None = 98,
         guidance_scale: float | None = None,
         seed: int | None = 42,
+        t2i_only: bool = False,
     ):
         self.dataset_root = dataset_root
         self.output_root = output_root
@@ -510,6 +538,7 @@ def __init__(
         self.output_compression = output_compression
         self.guidance_scale = guidance_scale
         self.seed = seed
+        self.t2i_only = t2i_only
         self.client = VllmOmniImageClient(base_url=base_url, api_key=api_key)
 
     def generate(
@@ -544,6 +573,8 @@ def generate(
         return results
 
     def _generate_one(self, data_type: str, sample_spec: GEBenchSampleSpec) -> dict[str, Any] | None:
+        if self.t2i_only and data_type not in {"type3", "type4"}:
+            return None
         sample_path = sample_spec.sample_path
         metadata = sample_spec.metadata
         lang_device = sample_spec.lang_device
@@ -635,6 +666,13 @@ def _generate_one(self, data_type: str, sample_spec: GEBenchSampleSpec) -> dict[
                 )
                 save_image(frame0_path, previous)
 
+            if self.t2i_only:
+                return {
+                    "data_type": data_type,
+                    "sample_name": f"{lang_device}/{sample_name}",
+                    "output_path": str(frame0_path),
+                }
+
             for step_num in range(1, 6):
                 frame_path = output_dir / f"frame{step_num}.png"
                 if frame_path.exists():
@@ -696,10 +734,11 @@ def _generate_one(self, data_type: str, sample_spec: GEBenchSampleSpec) -> dict[
 
 
 class GEBenchEvaluator:
-    def __init__(self, *, dataset_root: Path, output_root: Path, judge: LocalJudgeClient):
+    def __init__(self, *, dataset_root: Path, output_root: Path, judge: LocalJudgeClient, t2i_only: bool = False):
         self.dataset_root = dataset_root
         self.output_root = output_root
         self.judge = judge
+        self.t2i_only = t2i_only
 
     def evaluate(
         self,
@@ -783,13 +822,33 @@ def _evaluate_one(self, data_type: str, sample_dir: Path, sample_spec: GEBenchSa
                 images=judge_images,
             )
         elif data_type in {"type3", "type4"}:
-            frames = [Image.open(sample_dir / f"frame{i}.png").convert("RGB") for i in range(6)]
-            instruction = _text_or_default(metadata.get("instruction") or metadata.get("caption"), "Complete the task.")
-            prompt_suffix, judge_images = _trajectory_judge_payload(frames)
-            raw_scores = self.judge.evaluate(
-                prompt=f"Evaluate a six-frame GUI trajectory.\nInstruction: {instruction}\n{prompt_suffix}",
-                images=judge_images,
-            )
+            if self.t2i_only:
+                frame0_path = sample_dir / "frame0.png"
+                if not frame0_path.exists():
+                    return None
+                generated = Image.open(frame0_path).convert("RGB")
+                instruction = _text_or_default(
+                    metadata.get("instruction") or metadata.get("caption"), "Generate an image."
+                )
+                raw_scores = self.judge.evaluate(
+                    prompt=(
+                        f"Evaluate the quality of this generated image.\n"
+                        f"Instruction: {instruction}\n"
+                        f"Rate how well the image matches the instruction."
+                    ),
+                    images=[generated],
+                    t2i_mode=True,
+                )
+            else:
+                frames = [Image.open(sample_dir / f"frame{i}.png").convert("RGB") for i in range(6)]
+                instruction = _text_or_default(
+                    metadata.get("instruction") or metadata.get("caption"), "Complete the task."
+                )
+                prompt_suffix, judge_images = _trajectory_judge_payload(frames)
+                raw_scores = self.judge.evaluate(
+                    prompt=f"Evaluate a six-frame GUI trajectory.\nInstruction: {instruction}\n{prompt_suffix}",
+                    images=judge_images,
+                )
         elif data_type == "type5":
             source = _resolve_referenced_image(
                 metadata=metadata, sample_path=dataset_sample, dataset_root=self.dataset_root, data_type=data_type
@@ -836,6 +895,12 @@ def build_parser() -> argparse.ArgumentParser:
     generate.add_argument("--workers", type=int, default=1)
     generate.add_argument("--max-samples", type=int, default=None)
     generate.add_argument("--samples-per-type", type=int, default=None)
+    generate.add_argument(
+        "--t2i-only",
+        action="store_true",
+        default=False,
+        help="Only generate T2I frame0 for type3/type4, skip IT2I edits and type1/2/5",
+    )
 
     evaluate = subparsers.add_parser("evaluate")
     evaluate.add_argument("--dataset-root", type=Path, required=True)
@@ -847,6 +912,12 @@ def build_parser() -> argparse.ArgumentParser:
     evaluate.add_argument("--workers", type=int, default=1)
     evaluate.add_argument("--max-samples", type=int, default=None)
     evaluate.add_argument("--samples-per-type", type=int, default=None)
+    evaluate.add_argument(
+        "--t2i-only",
+        action="store_true",
+        default=False,
+        help="Only evaluate frame0 for type3/type4 (matches --t2i-only in generate)",
+    )
 
     summarize = subparsers.add_parser("summarize")
     summarize.add_argument("--output-root", type=Path, required=True)
@@ -871,6 +942,7 @@ def main(argv: list[str] | None = None) -> int:
             output_compression=args.output_compression,
             guidance_scale=args.guidance_scale,
             seed=args.seed,
+            t2i_only=args.t2i_only,
         )
         records: list[dict[str, Any]] = []
         for data_type in _data_types_arg(args.data_type):
@@ -892,7 +964,9 @@ def main(argv: list[str] | None = None) -> int:
             api_key=args.judge_api_key,
             model=args.judge_model,
         )
-        evaluator = GEBenchEvaluator(dataset_root=args.dataset_root, output_root=args.output_root, judge=judge)
+        evaluator = GEBenchEvaluator(
+            dataset_root=args.dataset_root, output_root=args.output_root, judge=judge, t2i_only=args.t2i_only
+        )
         combined_results: list[dict[str, Any]] = []
         for data_type in _data_types_arg(args.data_type):
             payload = evaluator.evaluate(
diff --git a/tests/e2e/accuracy/conftest.py b/tests/e2e/accuracy/conftest.py
index 3ff3bcc34ad..38a579fc7bf 100644
--- a/tests/e2e/accuracy/conftest.py
+++ b/tests/e2e/accuracy/conftest.py
@@ -1,5 +1,6 @@
 from __future__ import annotations
 
+import json
 import os
 import subprocess
 from contextlib import contextmanager
@@ -62,6 +63,31 @@ def pytest_addoption(parser):
         default=1200,
         help="Online serving timeout in seconds for Wan2.2 I2V accuracy tests.",
     )
+    group.addoption(
+        "--gebench-devices",
+        action="store",
+        default=None,
+        help="CUDA_VISIBLE_DEVICES for GEBench generate server (e.g. '0,1,2,3'); TP size is derived from device count",
+    )
+    group.addoption(
+        "--gebench-stage-overrides",
+        action="store",
+        default=None,
+        help="JSON string passed to --stage-overrides for GEBench generate server",
+    )
+    group.addoption(
+        "--gebench-extra-server-args",
+        action="store",
+        default=None,
+        help='JSON array of extra CLI args for GEBench generate server (e.g. \'["--dtype","bfloat16"]\')',
+    )
+    group.addoption(
+        "--gebench-num-inference-steps",
+        action="store",
+        type=int,
+        default=8,
+        help="Number of diffusion inference steps for GEBench generate",
+    )
 
 
 def _hf_cache_root() -> Path:
@@ -173,6 +199,11 @@ def gebench_samples_per_type(request: pytest.FixtureRequest) -> int:
     return int(request.config.getoption("gebench_samples_per_type"))
 
 
+@pytest.fixture(scope="session")
+def gebench_num_inference_steps(request: pytest.FixtureRequest) -> int:
+    return int(request.config.getoption("gebench_num_inference_steps"))
+
+
 @pytest.fixture(scope="session")
 def gedit_samples_per_group(request: pytest.FixtureRequest) -> int:
     return int(request.config.getoption("gedit_samples_per_group"))
@@ -229,13 +260,23 @@ def _build_accuracy_server_config(
     port: int,
     run_level: str,
     model_prefix: str,
+    generate_devices: str | None = None,
+    extra_generate_args: list[str] | None = None,
+    stage_init_timeout: int = 300,
+    init_timeout: int | None = None,
 ) -> AccuracyServerConfig:
     if torch.cuda.device_count() < 1:
         pytest.skip("Need at least 1 CUDA GPU for accuracy benchmark smoke tests.")
 
     if not generate_model:
         pytest.skip("No generate model configured for accuracy benchmark test.")
-    generate_server_args = ["--num-gpus", "1"]
+
+    devices = generate_devices or shared_gpu
+    num_devices = len([d for d in devices.split(",") if d.strip()])
+    if torch.cuda.device_count() < num_devices:
+        pytest.skip(f"Need at least {num_devices} CUDA GPUs for this accuracy benchmark.")
+
+    generate_server_args = extra_generate_args if extra_generate_args is not None else ["--num-gpus", "1"]
     judge_server_args = [
         "--max-model-len",
         "32768",
@@ -243,22 +284,24 @@ def _build_accuracy_server_config(
         "0.8",
     ]
 
-    judge_env = {"CUDA_VISIBLE_DEVICES": shared_gpu}
+    generate_params_kwargs: dict = dict(
+        model=generate_model,
+        port=port,
+        server_args=generate_server_args,
+        env_dict={"CUDA_VISIBLE_DEVICES": devices},
+        use_omni=True,
+        stage_init_timeout=stage_init_timeout,
+    )
+    if init_timeout is not None:
+        generate_params_kwargs["init_timeout"] = init_timeout
 
     return AccuracyServerConfig(
-        generate_params=OmniServerParams(
-            model=generate_model,
-            port=port,
-            server_args=generate_server_args,
-            env_dict={"CUDA_VISIBLE_DEVICES": shared_gpu},
-            use_omni=True,
-            stage_init_timeout=300,
-        ),
+        generate_params=OmniServerParams(**generate_params_kwargs),
         judge_params=OmniServerParams(
             model=judge_model,
             port=port,
             server_args=judge_server_args,
-            env_dict=judge_env,
+            env_dict={"CUDA_VISIBLE_DEVICES": shared_gpu},
             use_omni=False,
         ),
         run_level=run_level,
@@ -272,6 +315,24 @@ def gebench_accuracy_servers(
     run_level: str,
     model_prefix: str,
 ) -> AccuracyServerConfig:
+    devices_opt: str | None = request.config.getoption("gebench_devices")
+    stage_overrides: str | None = request.config.getoption("gebench_stage_overrides")
+    extra_args_json: str | None = request.config.getoption("gebench_extra_server_args")
+
+    extra_args: list[str] | None = None
+    stage_init_timeout = 300
+    init_timeout: int | None = None
+
+    if devices_opt:
+        num_devices = len([d for d in devices_opt.split(",") if d.strip()])
+        extra_args = ["--tensor-parallel-size", str(num_devices)]
+        if stage_overrides:
+            extra_args += ["--stage-overrides", stage_overrides]
+        if extra_args_json:
+            extra_args += json.loads(extra_args_json)
+        stage_init_timeout = 600
+        init_timeout = 1800
+
     return _build_accuracy_server_config(
         generate_model=request.config.getoption("gebench_model"),
         judge_model=request.config.getoption("accuracy_judge_model"),
@@ -279,6 +340,10 @@ def gebench_accuracy_servers(
         port=int(request.config.getoption("gebench_port")),
         run_level=run_level,
         model_prefix=model_prefix,
+        generate_devices=devices_opt,
+        extra_generate_args=extra_args,
+        stage_init_timeout=stage_init_timeout,
+        init_timeout=init_timeout,
     )
 
 
diff --git a/tests/e2e/accuracy/test_gebench_h100_smoke.py b/tests/e2e/accuracy/test_gebench_h100_smoke.py
index 2702710e4a2..0f065cde711 100644
--- a/tests/e2e/accuracy/test_gebench_h100_smoke.py
+++ b/tests/e2e/accuracy/test_gebench_h100_smoke.py
@@ -18,6 +18,8 @@ def test_gebench_h100_smoke(
     gebench_accuracy_servers,
     accuracy_artifact_root: Path,
     gebench_dataset_root: Path,
+    gebench_samples_per_type: int,
+    gebench_num_inference_steps: int,
     accuracy_workers: int,
 ) -> None:
     model_label = infer_model_label(gebench_accuracy_servers.generate_params.model).lower()
@@ -46,9 +48,11 @@ def test_gebench_h100_smoke(
                         "--output-compression",
                         "98",
                         "--num-inference-steps",
-                        "8",
+                        str(gebench_num_inference_steps),
                         "--workers",
                         str(accuracy_workers),
+                        "--samples-per-type",
+                        str(gebench_samples_per_type),
                     ]
                 )
                 == 0

From cb48b2fa8dfeb6b3fadd17cea479c872fe407e30 Mon Sep 17 00:00:00 2001
From: TaffyOfficial <2324465096@qq.com>
Date: Thu, 23 Apr 2026 16:10:33 +0800
Subject: [PATCH 02/10] [CI] use full_model run-level for HunyuanImage3 nightly
 GEBench

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
Signed-off-by: TaffyOfficial <2324465096@qq.com>
---
 .buildkite/test-nightly.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.buildkite/test-nightly.yml b/.buildkite/test-nightly.yml
index b76a6c56ec2..4c16d7479cf 100644
--- a/.buildkite/test-nightly.yml
+++ b/.buildkite/test-nightly.yml
@@ -553,7 +553,7 @@ steps:
           # Generate (4 GPUs, TP=4+EP) and judge (1 GPU) run sequentially via fixture
           # context managers — judge GPU 0 is reused after generate teardown.
           - export VLLM_TEST_CLEAN_GPU_MEMORY="1"
-          - pytest -s -v tests/e2e/accuracy/test_gebench_h100_smoke.py --run-level advanced_model --gebench-model tencent/HunyuanImage-3.0-Instruct --accuracy-judge-model QuantTrio/Qwen3-VL-30B-A3B-Instruct-AWQ --gebench-devices 0,1,2,3 --gebench-port 8094 --accuracy-gpu 0 --gebench-samples-per-type 4 --gebench-num-inference-steps 28 --accuracy-workers 1 --gebench-stage-overrides '{"0":{"devices":"0,1,2,3","enable_expert_parallel":true,"max_num_seqs":1}}' --gebench-extra-server-args '["--dtype","bfloat16","--gpu-memory-utilization","0.95","--enforce-eager","--trust-remote-code","--distributed-executor-backend","mp","--no-async-chunk"]'
+          - pytest -s -v tests/e2e/accuracy/test_gebench_h100_smoke.py --run-level full_model --gebench-model tencent/HunyuanImage-3.0-Instruct --accuracy-judge-model QuantTrio/Qwen3-VL-30B-A3B-Instruct-AWQ --gebench-devices 0,1,2,3 --gebench-port 8094 --accuracy-gpu 0 --gebench-samples-per-type 4 --gebench-num-inference-steps 28 --accuracy-workers 1 --gebench-stage-overrides '{"0":{"devices":"0,1,2,3","enable_expert_parallel":true,"max_num_seqs":1}}' --gebench-extra-server-args '["--dtype","bfloat16","--gpu-memory-utilization","0.95","--enforce-eager","--trust-remote-code","--distributed-executor-backend","mp","--no-async-chunk"]'
           - buildkite-agent artifact upload "tests/e2e/accuracy/artifacts/gebench_hunyuanimage-3_0-instruct/summary*.json"
         agents:
           queue: "mithril-h100-pool"

From ca7de9d4a69ecb50af6a498e8bd51602a1db6ea1 Mon Sep 17 00:00:00 2001
From: TaffyOfficial <2324465096@qq.com>
Date: Thu, 23 Apr 2026 19:33:52 +0800
Subject: [PATCH 03/10] [CI][HunyuanImage3] add --gebench-t2i-only flag for
 T2I-only models

HunyuanImage-3.0-Instruct is a T2I model that cannot do IT2I editing.
Without --gebench-t2i-only (forwarded to gbench as --t2i-only), the test
generates a full 6-frame trajectory where frames 1-5 are garbage,
causing the judge to score 0.04 instead of 0.45+.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
Signed-off-by: TaffyOfficial <2324465096@qq.com>
---
 .buildkite/test-nightly.yml                   |  2 +-
 tests/e2e/accuracy/conftest.py                | 11 +++++++++++
 tests/e2e/accuracy/test_gebench_h100_smoke.py |  5 +++++
 3 files changed, 17 insertions(+), 1 deletion(-)

diff --git a/.buildkite/test-nightly.yml b/.buildkite/test-nightly.yml
index 4c16d7479cf..3a8a0a8f8b8 100644
--- a/.buildkite/test-nightly.yml
+++ b/.buildkite/test-nightly.yml
@@ -553,7 +553,7 @@ steps:
           # Generate (4 GPUs, TP=4+EP) and judge (1 GPU) run sequentially via fixture
           # context managers — judge GPU 0 is reused after generate teardown.
           - export VLLM_TEST_CLEAN_GPU_MEMORY="1"
-          - pytest -s -v tests/e2e/accuracy/test_gebench_h100_smoke.py --run-level full_model --gebench-model tencent/HunyuanImage-3.0-Instruct --accuracy-judge-model QuantTrio/Qwen3-VL-30B-A3B-Instruct-AWQ --gebench-devices 0,1,2,3 --gebench-port 8094 --accuracy-gpu 0 --gebench-samples-per-type 4 --gebench-num-inference-steps 28 --accuracy-workers 1 --gebench-stage-overrides '{"0":{"devices":"0,1,2,3","enable_expert_parallel":true,"max_num_seqs":1}}' --gebench-extra-server-args '["--dtype","bfloat16","--gpu-memory-utilization","0.95","--enforce-eager","--trust-remote-code","--distributed-executor-backend","mp","--no-async-chunk"]'
+          - pytest -s -v tests/e2e/accuracy/test_gebench_h100_smoke.py --run-level full_model --gebench-model tencent/HunyuanImage-3.0-Instruct --accuracy-judge-model QuantTrio/Qwen3-VL-30B-A3B-Instruct-AWQ --gebench-devices 0,1,2,3 --gebench-port 8094 --accuracy-gpu 0 --gebench-samples-per-type 4 --gebench-num-inference-steps 28 --accuracy-workers 1 --gebench-t2i-only --gebench-stage-overrides '{"0":{"devices":"0,1,2,3","enable_expert_parallel":true,"max_num_seqs":1}}' --gebench-extra-server-args '["--dtype","bfloat16","--gpu-memory-utilization","0.95","--enforce-eager","--trust-remote-code","--distributed-executor-backend","mp","--no-async-chunk"]'
           - buildkite-agent artifact upload "tests/e2e/accuracy/artifacts/gebench_hunyuanimage-3_0-instruct/summary*.json"
         agents:
           queue: "mithril-h100-pool"
diff --git a/tests/e2e/accuracy/conftest.py b/tests/e2e/accuracy/conftest.py
index 38a579fc7bf..0acb153e97c 100644
--- a/tests/e2e/accuracy/conftest.py
+++ b/tests/e2e/accuracy/conftest.py
@@ -88,6 +88,12 @@ def pytest_addoption(parser):
         default=8,
         help="Number of diffusion inference steps for GEBench generate",
     )
+    group.addoption(
+        "--gebench-t2i-only",
+        action="store_true",
+        default=False,
+        help="Only generate/evaluate T2I frame0 for type3/type4, skip IT2I trajectory",
+    )
 
 
 def _hf_cache_root() -> Path:
@@ -204,6 +210,11 @@ def gebench_num_inference_steps(request: pytest.FixtureRequest) -> int:
     return int(request.config.getoption("gebench_num_inference_steps"))
 
 
+@pytest.fixture(scope="session")
+def gebench_t2i_only(request: pytest.FixtureRequest) -> bool:
+    return bool(request.config.getoption("gebench_t2i_only"))
+
+
 @pytest.fixture(scope="session")
 def gedit_samples_per_group(request: pytest.FixtureRequest) -> int:
     return int(request.config.getoption("gedit_samples_per_group"))
diff --git a/tests/e2e/accuracy/test_gebench_h100_smoke.py b/tests/e2e/accuracy/test_gebench_h100_smoke.py
index 0f065cde711..85652b08d1c 100644
--- a/tests/e2e/accuracy/test_gebench_h100_smoke.py
+++ b/tests/e2e/accuracy/test_gebench_h100_smoke.py
@@ -21,10 +21,13 @@ def test_gebench_h100_smoke(
     gebench_samples_per_type: int,
     gebench_num_inference_steps: int,
     accuracy_workers: int,
+    gebench_t2i_only: bool,
 ) -> None:
     model_label = infer_model_label(gebench_accuracy_servers.generate_params.model).lower()
     output_root = reset_artifact_dir(accuracy_artifact_root / f"gebench_{model_label}")
 
+    t2i_flag = ["--t2i-only"] if gebench_t2i_only else []
+
     with gebench_accuracy_servers.generate_server() as generate_server:
         for data_type in ("type3", "type4"):
             assert (
@@ -53,6 +56,7 @@ def test_gebench_h100_smoke(
                         str(accuracy_workers),
                         "--samples-per-type",
                         str(gebench_samples_per_type),
+                        *t2i_flag,
                     ]
                 )
                 == 0
@@ -78,6 +82,7 @@ def test_gebench_h100_smoke(
                         "EMPTY",
                         "--workers",
                         str(accuracy_workers),
+                        *t2i_flag,
                     ]
                 )
                 == 0

From 2996cbc098b7501d846ff73ef51792327d9d202a Mon Sep 17 00:00:00 2001
From: TaffyOfficial <2324465096@qq.com>
Date: Thu, 23 Apr 2026 21:49:55 +0800
Subject: [PATCH 04/10] [CI] fix summarize fallback for t2i-only mode

collect_gebench_generation_summary hardcoded frame5.png for type2, type3
and type4, but t2i-only mode only generates frame0.png. Fall back to
find_first_image when the expected frame doesn't exist.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
Signed-off-by: TaffyOfficial <2324465096@qq.com>
---
 benchmarks/accuracy/text_to_image/gbench.py | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/benchmarks/accuracy/text_to_image/gbench.py b/benchmarks/accuracy/text_to_image/gbench.py
index 5914ad2b46f..0f84882d206 100644
--- a/benchmarks/accuracy/text_to_image/gbench.py
+++ b/benchmarks/accuracy/text_to_image/gbench.py
@@ -116,10 +116,8 @@ def collect_gebench_generation_summary(output_root: Path) -> dict[str, Any]:
         for lang_dir in sorted(path for path in type_root.iterdir() if path.is_dir()):
             for sample_dir in sorted(path for path in lang_dir.iterdir() if path.is_dir()):
                 expected = sample_dir / "frame5.png" if data_type in {"type2", "type3", "type4"} else None
-                if expected is None:
+                if expected is None or not expected.exists():
                     expected = find_first_image(sample_dir)
-                elif not expected.exists():
-                    expected = None
                 if expected is None:
                     continue
                 records.append(

From 2fb40bcfddeecca329203010abeeab79e0421bc9 Mon Sep 17 00:00:00 2001
From: TaffyOfficial <2324465096@qq.com>
Date: Sat, 25 Apr 2026 19:33:35 +0800
Subject: [PATCH 05/10] Address HunyuanImage GEBench review comments

Signed-off-by: TaffyOfficial <2324465096@qq.com>
---
 benchmarks/accuracy/text_to_image/gbench.py | 2 ++
 tests/e2e/accuracy/conftest.py              | 9 +++++++++
 2 files changed, 11 insertions(+)

diff --git a/benchmarks/accuracy/text_to_image/gbench.py b/benchmarks/accuracy/text_to_image/gbench.py
index 0f84882d206..1a0f87574f3 100644
--- a/benchmarks/accuracy/text_to_image/gbench.py
+++ b/benchmarks/accuracy/text_to_image/gbench.py
@@ -117,6 +117,8 @@ def collect_gebench_generation_summary(output_root: Path) -> dict[str, Any]:
             for sample_dir in sorted(path for path in lang_dir.iterdir() if path.is_dir()):
                 expected = sample_dir / "frame5.png" if data_type in {"type2", "type3", "type4"} else None
                 if expected is None or not expected.exists():
+                    # t2i-only runs emit frame0 for type3/type4 instead of the
+                    # six-frame trajectory output, so summarize any image found.
                     expected = find_first_image(sample_dir)
                 if expected is None:
                     continue
diff --git a/tests/e2e/accuracy/conftest.py b/tests/e2e/accuracy/conftest.py
index 0acb153e97c..3c08b0f5d7e 100644
--- a/tests/e2e/accuracy/conftest.py
+++ b/tests/e2e/accuracy/conftest.py
@@ -287,6 +287,15 @@ def _build_accuracy_server_config(
     if torch.cuda.device_count() < num_devices:
         pytest.skip(f"Need at least {num_devices} CUDA GPUs for this accuracy benchmark.")
 
+    if extra_generate_args is not None:
+        has_gpu_allocation_arg = any(
+            arg in {"--tensor-parallel-size", "--num-gpus"}
+            or arg.startswith("--tensor-parallel-size=")
+            or arg.startswith("--num-gpus=")
+            for arg in extra_generate_args
+        )
+        if not has_gpu_allocation_arg:
+            raise ValueError("extra_generate_args must include --tensor-parallel-size or --num-gpus")
     generate_server_args = extra_generate_args if extra_generate_args is not None else ["--num-gpus", "1"]
     judge_server_args = [
         "--max-model-len",

From a0d8f790570d0d8c635d8667984085193323a085 Mon Sep 17 00:00:00 2001
From: TaffyOfficial <2324465096@qq.com>
Date: Thu, 7 May 2026 15:40:18 +0800
Subject: [PATCH 06/10] chore: trigger CI re-run

Signed-off-by: TaffyOfficial <2324465096@qq.com>

From c8fab1f24ce43413393611f46bda1f4a0d6682ef Mon Sep 17 00:00:00 2001
From: TaffyOfficial <2324465096@qq.com>
Date: Fri, 8 May 2026 09:58:21 +0800
Subject: [PATCH 07/10] [CI] stabilize Hunyuan Image3 accuracy nightly

Signed-off-by: TaffyOfficial <2324465096@qq.com>
---
 .buildkite/test-nightly.yml                   |  6 ++--
 pyproject.toml                                |  1 +
 tests/e2e/accuracy/conftest.py                | 31 +++++++++++++++++++
 tests/e2e/accuracy/test_gebench_h100_smoke.py |  7 +++--
 .../online_serving/test_flux2_expansion.py    |  2 +-
 .../test_flux_2_dev_expansion.py              |  2 +-
 .../e2e/online_serving/test_sd3_expansion.py  |  2 +-
 7 files changed, 42 insertions(+), 9 deletions(-)

diff --git a/.buildkite/test-nightly.yml b/.buildkite/test-nightly.yml
index 3a8a0a8f8b8..156e9d05a04 100644
--- a/.buildkite/test-nightly.yml
+++ b/.buildkite/test-nightly.yml
@@ -335,7 +335,7 @@ steps:
       - label: ":full_moon: Diffusion X2I(&A&T) · Function Test with H100"
         timeout_in_minutes: 120
         commands:
-          - pytest -sv tests/e2e/ -k "not test_wan and not test_bagel_expansion and not hunyuan" -m "full_model and diffusion and H100" --run-level "full_model" --ignore=tests/e2e/accuracy
+          - pytest -sv tests/e2e/ -k "not test_wan and not test_bagel_expansion and not hunyuan" -m "full_model and diffusion and H100 and not resource_heavy" --run-level "full_model" --ignore=tests/e2e/accuracy
           # revoke after bagel optimization finished
           # Keep Bagel expansion and multi-replica tests in their dedicated H100 jobs below.
           # - pytest -sv tests/e2e/ -k "not test_wan and not test_bagel_expansion and not hunyuan" -m "full_model and diffusion and H100" --run-level "full_model" --ignore=tests/e2e/accuracy --ignore=tests/e2e/online_serving/test_bagel_multi_replicas.py
@@ -453,7 +453,7 @@ steps:
       - label: ":full_moon: Diffusion X2I(&A&T) · Function Test with L4"
         timeout_in_minutes: 60
         commands:
-          - pytest -sv tests/e2e/ -k "not test_wan and not test_bagel_expansion and not hunyuan" -m "full_model and diffusion and L4" --run-level "full_model" --ignore=tests/e2e/accuracy
+          - pytest -sv tests/e2e/ -k "not test_wan and not test_bagel_expansion and not hunyuan" -m "full_model and diffusion and L4 and not resource_heavy" --run-level "full_model" --ignore=tests/e2e/accuracy
         agents:
           queue: "gpu_4_queue" # g6.12xlarge instance on AWS, has 4 L4 GPU
         plugins:
@@ -510,7 +510,7 @@ steps:
       - label: ":full_moon: Diffusion X2I(&A&T) · GEBench Accuracy Test"
         timeout_in_minutes: 60
         commands:
-          - pytest -s -v tests/e2e/accuracy/test_gebench_h100_smoke.py --run-level full_model --gebench-model Qwen/Qwen-Image-2512 --accuracy-judge-model QuantTrio/Qwen3-VL-30B-A3B-Instruct-AWQ --accuracy-gpu 0 --gebench-port 8093 --accuracy-workers 1
+          - pytest -s -v tests/e2e/accuracy/test_gebench_h100_smoke.py --run-level full_model --gebench-model Qwen/Qwen-Image-2512 --accuracy-judge-model QuantTrio/Qwen3-VL-30B-A3B-Instruct-AWQ --accuracy-gpu 0 --gebench-port 8093 --accuracy-workers 1 --gebench-min-overall 0.35 --gebench-min-type3 0.30 --gebench-min-type4 0.35
           - buildkite-agent artifact upload "tests/e2e/accuracy/artifacts/gebench_qwen-image-2512/summary*.json"
         agents:
           queue: "mithril-h100-pool"
diff --git a/pyproject.toml b/pyproject.toml
index 3c3f7a0267b..3b80bace422 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -208,6 +208,7 @@ markers = [
     # more detailed markers
     "slow: Slow tests (may skip in quick CI)",
     "benchmark: Benchmark tests",
+    "resource_heavy: Full-model tests that need dedicated CI jobs instead of broad nightly sweeps",
 ]
 filterwarnings = [
     "ignore:.*does not have '__test__' attribute.*:UserWarning",
diff --git a/tests/e2e/accuracy/conftest.py b/tests/e2e/accuracy/conftest.py
index 3c08b0f5d7e..80192f5f82f 100644
--- a/tests/e2e/accuracy/conftest.py
+++ b/tests/e2e/accuracy/conftest.py
@@ -94,6 +94,27 @@ def pytest_addoption(parser):
         default=False,
         help="Only generate/evaluate T2I frame0 for type3/type4, skip IT2I trajectory",
     )
+    group.addoption(
+        "--gebench-min-overall",
+        action="store",
+        type=float,
+        default=0.45,
+        help="Minimum overall GEBench mean score for the smoke test",
+    )
+    group.addoption(
+        "--gebench-min-type3",
+        action="store",
+        type=float,
+        default=0.45,
+        help="Minimum GEBench type3 mean score for the smoke test",
+    )
+    group.addoption(
+        "--gebench-min-type4",
+        action="store",
+        type=float,
+        default=0.45,
+        help="Minimum GEBench type4 mean score for the smoke test",
+    )
 
 
 def _hf_cache_root() -> Path:
@@ -215,6 +236,15 @@ def gebench_t2i_only(request: pytest.FixtureRequest) -> bool:
     return bool(request.config.getoption("gebench_t2i_only"))
 
 
+@pytest.fixture(scope="session")
+def gebench_min_scores(request: pytest.FixtureRequest) -> dict[str, float]:
+    return {
+        "overall": float(request.config.getoption("gebench_min_overall")),
+        "type3": float(request.config.getoption("gebench_min_type3")),
+        "type4": float(request.config.getoption("gebench_min_type4")),
+    }
+
+
 @pytest.fixture(scope="session")
 def gedit_samples_per_group(request: pytest.FixtureRequest) -> int:
     return int(request.config.getoption("gedit_samples_per_group"))
@@ -302,6 +332,7 @@ def _build_accuracy_server_config(
         "32768",
         "--gpu-memory-utilization",
         "0.8",
+        "--enforce-eager",
     ]
 
     generate_params_kwargs: dict = dict(
diff --git a/tests/e2e/accuracy/test_gebench_h100_smoke.py b/tests/e2e/accuracy/test_gebench_h100_smoke.py
index 85652b08d1c..6bb8f2c3bc2 100644
--- a/tests/e2e/accuracy/test_gebench_h100_smoke.py
+++ b/tests/e2e/accuracy/test_gebench_h100_smoke.py
@@ -22,6 +22,7 @@ def test_gebench_h100_smoke(
     gebench_num_inference_steps: int,
     accuracy_workers: int,
     gebench_t2i_only: bool,
+    gebench_min_scores: dict[str, float],
 ) -> None:
     model_label = infer_model_label(gebench_accuracy_servers.generate_params.model).lower()
     output_root = reset_artifact_dir(accuracy_artifact_root / f"gebench_{model_label}")
@@ -100,6 +101,6 @@ def test_gebench_h100_smoke(
         assert data_type in summary["evaluation"]["by_type"]
         assert summary["evaluation"]["by_type"][data_type]["count"] > 0
 
-    assert summary["evaluation"]["overall_mean"] >= 0.45
-    assert summary["evaluation"]["by_type"]["type3"]["overall_mean"] >= 0.45
-    assert summary["evaluation"]["by_type"]["type4"]["overall_mean"] >= 0.45
+    assert summary["evaluation"]["overall_mean"] >= gebench_min_scores["overall"]
+    assert summary["evaluation"]["by_type"]["type3"]["overall_mean"] >= gebench_min_scores["type3"]
+    assert summary["evaluation"]["by_type"]["type4"]["overall_mean"] >= gebench_min_scores["type4"]
diff --git a/tests/e2e/online_serving/test_flux2_expansion.py b/tests/e2e/online_serving/test_flux2_expansion.py
index 9a2b164b357..6c5b38935a9 100644
--- a/tests/e2e/online_serving/test_flux2_expansion.py
+++ b/tests/e2e/online_serving/test_flux2_expansion.py
@@ -14,7 +14,7 @@
 from tests.helpers.mark import hardware_marks
 from tests.helpers.runtime import OmniServer, OmniServerParams, OpenAIClientHandler
 
-pytestmark = [pytest.mark.diffusion, pytest.mark.full_model]
+pytestmark = [pytest.mark.diffusion, pytest.mark.full_model, pytest.mark.resource_heavy]
 
 FOUR_CARD_FEATURE_MARKS = hardware_marks(res={"cuda": "L4"}, num_cards=4)
 POSITIVE_PROMPT = "A cat sitting on a windowsill"
diff --git a/tests/e2e/online_serving/test_flux_2_dev_expansion.py b/tests/e2e/online_serving/test_flux_2_dev_expansion.py
index 953cb448a30..fc353af7e3f 100644
--- a/tests/e2e/online_serving/test_flux_2_dev_expansion.py
+++ b/tests/e2e/online_serving/test_flux_2_dev_expansion.py
@@ -17,7 +17,7 @@
 from tests.helpers.mark import hardware_marks
 from tests.helpers.runtime import OmniServer, OmniServerParams, OpenAIClientHandler, dummy_messages_from_mix_data
 
-pytestmark = [pytest.mark.diffusion, pytest.mark.full_model]
+pytestmark = [pytest.mark.diffusion, pytest.mark.full_model, pytest.mark.resource_heavy]
 
 MODEL = "black-forest-labs/FLUX.2-dev"
 PROMPT = "A cinematic mountain landscape at sunrise, dramatic clouds, ultra-detailed, realistic photography."
diff --git a/tests/e2e/online_serving/test_sd3_expansion.py b/tests/e2e/online_serving/test_sd3_expansion.py
index 09b50d2e501..767f4e28f5f 100644
--- a/tests/e2e/online_serving/test_sd3_expansion.py
+++ b/tests/e2e/online_serving/test_sd3_expansion.py
@@ -7,7 +7,7 @@
 from tests.helpers.mark import hardware_marks
 from tests.helpers.runtime import OmniServer, OmniServerParams, OpenAIClientHandler
 
-pytestmark = [pytest.mark.diffusion, pytest.mark.full_model]
+pytestmark = [pytest.mark.diffusion, pytest.mark.full_model, pytest.mark.resource_heavy]
 
 FOUR_CARD_FEATURE_MARKS = hardware_marks(res={"cuda": "L4"}, num_cards=4)
 POSITIVE_PROMPT = "A serene mountain landscape at sunset"

From ae878a85aa19f12c0b57bd7f352fd86272cc76d7 Mon Sep 17 00:00:00 2001
From: TaffyOfficial <2324465096@qq.com>
Date: Fri, 8 May 2026 10:12:27 +0800
Subject: [PATCH 08/10] [CI] fix accuracy fixture accelerator lint

Signed-off-by: TaffyOfficial <2324465096@qq.com>
---
 tests/e2e/accuracy/conftest.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/e2e/accuracy/conftest.py b/tests/e2e/accuracy/conftest.py
index 727e47a866d..67bac568fca 100644
--- a/tests/e2e/accuracy/conftest.py
+++ b/tests/e2e/accuracy/conftest.py
@@ -314,7 +314,7 @@ def _build_accuracy_server_config(
 
     devices = generate_devices or shared_gpu
     num_devices = len([d for d in devices.split(",") if d.strip()])
-    if torch.cuda.device_count() < num_devices:
+    if torch.accelerator.device_count() < num_devices:
         pytest.skip(f"Need at least {num_devices} CUDA GPUs for this accuracy benchmark.")
 
     if extra_generate_args is not None:

From 89adaf0d9090d152941b849a6af0ec94762bdae3 Mon Sep 17 00:00:00 2001
From: TaffyOfficial <2324465096@qq.com>
Date: Fri, 8 May 2026 11:19:19 +0800
Subject: [PATCH 09/10] [CI] drop HunyuanImage-3.0 GEBench nightly job and
 resource_heavy gating

Revert .buildkite/test-nightly.yml to origin/main to remove:
- New HunyuanImage-3.0 GEBench accuracy job (TP=4+EP, 80B MoE)
- "and not resource_heavy" filter on the H100 / L4 diffusion sweeps
- Threshold args on the existing Qwen GEBench step

Also revert the resource_heavy marker scaffolding (pyproject.toml
registration + flux2 / flux_2_dev / sd3 expansion test tags) since it
only existed to keep the dropped HunyuanImage-3.0 job from competing
with the broad nightly diffusion sweeps.

Kept: the tests/e2e/accuracy/conftest.py fixture additions, the
tests/e2e/accuracy/test_gebench_h100_smoke.py CLI options, and the
benchmarks/accuracy/text_to_image/gbench.py logic. These can be
run manually until the CI job is re-enabled.

Signed-off-by: TaffyOfficial <2324465096@qq.com>
---
 .buildkite/test-nightly.yml                   | 49 ++-----------------
 pyproject.toml                                |  1 -
 .../online_serving/test_flux2_expansion.py    |  2 +-
 .../test_flux_2_dev_expansion.py              |  2 +-
 .../e2e/online_serving/test_sd3_expansion.py  |  2 +-
 5 files changed, 6 insertions(+), 50 deletions(-)

diff --git a/.buildkite/test-nightly.yml b/.buildkite/test-nightly.yml
index c894d9d152a..6dcc303ddff 100644
--- a/.buildkite/test-nightly.yml
+++ b/.buildkite/test-nightly.yml
@@ -335,7 +335,7 @@ steps:
       - label: ":full_moon: Diffusion X2I(&A&T) · Function Test with H100"
         timeout_in_minutes: 120
         commands:
-          - pytest -sv tests/e2e/ -k "not test_wan and not test_bagel_expansion and not hunyuan" -m "full_model and diffusion and H100 and not resource_heavy" --run-level "full_model" --ignore=tests/e2e/accuracy
+          - pytest -sv tests/e2e/ -k "not test_wan and not test_bagel_expansion and not hunyuan" -m "full_model and diffusion and H100" --run-level "full_model" --ignore=tests/e2e/accuracy
         agents:
           queue: "mithril-h100-pool"
         plugins:
@@ -412,7 +412,7 @@ steps:
       - label: ":full_moon: Diffusion X2I(&A&T) · Function Test with L4"
         timeout_in_minutes: 60
         commands:
-          - pytest -sv tests/e2e/ -k "not test_wan and not test_bagel_expansion and not hunyuan" -m "full_model and diffusion and L4 and not resource_heavy" --run-level "full_model" --ignore=tests/e2e/accuracy
+          - pytest -sv tests/e2e/ -k "not test_wan and not test_bagel_expansion and not hunyuan" -m "full_model and diffusion and L4" --run-level "full_model" --ignore=tests/e2e/accuracy
         agents:
           queue: "gpu_4_queue" # g6.12xlarge instance on AWS, has 4 L4 GPU
         plugins:
@@ -469,7 +469,7 @@ steps:
       - label: ":full_moon: Diffusion X2I(&A&T) · GEBench Accuracy Test"
         timeout_in_minutes: 60
         commands:
-          - pytest -s -v tests/e2e/accuracy/test_gebench_h100_smoke.py --run-level full_model --gebench-model Qwen/Qwen-Image-2512 --accuracy-judge-model QuantTrio/Qwen3-VL-30B-A3B-Instruct-AWQ --accuracy-gpu 0 --gebench-port 8093 --accuracy-workers 1 --gebench-min-overall 0.35 --gebench-min-type3 0.30 --gebench-min-type4 0.35
+          - pytest -s -v tests/e2e/accuracy/test_gebench_h100_smoke.py --run-level full_model --gebench-model Qwen/Qwen-Image-2512 --accuracy-judge-model QuantTrio/Qwen3-VL-30B-A3B-Instruct-AWQ --accuracy-gpu 0 --gebench-port 8093 --accuracy-workers 1
           - buildkite-agent artifact upload "tests/e2e/accuracy/artifacts/gebench_qwen-image-2512/summary*.json"
         agents:
           queue: "mithril-h100-pool"
@@ -505,49 +505,6 @@ steps:
                       path: /mnt/hf-cache
                       type: DirectoryOrCreate
 
-      - label: ":full_moon: Diffusion X2I(&A&T) · GEBench Accuracy Test (HunyuanImage-3.0)"
-        timeout_in_minutes: 120
-        commands:
-          # GPU memory cleanup before/after to prevent leaks from 80B MoE TP=4+EP workers.
-          # Generate (4 GPUs, TP=4+EP) and judge (1 GPU) run sequentially via fixture
-          # context managers — judge GPU 0 is reused after generate teardown.
-          - export VLLM_TEST_CLEAN_GPU_MEMORY="1"
-          - pytest -s -v tests/e2e/accuracy/test_gebench_h100_smoke.py --run-level full_model --gebench-model tencent/HunyuanImage-3.0-Instruct --accuracy-judge-model QuantTrio/Qwen3-VL-30B-A3B-Instruct-AWQ --gebench-devices 0,1,2,3 --gebench-port 8094 --accuracy-gpu 0 --gebench-samples-per-type 4 --gebench-num-inference-steps 28 --accuracy-workers 1 --gebench-t2i-only --gebench-stage-overrides '{"0":{"devices":"0,1,2,3","enable_expert_parallel":true,"max_num_seqs":1}}' --gebench-extra-server-args '["--dtype","bfloat16","--gpu-memory-utilization","0.95","--enforce-eager","--trust-remote-code","--distributed-executor-backend","mp","--no-async-chunk"]'
-          - buildkite-agent artifact upload "tests/e2e/accuracy/artifacts/gebench_hunyuanimage-3_0-instruct/summary*.json"
-        agents:
-          queue: "mithril-h100-pool"
-        plugins:
-          - kubernetes:
-              podSpec:
-                containers:
-                  - image: 936637512419.dkr.ecr.us-west-2.amazonaws.com/vllm-ci-pull-through-cache/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT
-                    resources:
-                      limits:
-                        nvidia.com/gpu: 4
-                    volumeMounts:
-                      - name: devshm
-                        mountPath: /dev/shm
-                      - name: hf-cache
-                        mountPath: /root/.cache/huggingface
-                    env:
-                      - name: HF_HOME
-                        value: /root/.cache/huggingface
-                      - name: HF_TOKEN
-                        valueFrom:
-                          secretKeyRef:
-                            name: hf-token-secret
-                            key: token
-                nodeSelector:
-                  node.kubernetes.io/instance-type: gpu-h100-sxm
-                volumes:
-                  - name: devshm
-                    emptyDir:
-                      medium: Memory
-                  - name: hf-cache
-                    hostPath:
-                      path: /mnt/hf-cache
-                      type: DirectoryOrCreate
-
       - label: ":full_moon: Diffusion X2I(&A&T) · GEdit-Bench Accuracy Test"
         timeout_in_minutes: 60
         commands:
diff --git a/pyproject.toml b/pyproject.toml
index 1310b8e3055..0eaf07d32d7 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -214,7 +214,6 @@ markers = [
     # more detailed markers
     "slow: Slow tests (may skip in quick CI)",
     "benchmark: Benchmark tests",
-    "resource_heavy: Full-model tests that need dedicated CI jobs instead of broad nightly sweeps",
 ]
 filterwarnings = [
     "ignore:.*does not have '__test__' attribute.*:UserWarning",
diff --git a/tests/e2e/online_serving/test_flux2_expansion.py b/tests/e2e/online_serving/test_flux2_expansion.py
index 6c5b38935a9..9a2b164b357 100644
--- a/tests/e2e/online_serving/test_flux2_expansion.py
+++ b/tests/e2e/online_serving/test_flux2_expansion.py
@@ -14,7 +14,7 @@
 from tests.helpers.mark import hardware_marks
 from tests.helpers.runtime import OmniServer, OmniServerParams, OpenAIClientHandler
 
-pytestmark = [pytest.mark.diffusion, pytest.mark.full_model, pytest.mark.resource_heavy]
+pytestmark = [pytest.mark.diffusion, pytest.mark.full_model]
 
 FOUR_CARD_FEATURE_MARKS = hardware_marks(res={"cuda": "L4"}, num_cards=4)
 POSITIVE_PROMPT = "A cat sitting on a windowsill"
diff --git a/tests/e2e/online_serving/test_flux_2_dev_expansion.py b/tests/e2e/online_serving/test_flux_2_dev_expansion.py
index fc353af7e3f..953cb448a30 100644
--- a/tests/e2e/online_serving/test_flux_2_dev_expansion.py
+++ b/tests/e2e/online_serving/test_flux_2_dev_expansion.py
@@ -17,7 +17,7 @@
 from tests.helpers.mark import hardware_marks
 from tests.helpers.runtime import OmniServer, OmniServerParams, OpenAIClientHandler, dummy_messages_from_mix_data
 
-pytestmark = [pytest.mark.diffusion, pytest.mark.full_model, pytest.mark.resource_heavy]
+pytestmark = [pytest.mark.diffusion, pytest.mark.full_model]
 
 MODEL = "black-forest-labs/FLUX.2-dev"
 PROMPT = "A cinematic mountain landscape at sunrise, dramatic clouds, ultra-detailed, realistic photography."
diff --git a/tests/e2e/online_serving/test_sd3_expansion.py b/tests/e2e/online_serving/test_sd3_expansion.py
index 767f4e28f5f..09b50d2e501 100644
--- a/tests/e2e/online_serving/test_sd3_expansion.py
+++ b/tests/e2e/online_serving/test_sd3_expansion.py
@@ -7,7 +7,7 @@
 from tests.helpers.mark import hardware_marks
 from tests.helpers.runtime import OmniServer, OmniServerParams, OpenAIClientHandler
 
-pytestmark = [pytest.mark.diffusion, pytest.mark.full_model, pytest.mark.resource_heavy]
+pytestmark = [pytest.mark.diffusion, pytest.mark.full_model]
 
 FOUR_CARD_FEATURE_MARKS = hardware_marks(res={"cuda": "L4"}, num_cards=4)
 POSITIVE_PROMPT = "A serene mountain landscape at sunset"

From c490c5ba6c9759b5d84df8a3d74db83e93e6202c Mon Sep 17 00:00:00 2001
From: TaffyOfficial <2324465096@qq.com>
Date: Fri, 8 May 2026 12:01:50 +0800
Subject: [PATCH 10/10] [CI][HunyuanImage3] raise GEBench inference-steps
 default 8 -> 50

The previous 8-step default targeted distilled checkpoints
(HunyuanImage-3.0-Instruct-Distil). On the full Instruct model,
even 28 steps (the prior buildkite override) was already producing
mode-collapse samples (e.g. near-blank frames scoring 5/5 from
the judge, masked by the overall mean). The official HF default for
HunyuanImage-3.0 is 50 steps; align the defaults with that.

Distilled / fast-sampling models that want fewer steps must now
opt in explicitly via --gebench-num-inference-steps / --num-inference-steps.

Sites updated:
- tests/e2e/accuracy/conftest.py: pytest --gebench-num-inference-steps default
- benchmarks/accuracy/text_to_image/gbench.py: GEBenchRunner.__init__ + CLI default

Signed-off-by: TaffyOfficial <2324465096@qq.com>
---
 benchmarks/accuracy/text_to_image/gbench.py | 4 ++--
 tests/e2e/accuracy/conftest.py              | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/benchmarks/accuracy/text_to_image/gbench.py b/benchmarks/accuracy/text_to_image/gbench.py
index 1a0f87574f3..aa26b1fcc0b 100644
--- a/benchmarks/accuracy/text_to_image/gbench.py
+++ b/benchmarks/accuracy/text_to_image/gbench.py
@@ -523,7 +523,7 @@ def __init__(
         api_key: str = "EMPTY",
         width: int = 768,
         height: int = 576,
-        num_inference_steps: int = 8,
+        num_inference_steps: int = 50,
         output_compression: int | None = 98,
         guidance_scale: float | None = None,
         seed: int | None = 42,
@@ -888,7 +888,7 @@ def build_parser() -> argparse.ArgumentParser:
     generate.add_argument("--api-key", type=str, default="EMPTY")
     generate.add_argument("--width", type=int, default=768)
     generate.add_argument("--height", type=int, default=576)
-    generate.add_argument("--num-inference-steps", type=int, default=8)
+    generate.add_argument("--num-inference-steps", type=int, default=50)
     generate.add_argument("--output-compression", type=int, default=98)
     generate.add_argument("--guidance-scale", type=float, default=None)
     generate.add_argument("--seed", type=int, default=42)
diff --git a/tests/e2e/accuracy/conftest.py b/tests/e2e/accuracy/conftest.py
index 67bac568fca..3328995faf3 100644
--- a/tests/e2e/accuracy/conftest.py
+++ b/tests/e2e/accuracy/conftest.py
@@ -85,7 +85,7 @@ def pytest_addoption(parser):
         "--gebench-num-inference-steps",
         action="store",
         type=int,
-        default=8,
+        default=50,
         help="Number of diffusion inference steps for GEBench generate",
     )
     group.addoption(