diff --git a/benchmarks/accuracy/text_to_image/gbench.py b/benchmarks/accuracy/text_to_image/gbench.py
index 2ea02130d6b..aa26b1fcc0b 100644
--- a/benchmarks/accuracy/text_to_image/gbench.py
+++ b/benchmarks/accuracy/text_to_image/gbench.py
@@ -116,10 +116,10 @@ def collect_gebench_generation_summary(output_root: Path) -> dict[str, Any]:
         for lang_dir in sorted(path for path in type_root.iterdir() if path.is_dir()):
             for sample_dir in sorted(path for path in lang_dir.iterdir() if path.is_dir()):
                 expected = sample_dir / "frame5.png" if data_type in {"type2", "type3", "type4"} else None
-                if expected is None:
+                if expected is None or not expected.exists():
+                    # frame5.png may be missing (e.g. t2i-only runs emit only frame0 for
+                    # type3/type4), so fall back to whatever find_first_image locates.
                     expected = find_first_image(sample_dir)
-                elif not expected.exists():
-                    expected = None
                 if expected is None:
                     continue
                 records.append(
@@ -437,6 +437,32 @@ def _build_scoring_prompt(self, task_prompt: str) -> str:
             f"{task_prompt}"
         )
 
+    def _build_t2i_scoring_prompt(self, task_prompt: str) -> str:  # judge prompt for one generated image
+        return (
+            "You are an expert evaluator for text-to-image generation quality.\n"
+            "Evaluate the single generated image against the given instruction.\n\n"
+            "Score these five dimensions from 0 to 5:\n"  # the five keys listed here reappear in the schema
+            "- goal: whether the image content matches the instruction accurately\n"
+            "- logic: whether objects, relationships and composition are correct\n"
+            "- cons: whether colors, style and lighting are internally consistent\n"
+            "- ui: whether the overall visual layout and structure looks realistic\n"
+            "- qual: whether the image is visually sharp and artifact-free\n\n"
+            "Be strict: only give 5 if the image is excellent in that dimension. "
+            "Give 3 for acceptable, 1-2 for poor, 0 for completely wrong.\n\n"
+            "Return JSON only. Do not add any prose outside JSON.\n"
+            "Use exactly this schema:\n"  # literal JSON template shown to the judge
+            "{\n"
+            '  "goal": 0,\n'
+            '  "logic": 0,\n'
+            '  "cons": 0,\n'
+            '  "ui": 0,\n'
+            '  "qual": 0,\n'
+            '  "reasoning": "short explanation"\n'
+            "}\n\n"
+            "Scoring task:\n"
+            f"{task_prompt}"  # caller-supplied task text is appended verbatim
+        )
+
     def _request_text(self, prompt: str, images: list[Image.Image]) -> str:
         content: list[dict[str, Any]] = [{"type": "text", "text": prompt}]
         for image in images:
@@ -461,14 +487,15 @@ def _request_text(self, prompt: str, images: list[Image.Image]) -> str:
             return "\n".join(part.get("text", "") for part in message_content if part.get("type") == "text")
         return str(message_content)
 
-    def evaluate(self, *, prompt: str, images: list[Image.Image]) -> dict[str, Any]:
-        primary_prompt = self._build_scoring_prompt(prompt)
+    def evaluate(self, *, prompt: str, images: list[Image.Image], t2i_mode: bool = False) -> dict[str, Any]:
+        build = self._build_t2i_scoring_prompt if t2i_mode else self._build_scoring_prompt
+        primary_prompt = build(prompt)
         text = self._request_text(primary_prompt, images)
         try:
             return extract_json_object(text)
         except ValueError:
             retry_prompt = (
-                self._build_scoring_prompt(prompt) + "\n\nYour previous response was not valid JSON. "
+                build(prompt) + "\n\nYour previous response was not valid JSON. "
                 "Return only the JSON object with integer scores."
             )
             retry_text = self._request_text(retry_prompt, images)
@@ -496,10 +523,11 @@ def __init__(
         api_key: str = "EMPTY",
         width: int = 768,
         height: int = 576,
-        num_inference_steps: int = 8,
+        num_inference_steps: int = 50,
         output_compression: int | None = 98,
         guidance_scale: float | None = None,
         seed: int | None = 42,
+        t2i_only: bool = False,
     ):
         self.dataset_root = dataset_root
         self.output_root = output_root
@@ -510,6 +538,7 @@ def __init__(
         self.output_compression = output_compression
         self.guidance_scale = guidance_scale
         self.seed = seed
+        self.t2i_only = t2i_only
         self.client = VllmOmniImageClient(base_url=base_url, api_key=api_key)
 
     def generate(
@@ -544,6 +573,8 @@ def generate(
         return results
 
     def _generate_one(self, data_type: str, sample_spec: GEBenchSampleSpec) -> dict[str, Any] | None:
+        if self.t2i_only and data_type not in {"type3", "type4"}:
+            return None
         sample_path = sample_spec.sample_path
         metadata = sample_spec.metadata
         lang_device = sample_spec.lang_device
@@ -635,6 +666,13 @@ def _generate_one(self, data_type: str, sample_spec: GEBenchSampleSpec) -> dict[
                 )
                 save_image(frame0_path, previous)
 
+            if self.t2i_only:
+                return {
+                    "data_type": data_type,
+                    "sample_name": f"{lang_device}/{sample_name}",
+                    "output_path": str(frame0_path),
+                }
+
             for step_num in range(1, 6):
                 frame_path = output_dir / f"frame{step_num}.png"
                 if frame_path.exists():
@@ -696,10 +734,11 @@ def _generate_one(self, data_type: str, sample_spec: GEBenchSampleSpec) -> dict[
 
 
 class GEBenchEvaluator:
-    def __init__(self, *, dataset_root: Path, output_root: Path, judge: LocalJudgeClient):
+    def __init__(self, *, dataset_root: Path, output_root: Path, judge: LocalJudgeClient, t2i_only: bool = False):
         self.dataset_root = dataset_root
         self.output_root = output_root
         self.judge = judge
+        self.t2i_only = t2i_only  # True: type3/type4 are judged on frame0.png alone
 
     def evaluate(
         self,
@@ -783,13 +822,33 @@ def _evaluate_one(self, data_type: str, sample_dir: Path, sample_spec: GEBenchSa
                 images=judge_images,
             )
         elif data_type in {"type3", "type4"}:
-            frames = [Image.open(sample_dir / f"frame{i}.png").convert("RGB") for i in range(6)]
-            instruction = _text_or_default(metadata.get("instruction") or metadata.get("caption"), "Complete the task.")
-            prompt_suffix, judge_images = _trajectory_judge_payload(frames)
-            raw_scores = self.judge.evaluate(
-                prompt=f"Evaluate a six-frame GUI trajectory.\nInstruction: {instruction}\n{prompt_suffix}",
-                images=judge_images,
-            )
+            if self.t2i_only:
+                frame0_path = sample_dir / "frame0.png"
+                if not frame0_path.exists():
+                    return None
+                generated = Image.open(frame0_path).convert("RGB")
+                instruction = _text_or_default(
+                    metadata.get("instruction") or metadata.get("caption"), "Generate an image."
+                )
+                raw_scores = self.judge.evaluate(
+                    prompt=(
+                        f"Evaluate the quality of this generated image.\n"
+                        f"Instruction: {instruction}\n"
+                        f"Rate how well the image matches the instruction."
+                    ),
+                    images=[generated],
+                    t2i_mode=True,
+                )
+            else:
+                frames = [Image.open(sample_dir / f"frame{i}.png").convert("RGB") for i in range(6)]
+                instruction = _text_or_default(
+                    metadata.get("instruction") or metadata.get("caption"), "Complete the task."
+                )
+                prompt_suffix, judge_images = _trajectory_judge_payload(frames)
+                raw_scores = self.judge.evaluate(
+                    prompt=f"Evaluate a six-frame GUI trajectory.\nInstruction: {instruction}\n{prompt_suffix}",
+                    images=judge_images,
+                )
         elif data_type == "type5":
             source = _resolve_referenced_image(
                 metadata=metadata, sample_path=dataset_sample, dataset_root=self.dataset_root, data_type=data_type
@@ -829,13 +888,19 @@ def build_parser() -> argparse.ArgumentParser:
     generate.add_argument("--api-key", type=str, default="EMPTY")
     generate.add_argument("--width", type=int, default=768)
     generate.add_argument("--height", type=int, default=576)
-    generate.add_argument("--num-inference-steps", type=int, default=8)
+    generate.add_argument("--num-inference-steps", type=int, default=50)
     generate.add_argument("--output-compression", type=int, default=98)
     generate.add_argument("--guidance-scale", type=float, default=None)
     generate.add_argument("--seed", type=int, default=42)
     generate.add_argument("--workers", type=int, default=1)
     generate.add_argument("--max-samples", type=int, default=None)
     generate.add_argument("--samples-per-type", type=int, default=None)
+    generate.add_argument(
+        "--t2i-only",
+        action="store_true",
+        default=False,
+        help="Only generate T2I frame0 for type3/type4, skip IT2I edits and type1/2/5",
+    )
 
     evaluate = subparsers.add_parser("evaluate")
     evaluate.add_argument("--dataset-root", type=Path, required=True)
@@ -847,6 +912,12 @@ def build_parser() -> argparse.ArgumentParser:
     evaluate.add_argument("--workers", type=int, default=1)
     evaluate.add_argument("--max-samples", type=int, default=None)
     evaluate.add_argument("--samples-per-type", type=int, default=None)
+    evaluate.add_argument(
+        "--t2i-only",
+        action="store_true",
+        default=False,
+        help="Only evaluate frame0 for type3/type4 (matches --t2i-only in generate)",
+    )
 
     summarize = subparsers.add_parser("summarize")
     summarize.add_argument("--output-root", type=Path, required=True)
@@ -871,6 +942,7 @@ def main(argv: list[str] | None = None) -> int:
             output_compression=args.output_compression,
             guidance_scale=args.guidance_scale,
             seed=args.seed,
+            t2i_only=args.t2i_only,
         )
         records: list[dict[str, Any]] = []
         for data_type in _data_types_arg(args.data_type):
@@ -892,7 +964,9 @@ def main(argv: list[str] | None = None) -> int:
             api_key=args.judge_api_key,
             model=args.judge_model,
         )
-        evaluator = GEBenchEvaluator(dataset_root=args.dataset_root, output_root=args.output_root, judge=judge)
+        evaluator = GEBenchEvaluator(
+            dataset_root=args.dataset_root, output_root=args.output_root, judge=judge, t2i_only=args.t2i_only
+        )
         combined_results: list[dict[str, Any]] = []
         for data_type in _data_types_arg(args.data_type):
             payload = evaluator.evaluate(
diff --git a/tests/e2e/accuracy/conftest.py b/tests/e2e/accuracy/conftest.py
index a4804fc1980..3328995faf3 100644
--- a/tests/e2e/accuracy/conftest.py
+++ b/tests/e2e/accuracy/conftest.py
@@ -1,5 +1,6 @@
 from __future__ import annotations
 
+import json
 import os
 import subprocess
 from contextlib import contextmanager
@@ -62,6 +63,58 @@ def pytest_addoption(parser):
         default=1200,
         help="Online serving timeout in seconds for Wan2.2 I2V accuracy tests.",
     )
+    group.addoption(
+        "--gebench-devices",
+        action="store",
+        default=None,
+        help="CUDA_VISIBLE_DEVICES for GEBench generate server (e.g. '0,1,2,3'); TP size is derived from device count",
+    )
+    group.addoption(
+        "--gebench-stage-overrides",
+        action="store",
+        default=None,
+        help="JSON string passed to --stage-overrides for GEBench generate server",
+    )
+    group.addoption(
+        "--gebench-extra-server-args",
+        action="store",
+        default=None,
+        help='JSON array of extra CLI args for GEBench generate server (e.g. \'["--dtype","bfloat16"]\')',
+    )
+    group.addoption(
+        "--gebench-num-inference-steps",
+        action="store",
+        type=int,
+        default=50,
+        help="Number of diffusion inference steps for GEBench generate",
+    )
+    group.addoption(
+        "--gebench-t2i-only",
+        action="store_true",
+        default=False,
+        help="Only generate/evaluate T2I frame0 for type3/type4, skip IT2I trajectory",
+    )
+    group.addoption(
+        "--gebench-min-overall",
+        action="store",
+        type=float,
+        default=0.45,
+        help="Minimum overall GEBench mean score for the smoke test",
+    )
+    group.addoption(
+        "--gebench-min-type3",
+        action="store",
+        type=float,
+        default=0.45,
+        help="Minimum GEBench type3 mean score for the smoke test",
+    )
+    group.addoption(
+        "--gebench-min-type4",
+        action="store",
+        type=float,
+        default=0.45,
+        help="Minimum GEBench type4 mean score for the smoke test",
+    )
 
 
 def _hf_cache_root() -> Path:
@@ -173,6 +226,25 @@ def gebench_samples_per_type(request: pytest.FixtureRequest) -> int:
     return int(request.config.getoption("gebench_samples_per_type"))
 
 
+@pytest.fixture(scope="session")
+def gebench_num_inference_steps(request: pytest.FixtureRequest) -> int:  # --gebench-num-inference-steps
+    return int(request.config.getoption("gebench_num_inference_steps"))
+
+
+@pytest.fixture(scope="session")
+def gebench_t2i_only(request: pytest.FixtureRequest) -> bool:  # True when --gebench-t2i-only was passed
+    return bool(request.config.getoption("gebench_t2i_only"))
+
+
+@pytest.fixture(scope="session")
+def gebench_min_scores(request: pytest.FixtureRequest) -> dict[str, float]:  # minimum-score thresholds by key
+    return {
+        "overall": float(request.config.getoption("gebench_min_overall")),  # --gebench-min-overall
+        "type3": float(request.config.getoption("gebench_min_type3")),  # --gebench-min-type3
+        "type4": float(request.config.getoption("gebench_min_type4")),  # --gebench-min-type4
+    }
+
+
 @pytest.fixture(scope="session")
 def gedit_samples_per_group(request: pytest.FixtureRequest) -> int:
     return int(request.config.getoption("gedit_samples_per_group"))
@@ -229,36 +301,58 @@ def _build_accuracy_server_config(
     port: int,
     run_level: str,
     model_prefix: str,
+    generate_devices: str | None = None,
+    extra_generate_args: list[str] | None = None,
+    stage_init_timeout: int = 300,
+    init_timeout: int | None = None,
 ) -> AccuracyServerConfig:
     if torch.accelerator.device_count() < 1:
         pytest.skip("Need at least 1 CUDA GPU for accuracy benchmark smoke tests.")
 
     if not generate_model:
         pytest.skip("No generate model configured for accuracy benchmark test.")
-    generate_server_args = ["--num-gpus", "1"]
+
+    devices = generate_devices or shared_gpu
+    num_devices = len([d for d in devices.split(",") if d.strip()])
+    if torch.accelerator.device_count() < num_devices:
+        pytest.skip(f"Need at least {num_devices} CUDA GPUs for this accuracy benchmark.")
+
+    if extra_generate_args is not None:
+        has_gpu_allocation_arg = any(
+            arg in {"--tensor-parallel-size", "--num-gpus"}
+            or arg.startswith("--tensor-parallel-size=")
+            or arg.startswith("--num-gpus=")
+            for arg in extra_generate_args
+        )
+        if not has_gpu_allocation_arg:
+            raise ValueError("extra_generate_args must include --tensor-parallel-size or --num-gpus")
+    generate_server_args = extra_generate_args if extra_generate_args is not None else ["--num-gpus", "1"]
     judge_server_args = [
         "--max-model-len",
         "32768",
         "--gpu-memory-utilization",
         "0.8",
+        "--enforce-eager",
     ]
 
-    judge_env = {"CUDA_VISIBLE_DEVICES": shared_gpu}
+    generate_params_kwargs: dict = dict(
+        model=generate_model,
+        port=port,
+        server_args=generate_server_args,
+        env_dict={"CUDA_VISIBLE_DEVICES": devices},
+        use_omni=True,
+        stage_init_timeout=stage_init_timeout,
+    )
+    if init_timeout is not None:
+        generate_params_kwargs["init_timeout"] = init_timeout
 
     return AccuracyServerConfig(
-        generate_params=OmniServerParams(
-            model=generate_model,
-            port=port,
-            server_args=generate_server_args,
-            env_dict={"CUDA_VISIBLE_DEVICES": shared_gpu},
-            use_omni=True,
-            stage_init_timeout=300,
-        ),
+        generate_params=OmniServerParams(**generate_params_kwargs),
         judge_params=OmniServerParams(
             model=judge_model,
             port=port,
             server_args=judge_server_args,
-            env_dict=judge_env,
+            env_dict={"CUDA_VISIBLE_DEVICES": shared_gpu},
             use_omni=False,
         ),
         run_level=run_level,
@@ -272,6 +366,24 @@ def gebench_accuracy_servers(
     run_level: str,
     model_prefix: str,
 ) -> AccuracyServerConfig:
+    devices_opt: str | None = request.config.getoption("gebench_devices")
+    stage_overrides: str | None = request.config.getoption("gebench_stage_overrides")
+    extra_args_json: str | None = request.config.getoption("gebench_extra_server_args")
+
+    extra_args: list[str] | None = None
+    stage_init_timeout = 300
+    init_timeout: int | None = None
+
+    if devices_opt:
+        num_devices = len([d for d in devices_opt.split(",") if d.strip()])
+        extra_args = ["--tensor-parallel-size", str(num_devices)]
+        if stage_overrides:
+            extra_args += ["--stage-overrides", stage_overrides]
+        if extra_args_json:
+            extra_args += json.loads(extra_args_json)
+        stage_init_timeout = 600
+        init_timeout = 1800
+
     return _build_accuracy_server_config(
         generate_model=request.config.getoption("gebench_model"),
         judge_model=request.config.getoption("accuracy_judge_model"),
@@ -279,6 +391,10 @@ def gebench_accuracy_servers(
         port=int(request.config.getoption("gebench_port")),
         run_level=run_level,
         model_prefix=model_prefix,
+        generate_devices=devices_opt,
+        extra_generate_args=extra_args,
+        stage_init_timeout=stage_init_timeout,
+        init_timeout=init_timeout,
     )
 
 
diff --git a/tests/e2e/accuracy/test_gebench_h100_smoke.py b/tests/e2e/accuracy/test_gebench_h100_smoke.py
index 2702710e4a2..6bb8f2c3bc2 100644
--- a/tests/e2e/accuracy/test_gebench_h100_smoke.py
+++ b/tests/e2e/accuracy/test_gebench_h100_smoke.py
@@ -18,11 +18,17 @@ def test_gebench_h100_smoke(
     gebench_accuracy_servers,
     accuracy_artifact_root: Path,
     gebench_dataset_root: Path,
+    gebench_samples_per_type: int,
+    gebench_num_inference_steps: int,
     accuracy_workers: int,
+    gebench_t2i_only: bool,
+    gebench_min_scores: dict[str, float],
 ) -> None:
     model_label = infer_model_label(gebench_accuracy_servers.generate_params.model).lower()
     output_root = reset_artifact_dir(accuracy_artifact_root / f"gebench_{model_label}")
 
+    t2i_flag = ["--t2i-only"] if gebench_t2i_only else []
+
     with gebench_accuracy_servers.generate_server() as generate_server:
         for data_type in ("type3", "type4"):
             assert (
@@ -46,9 +52,12 @@ def test_gebench_h100_smoke(
                         "--output-compression",
                         "98",
                         "--num-inference-steps",
-                        "8",
+                        str(gebench_num_inference_steps),
                         "--workers",
                         str(accuracy_workers),
+                        "--samples-per-type",
+                        str(gebench_samples_per_type),
+                        *t2i_flag,
                     ]
                 )
                 == 0
@@ -74,6 +83,7 @@ def test_gebench_h100_smoke(
                         "EMPTY",
                         "--workers",
                         str(accuracy_workers),
+                        *t2i_flag,
                     ]
                 )
                 == 0
@@ -91,6 +101,6 @@ def test_gebench_h100_smoke(
         assert data_type in summary["evaluation"]["by_type"]
         assert summary["evaluation"]["by_type"][data_type]["count"] > 0
 
-    assert summary["evaluation"]["overall_mean"] >= 0.45
-    assert summary["evaluation"]["by_type"]["type3"]["overall_mean"] >= 0.45
-    assert summary["evaluation"]["by_type"]["type4"]["overall_mean"] >= 0.45
+    assert summary["evaluation"]["overall_mean"] >= gebench_min_scores["overall"]
+    assert summary["evaluation"]["by_type"]["type3"]["overall_mean"] >= gebench_min_scores["type3"]
+    assert summary["evaluation"]["by_type"]["type4"]["overall_mean"] >= gebench_min_scores["type4"]