diff --git a/vllm_omni/model_executor/stage_configs/flux2_klein_dit_2gpu_fp8.yaml b/vllm_omni/model_executor/stage_configs/flux2_klein_dit_2gpu_fp8.yaml
new file mode 100644
index 00000000000..0b4ebe8efd4
--- /dev/null
+++ b/vllm_omni/model_executor/stage_configs/flux2_klein_dit_2gpu_fp8.yaml
@@ -0,0 +1,30 @@
+# Stage config for running the FLUX.2-klein DiT with ModelOpt FP8 auto-detection.
+# Targets 2 GPUs: devices "0,1" with tensor_parallel_size: 2.
+
+stage_args:  # list of stage definitions; this file defines a single stage
+  - stage_id: 0
+    stage_type: diffusion
+    runtime:  # per-stage runtime (separate from the top-level `runtime` key)
+      devices: "0,1"  # two device IDs; count equals tensor_parallel_size below
+      max_batch_size: 1
+    engine_args:
+      model_stage: dit
+      model_class_name: Flux2KleinPipeline
+      max_num_seqs: 1
+      enforce_eager: true  # NOTE(review): presumably skips graph capture - confirm
+      trust_remote_code: true  # NOTE(review): presumably allows repo code - confirm needed
+      distributed_executor_backend: "mp"  # presumably multiprocess workers - confirm
+      parallel_config:
+        tensor_parallel_size: 2  # equals the number of IDs in runtime.devices
+
+    final_output: true
+    final_output_type: image
+    is_comprehension: false
+    default_sampling_params:
+      seed: 42  # fixed default seed; NOTE(review): confirm requests can override
+
+runtime:  # top-level runtime settings (distinct from stage_args[0].runtime)
+  enabled: true
+  defaults:
+    window_size: -1  # NOTE(review): -1 looks like a "no limit" sentinel - confirm
+    max_inflight: 1  # NOTE(review): presumably one in-flight request - confirm
diff --git a/vllm_omni/model_executor/stage_configs/flux_dit_2gpu_fp8.yaml b/vllm_omni/model_executor/stage_configs/flux_dit_2gpu_fp8.yaml
new file mode 100644
index 00000000000..45e4ebeff3d
--- /dev/null
+++ b/vllm_omni/model_executor/stage_configs/flux_dit_2gpu_fp8.yaml
@@ -0,0 +1,30 @@
+# Stage config for running the FLUX.1 DiT with ModelOpt FP8 auto-detection.
+# Targets 2 GPUs: devices "0,1" with tensor_parallel_size: 2.
+
+stage_args:  # list of stage definitions; this file defines a single stage
+  - stage_id: 0
+    stage_type: diffusion
+    runtime:  # per-stage runtime (separate from the top-level `runtime` key)
+      devices: "0,1"  # two device IDs; count equals tensor_parallel_size below
+      max_batch_size: 1
+    engine_args:
+      model_stage: dit
+      model_class_name: FluxPipeline
+      max_num_seqs: 1
+      enforce_eager: true  # NOTE(review): presumably skips graph capture - confirm
+      trust_remote_code: true  # NOTE(review): presumably allows repo code - confirm needed
+      distributed_executor_backend: "mp"  # presumably multiprocess workers - confirm
+      parallel_config:
+        tensor_parallel_size: 2  # equals the number of IDs in runtime.devices
+
+    final_output: true
+    final_output_type: image
+    is_comprehension: false
+    default_sampling_params:
+      seed: 42  # fixed default seed; NOTE(review): confirm requests can override
+
+runtime:  # top-level runtime settings (distinct from stage_args[0].runtime)
+  enabled: true
+  defaults:
+    window_size: -1  # NOTE(review): -1 looks like a "no limit" sentinel - confirm
+    max_inflight: 1  # NOTE(review): presumably one in-flight request - confirm
diff --git a/vllm_omni/model_executor/stage_configs/qwen_image_dit_2gpu_fp8.yaml b/vllm_omni/model_executor/stage_configs/qwen_image_dit_2gpu_fp8.yaml
new file mode 100644
index 00000000000..1f0b60a7724
--- /dev/null
+++ b/vllm_omni/model_executor/stage_configs/qwen_image_dit_2gpu_fp8.yaml
@@ -0,0 +1,30 @@
+# Stage config for running the Qwen-Image DiT with ModelOpt FP8 auto-detection.
+# Targets 2 GPUs: devices "0,1" with tensor_parallel_size: 2.
+
+stage_args:  # list of stage definitions; this file defines a single stage
+  - stage_id: 0
+    stage_type: diffusion
+    runtime:  # per-stage runtime (separate from the top-level `runtime` key)
+      devices: "0,1"  # two device IDs; count equals tensor_parallel_size below
+      max_batch_size: 1
+    engine_args:
+      model_stage: dit
+      model_class_name: QwenImagePipeline
+      max_num_seqs: 1
+      enforce_eager: true  # NOTE(review): presumably skips graph capture - confirm
+      trust_remote_code: true  # NOTE(review): presumably allows repo code - confirm needed
+      distributed_executor_backend: "mp"  # presumably multiprocess workers - confirm
+      parallel_config:
+        tensor_parallel_size: 2  # equals the number of IDs in runtime.devices
+
+    final_output: true
+    final_output_type: image
+    is_comprehension: false
+    default_sampling_params:
+      seed: 42  # fixed default seed; NOTE(review): confirm requests can override
+
+runtime:  # top-level runtime settings (distinct from stage_args[0].runtime)
+  enabled: true
+  defaults:
+    window_size: -1  # NOTE(review): -1 looks like a "no limit" sentinel - confirm
+    max_inflight: 1  # NOTE(review): presumably one in-flight request - confirm
diff --git a/vllm_omni/model_executor/stage_configs/z_image_dit_2gpu_fp8.yaml b/vllm_omni/model_executor/stage_configs/z_image_dit_2gpu_fp8.yaml
new file mode 100644
index 00000000000..7d94a18cb26
--- /dev/null
+++ b/vllm_omni/model_executor/stage_configs/z_image_dit_2gpu_fp8.yaml
@@ -0,0 +1,30 @@
+# Stage config for running the Z-Image DiT with ModelOpt FP8 auto-detection.
+# Targets 2 GPUs: devices "0,1" with tensor_parallel_size: 2.
+
+stage_args:  # list of stage definitions; this file defines a single stage
+  - stage_id: 0
+    stage_type: diffusion
+    runtime:  # per-stage runtime (separate from the top-level `runtime` key)
+      devices: "0,1"  # two device IDs; count equals tensor_parallel_size below
+      max_batch_size: 1
+    engine_args:
+      model_stage: dit
+      model_class_name: ZImagePipeline
+      max_num_seqs: 1
+      enforce_eager: true  # NOTE(review): presumably skips graph capture - confirm
+      trust_remote_code: true  # NOTE(review): presumably allows repo code - confirm needed
+      distributed_executor_backend: "mp"  # presumably multiprocess workers - confirm
+      parallel_config:
+        tensor_parallel_size: 2  # equals the number of IDs in runtime.devices
+
+    final_output: true
+    final_output_type: image
+    is_comprehension: false
+    default_sampling_params:
+      seed: 42  # fixed default seed; NOTE(review): confirm requests can override
+
+runtime:  # top-level runtime settings (distinct from stage_args[0].runtime)
+  enabled: true
+  defaults:
+    window_size: -1  # NOTE(review): -1 looks like a "no limit" sentinel - confirm
+    max_inflight: 1  # NOTE(review): presumably one in-flight request - confirm