diff --git a/.github/workflows/pr-test.yml b/.github/workflows/pr-test.yml index 33f53fc0e150..9eff3a9de59d 100644 --- a/.github/workflows/pr-test.yml +++ b/.github/workflows/pr-test.yml @@ -322,9 +322,11 @@ jobs: - name: Clean Corrupted Hugging Face Model Cache run: | - echo "Temp: Deleting potentially corrupted Qwen/Qwen-Image and Qwen/Qwen-Image-Edit cache to ensure a fresh download." + echo "Temp: Deleting potentially corrupted Qwen/Qwen-Image and Qwen/Qwen-Image-Edit cache to ensure a fresh download. This is temporary." rm -rf /hf_home/hub/models--Qwen--Qwen-Image rm -rf /hf_home/hub/models--Qwen--Qwen-Image-Edit + rm -rf /hf_home/hub/models--Wan-AI--Wan2.2-I2V-A14B-Diffusers + rm -rf /hf_home/hub/models--Wan-AI--Wan2.2-TI2V-5B-Diffusers - name: Run diffusion server tests timeout-minutes: 60 @@ -1006,7 +1008,6 @@ jobs: exit 1 fi done - # If the loop completes, all jobs were successful echo "All jobs completed successfully" exit 0 diff --git a/python/sglang/multimodal_gen/configs/pipelines/base.py b/python/sglang/multimodal_gen/configs/pipelines/base.py index 8e6039bd7f1c..bbde1570ebde 100644 --- a/python/sglang/multimodal_gen/configs/pipelines/base.py +++ b/python/sglang/multimodal_gen/configs/pipelines/base.py @@ -37,7 +37,7 @@ class ModelTaskType(Enum): T2I = auto() # Text to Image I2I = auto() # Image to Image - def is_image_task(self): + def is_image_gen(self): return self == ModelTaskType.T2I or self == ModelTaskType.I2I @@ -134,12 +134,15 @@ def postprocess_image(self, image): def slice_noise_pred(self, noise, latents): return noise - def set_width_and_height(self, width, height, image): + def adjust_size(self, width, height, image): """ image: input image """ return width, height + def adjust_num_frames(self, num_frames): + return num_frames + # called in ImageEncodingStage, preprocess the image def preprocess_image(self, image, image_processor: VaeImageProcessor): return image diff --git a/python/sglang/multimodal_gen/configs/pipelines/qwen_image.py 
b/python/sglang/multimodal_gen/configs/pipelines/qwen_image.py index 48de3d41a19d..54c85bdcbdd9 100644 --- a/python/sglang/multimodal_gen/configs/pipelines/qwen_image.py +++ b/python/sglang/multimodal_gen/configs/pipelines/qwen_image.py @@ -273,7 +273,7 @@ def preprocess_image(self, image, image_processor): image = image_processor.resize(image, calculated_height, calculated_width) return image - def set_width_and_height(self, width, height, image): + def adjust_size(self, width, height, image): image_size = image[0].size if isinstance(image, list) else image.size calculated_width, calculated_height, _ = calculate_dimensions( 1024 * 1024, image_size[0] / image_size[1] diff --git a/python/sglang/multimodal_gen/configs/pipelines/wan.py b/python/sglang/multimodal_gen/configs/pipelines/wan.py index d5efa2d64f71..d7068fba7a05 100644 --- a/python/sglang/multimodal_gen/configs/pipelines/wan.py +++ b/python/sglang/multimodal_gen/configs/pipelines/wan.py @@ -15,6 +15,9 @@ ) from sglang.multimodal_gen.configs.models.vaes import WanVAEConfig from sglang.multimodal_gen.configs.pipelines.base import ModelTaskType, PipelineConfig +from sglang.multimodal_gen.runtime.utils.logging_utils import init_logger + +logger = init_logger(__name__) def t5_postprocess_text(outputs: BaseEncoderOutput, _text_inputs) -> torch.Tensor: @@ -33,6 +36,22 @@ def t5_postprocess_text(outputs: BaseEncoderOutput, _text_inputs) -> torch.Tenso return prompt_embeds_tensor +@dataclass +class WanI2VCommonConfig(PipelineConfig): + # for all wan i2v pipelines + def adjust_num_frames(self, num_frames): + vae_scale_factor_temporal = self.vae_config.arch_config.scale_factor_temporal + if num_frames % vae_scale_factor_temporal != 1: + logger.warning( + f"`num_frames - 1` has to be divisible by {vae_scale_factor_temporal}. Rounding to the nearest number." 
+ ) + num_frames = ( + num_frames // vae_scale_factor_temporal * vae_scale_factor_temporal + 1 + ) + return num_frames + return num_frames + + @dataclass class WanT2V480PConfig(PipelineConfig): """Base configuration for Wan T2V 1.3B pipeline architecture.""" @@ -81,7 +100,7 @@ class WanT2V720PConfig(WanT2V480PConfig): @dataclass -class WanI2V480PConfig(WanT2V480PConfig): +class WanI2V480PConfig(WanT2V480PConfig, WanI2VCommonConfig): """Base configuration for Wan I2V 14B 480P pipeline architecture.""" # WanConfig-specific parameters with defaults @@ -128,7 +147,7 @@ class FastWan2_1_T2V_480P_Config(WanT2V480PConfig): @dataclass -class Wan2_2_TI2V_5B_Config(WanT2V480PConfig): +class Wan2_2_TI2V_5B_Config(WanT2V480PConfig, WanI2VCommonConfig): flow_shift: float | None = 5.0 task_type: ModelTaskType = ModelTaskType.TI2V expand_timesteps: bool = True diff --git a/python/sglang/multimodal_gen/runtime/entrypoints/diffusion_generator.py b/python/sglang/multimodal_gen/runtime/entrypoints/diffusion_generator.py index 962a880ad452..1c376634a9c3 100644 --- a/python/sglang/multimodal_gen/runtime/entrypoints/diffusion_generator.py +++ b/python/sglang/multimodal_gen/runtime/entrypoints/diffusion_generator.py @@ -259,7 +259,7 @@ def generate( # TODO: simplify data_type = ( DataType.IMAGE - if self.server_args.pipeline_config.task_type.is_image_task() + if self.server_args.pipeline_config.task_type.is_image_gen() or pretrained_sampling_params.num_frames == 1 else DataType.VIDEO ) diff --git a/python/sglang/multimodal_gen/runtime/entrypoints/utils.py b/python/sglang/multimodal_gen/runtime/entrypoints/utils.py index 62e5c842d170..a4f8744697fb 100644 --- a/python/sglang/multimodal_gen/runtime/entrypoints/utils.py +++ b/python/sglang/multimodal_gen/runtime/entrypoints/utils.py @@ -45,12 +45,12 @@ def prepare_sampling_params( # Validate dimensions if sampling_params.num_frames <= 0: raise ValueError( - f"Height, width, and num_frames must be positive integers, got " + f"height, width, 
and num_frames must be positive integers, got " f"height={sampling_params.height}, width={sampling_params.width}, " f"num_frames={sampling_params.num_frames}" ) - if pipeline_config.task_type.is_image_task(): + if pipeline_config.task_type.is_image_gen(): # settle num_frames logger.debug(f"Setting num_frames to 1 because this is a image-gen model") sampling_params.num_frames = 1 @@ -104,6 +104,10 @@ def prepare_sampling_params( ) sampling_params.num_frames = new_num_frames + sampling_params.num_frames = server_args.pipeline_config.adjust_num_frames( + sampling_params.num_frames + ) + sampling_params.set_output_file_ext() sampling_params.log(server_args=server_args) return sampling_params diff --git a/python/sglang/multimodal_gen/runtime/pipelines/schedule_batch.py b/python/sglang/multimodal_gen/runtime/pipelines/schedule_batch.py index 06dded31bc3f..ef60d8f5ec7f 100644 --- a/python/sglang/multimodal_gen/runtime/pipelines/schedule_batch.py +++ b/python/sglang/multimodal_gen/runtime/pipelines/schedule_batch.py @@ -235,7 +235,7 @@ def __post_init__(self): def set_width_and_height(self, server_args: ServerArgs): if self.height is None or self.width is None: - width, height = server_args.pipeline_config.set_width_and_height( + width, height = server_args.pipeline_config.adjust_size( self.width, self.height, self.pil_image ) self.width = width diff --git a/python/sglang/multimodal_gen/runtime/pipelines/stages/input_validation.py b/python/sglang/multimodal_gen/runtime/pipelines/stages/input_validation.py index 49a618055261..b992c7791b74 100644 --- a/python/sglang/multimodal_gen/runtime/pipelines/stages/input_validation.py +++ b/python/sglang/multimodal_gen/runtime/pipelines/stages/input_validation.py @@ -123,7 +123,7 @@ def forward( if isinstance(server_args.pipeline_config, QwenImageEditPipelineConfig): height = None if batch.height_not_provided else batch.height width = None if batch.width_not_provided else batch.width - width, height = 
server_args.pipeline_config.set_width_and_height( + width, height = server_args.pipeline_config.adjust_size( height, width, batch.pil_image ) batch.width = width diff --git a/python/sglang/multimodal_gen/runtime/server_args.py b/python/sglang/multimodal_gen/runtime/server_args.py index 43ae2aff6539..71de931d32b5 100644 --- a/python/sglang/multimodal_gen/runtime/server_args.py +++ b/python/sglang/multimodal_gen/runtime/server_args.py @@ -803,7 +803,7 @@ def get_provided_args( def check_server_sp_args(self): - if self.pipeline_config.task_type.is_image_task(): + if self.pipeline_config.task_type.is_image_gen(): if ( (self.sp_degree and self.sp_degree > 1) or (self.ulysses_degree and self.ulysses_degree > 1) diff --git a/python/sglang/multimodal_gen/test/server/diffusion_config.py b/python/sglang/multimodal_gen/test/server/diffusion_config.py index 16381c608897..11131a2979ee 100644 --- a/python/sglang/multimodal_gen/test/server/diffusion_config.py +++ b/python/sglang/multimodal_gen/test/server/diffusion_config.py @@ -180,22 +180,22 @@ class PerformanceSummary: startup_grace_seconds=30.0, custom_validator="video", ), - # # === Image to Video (I2V) === - # DiffusionCase( - # id="wan2_1_i2v_480p", - # model_path="Wan-AI/Wan2.1-I2V-14B-Diffusers", - # scenario_name="image_to_video", - # modality="video", - # prompt="generate", # passing in something since failing if no prompt is passed - # warmup_text=0, # warmups only for image gen models - # warmup_edit=0, - # output_size="1024x1536", - # image_edit_prompt="generate", - # image_edit_path="https://github.com/lm-sys/lm-sys.github.io/releases/download/test/TI2I_Qwen_Image_Edit_Input.jpg", - # startup_grace_seconds=30.0, - # custom_validator="video", - # seconds=4, - # ), + # === Image to Video (I2V) === + DiffusionCase( + id="wan2_2_i2v", + model_path="Wan-AI/Wan2.2-I2V-A14B-Diffusers", + scenario_name="image_to_video", + modality="video", + prompt="generate", # passing in something since failing if no prompt is passed + 
warmup_text=0, # warmups only for image gen models + warmup_edit=0, + output_size="832x1104", + image_edit_prompt="generate", + image_edit_path="https://github.com/Wan-Video/Wan2.2/blob/990af50de458c19590c245151197326e208d7191/examples/i2v_input.JPG?raw=true", + startup_grace_seconds=30.0, + custom_validator="video", + seconds=1, + ), # === Text and Image to Video (TI2V) === DiffusionCase( id="wan2_2_ti2v_5b", diff --git a/python/sglang/multimodal_gen/test/server/perf_baselines.json b/python/sglang/multimodal_gen/test/server/perf_baselines.json index 473343c9dab1..cfb0c5ccbaf3 100644 --- a/python/sglang/multimodal_gen/test/server/perf_baselines.json +++ b/python/sglang/multimodal_gen/test/server/perf_baselines.json @@ -1,142 +1,155 @@ -{ - "metadata": { - "model": "Diffusion Server", - "hardware": "CI H100 80GB pool", - "description": "Reference numbers captured from the CI diffusion server baseline run" - }, - "tolerances": { - "e2e": 0.25, - "stage": 0.3, - "denoise_step": 0.1, - "denoise_agg": 0.1 - }, - "sampling": { - "step_fractions": [ - 0.0, - 0.2, - 0.4, - 0.6, - 0.8, - 1.0 - ], - "warmup_requests": { - "text": 1, - "image_edit": 0 - } - }, - "scenarios": { - "text_to_image": { - "notes": "Single-image generation using the default prompt", - "expected_e2e_ms": 74500.0, - "expected_avg_denoise_ms": 422.42, - "expected_median_denoise_ms": 410.62, - "stages_ms": { - "InputValidationStage": 0.1, - "TextEncodingStage": 834.2, - "ConditioningStage": 0.1, - "TimestepPreparationStage": 10.6, - "LatentPreparationStage": 9.0, - "DenoisingStage": 21202.6, - "DecodingStage": 476.12 - }, - "denoise_step_ms": { - "0": 1077.77, "1": 345.13, "2": 413.8, "3": 405.49, "4": 408.14, "5": 409.06, - "6": 408.85, "7": 410.53, "8": 407.51, "9": 409.44, "10": 408.65, "11": 410.14, - "12": 411.74, "13": 409.59, "14": 409.17, "15": 410.78, "16": 410.66, "17": 410.58, - "18": 411.27, "19": 410.51, "20": 409.03, "21": 410.16, "22": 409.42, "23": 411.03, - "24": 410.18, "25": 409.72, 
"26": 410.26, "27": 410.21, "28": 410.71, "29": 410.76, - "30": 411.06, "31": 410.1, "32": 410.55, "33": 410.77, "34": 410.74, "35": 411.75, - "36": 410.78, "37": 411.56, "38": 410.85, "39": 411.08, "40": 411.12, "41": 411.1, - "42": 411.09, "43": 410.87, "44": 411.37, "45": 411.68, "46": 411.0, "47": 410.09, - "48": 412.72, "49": 410.42 - } - }, - "image_edit": { - "notes": "single uploaded reference image, Qwen/Qwen-Image-Edit", - "expected_e2e_ms": 138500.0, - "expected_avg_denoise_ms": 720.0, - "expected_median_denoise_ms": 718.0, - "stages_ms": { - "InputValidationStage": 23, - "ImageEncodingStage": 990.0, - "ImageVAEEncodingStage": 340.0, - "ConditioningStage": 0.13, - "TimestepPreparationStage": 13.78, - "LatentPreparationStage": 10.0, - "DenoisingStage": 36000.0, - "DecodingStage": 645 - }, - "denoise_step_ms": { - "0": 720.0, "1": 720.0, "2": 720.0, "3": 720.0, "4": 720.0, "5": 720.0, - "6": 720.0, "7": 720.0, "8": 720.0, "9": 720.0, "10": 720.0, "11": 720.0, - "12": 720.0, "13": 720.0, "14": 720.0, "15": 720.0, "16": 720.0, "17": 720.0, - "18": 720.0, "19": 720.0, "20": 720.0, "21": 720.0, "22": 720.0, "23": 720.0, - "24": 720.0, "25": 720.0, "26": 720.0, "27": 720.0, "28": 720.0, "29": 720.0, - "30": 720.0, "31": 720.0, "32": 720.0, "33": 720.0, "34": 720.0, "35": 720.0, - "36": 720.0, "37": 720.0, "38": 720.0, "39": 720.0, "40": 720.0, "41": 720.0, - "42": 720.0, "43": 720.0, "44": 720.0, "45": 720.0, "46": 720.0, "47": 720.0, - "48": 720.0, "49": 720.0 - } - }, - "text_to_video": { - "notes": "Single-video generation using the default prompt", - "expected_e2e_ms": 95616.59, - "expected_avg_denoise_ms": 1798.77, - "expected_median_denoise_ms": 1786.78, - - "stages_ms": { - "InputValidationStage": 1.03, - "TextEncodingStage": 3450.0, - "ConditioningStage": 1.0, - "TimestepPreparationStage": 6.0, - "LatentPreparationStage": 15.0, - "DenoisingStage": 90100.0, - "DecodingStage": 3650.0 - }, - - "denoise_step_ms": { - "0": 3500.0, "10": 1800.0, "20": 1800.0, 
"29": 1800.0, "39": 1800.0, "49": 1800.0 - }, - "frames_per_second": 0.51, - "total_frames": 49, - "avg_frame_time_ms": 1951.36 - }, - "image_to_video": { - "notes": "Image-to-Video generation baseline placeholder: TODO(bug)", - "expected_e2e_ms": 1000000000.0, - "expected_avg_denoise_ms": 1000000000.0, - "expected_median_denoise_ms": 1000000000.0, - "stages_ms": {}, - "denoise_step_ms": {}, - "frames_per_second": null, - "total_frames": null, - "avg_frame_time_ms": null - }, - "text_image_to_video": { - "notes": "Text-and-Image-to-Video generation baseline for Wan2.2-TI2V-5B", - "expected_e2e_ms": 178300.0, - "expected_avg_denoise_ms": 3250.0, - "expected_median_denoise_ms": 3260.0, - "stages_ms": { - "InputValidationStage": 80.0, - "TextEncodingStage": 3000.0, - "ConditioningStage": 1.0, - "TimestepPreparationStage": 6.0, - "LatentPreparationStage": 30.0, - "DenoisingStage": 162900.0, - "DecodingStage": 13500.0 - }, - "denoise_step_ms": { - "0": 3700.0, - "10": 3300.0, - "20": 3300.0, - "29": 3300.0, - "39": 3300.0, - "49": 3300.0 - }, - "frames_per_second": null, - "total_frames": null, - "avg_frame_time_ms": null - } - } -} +{ + "metadata": { + "model": "Diffusion Server", + "hardware": "CI H100 80GB pool", + "description": "Reference numbers captured from the CI diffusion server baseline run" + }, + "tolerances": { + "e2e": 0.25, + "stage": 0.3, + "denoise_step": 0.2, + "denoise_agg": 0.1 + }, + "sampling": { + "step_fractions": [ + 0.0, + 0.2, + 0.4, + 0.6, + 0.8, + 1.0 + ], + "warmup_requests": { + "text": 1, + "image_edit": 0 + } + }, + "scenarios": { + "text_to_image": { + "notes": "Single-image generation using the default prompt", + "expected_e2e_ms": 74500.0, + "expected_avg_denoise_ms": 422.42, + "expected_median_denoise_ms": 410.62, + "stages_ms": { + "InputValidationStage": 0.1, + "TextEncodingStage": 834.2, + "ConditioningStage": 0.1, + "TimestepPreparationStage": 10.6, + "LatentPreparationStage": 9.0, + "DenoisingStage": 21202.6, + "DecodingStage": 
476.12 + }, + "denoise_step_ms": { + "0": 1077.77, "1": 345.13, "2": 413.8, "3": 405.49, "4": 408.14, "5": 409.06, + "6": 408.85, "7": 410.53, "8": 407.51, "9": 409.44, "10": 408.65, "11": 410.14, + "12": 411.74, "13": 409.59, "14": 409.17, "15": 410.78, "16": 410.66, "17": 410.58, + "18": 411.27, "19": 410.51, "20": 409.03, "21": 410.16, "22": 409.42, "23": 411.03, + "24": 410.18, "25": 409.72, "26": 410.26, "27": 410.21, "28": 410.71, "29": 410.76, + "30": 411.06, "31": 410.1, "32": 410.55, "33": 410.77, "34": 410.74, "35": 411.75, + "36": 410.78, "37": 411.56, "38": 410.85, "39": 411.08, "40": 411.12, "41": 411.1, + "42": 411.09, "43": 410.87, "44": 411.37, "45": 411.68, "46": 411.0, "47": 410.09, + "48": 412.72, "49": 410.42 + } + }, + "image_edit": { + "notes": "single uploaded reference image, Qwen/Qwen-Image-Edit", + "expected_e2e_ms": 138500.0, + "expected_avg_denoise_ms": 720.0, + "expected_median_denoise_ms": 718.0, + "stages_ms": { + "InputValidationStage": 23, + "ImageEncodingStage": 1350.0, + "ImageVAEEncodingStage": 340.0, + "ConditioningStage": 0.13, + "TimestepPreparationStage": 13.78, + "LatentPreparationStage": 10.0, + "DenoisingStage": 36000.0, + "DecodingStage": 850.0 + }, + "denoise_step_ms": { + "0": 720.0, "1": 720.0, "2": 720.0, "3": 720.0, "4": 720.0, "5": 720.0, + "6": 720.0, "7": 720.0, "8": 720.0, "9": 720.0, "10": 720.0, "11": 720.0, + "12": 720.0, "13": 720.0, "14": 720.0, "15": 720.0, "16": 720.0, "17": 720.0, + "18": 720.0, "19": 720.0, "20": 720.0, "21": 720.0, "22": 720.0, "23": 720.0, + "24": 720.0, "25": 720.0, "26": 720.0, "27": 720.0, "28": 720.0, "29": 720.0, + "30": 720.0, "31": 720.0, "32": 720.0, "33": 720.0, "34": 720.0, "35": 720.0, + "36": 720.0, "37": 720.0, "38": 720.0, "39": 720.0, "40": 720.0, "41": 720.0, + "42": 720.0, "43": 720.0, "44": 720.0, "45": 720.0, "46": 720.0, "47": 720.0, + "48": 720.0, "49": 720.0 + } + }, + "text_to_video": { + "notes": "Single-video generation using the default prompt", + 
"expected_e2e_ms": 95616.59, + "expected_avg_denoise_ms": 1798.77, + "expected_median_denoise_ms": 1786.78, + + "stages_ms": { + "InputValidationStage": 1.03, + "TextEncodingStage": 3450.0, + "ConditioningStage": 1.0, + "TimestepPreparationStage": 6.0, + "LatentPreparationStage": 15.0, + "DenoisingStage": 90100.0, + "DecodingStage": 3650.0 + }, + + "denoise_step_ms": { + "0": 3500.0, "10": 1800.0, "20": 1800.0, "29": 1800.0, "39": 1800.0, "49": 1800.0 + }, + "frames_per_second": 0.51, + "total_frames": 49, + "avg_frame_time_ms": 1951.36 + }, + "image_to_video": { + "notes": "Wan-AI/Wan2.2-I2V-A14B", + "expected_e2e_ms": 282500.0, + "expected_avg_denoise_ms": 7000.0, + "expected_median_denoise_ms": 7000.19, + "stages_ms": { + "InputValidationStage": 20.0, + "TextEncodingStage": 2100.0, + "ConditioningStage": 2.0, + "TimestepPreparationStage": 2.0, + "LatentPreparationStage": 10.0, + "ImageVAEEncodingStage": 1800.0, + "DenoisingStage": 278000.0, + "DecodingStage": 2700.0 + }, + "denoise_step_ms": { + "0": 24000.0, + "8": 7000.0, + "16": 7000.0, + "23": 7000.0, + "31": 7000.0, + "39": 7000.0 + } + }, + "text_image_to_video": { + "notes": "Text-and-Image-to-Video generation baseline for Wan2.2-TI2V-5B", + "expected_e2e_ms": 178300.0, + "expected_avg_denoise_ms": 3250.0, + "expected_median_denoise_ms": 3260.0, + "stages_ms": { + "InputValidationStage": 80.0, + "TextEncodingStage": 3000.0, + "ConditioningStage": 1.0, + "TimestepPreparationStage": 6.0, + "LatentPreparationStage": 30.0, + "DenoisingStage": 162900.0, + "DecodingStage": 13500.0 + }, + "denoise_step_ms": { + "0": 3700.0, + "10": 3300.0, + "20": 3300.0, + "29": 3300.0, + "39": 3300.0, + "49": 3300.0 + }, + "frames_per_second": null, + "total_frames": null, + "avg_frame_time_ms": null + } + } +}