Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 3 additions & 2 deletions .github/workflows/pr-test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -322,9 +322,11 @@ jobs:

- name: Clean Corrupted Hugging Face Model Cache
run: |
echo "Temp: Deleting potentially corrupted Qwen/Qwen-Image and Qwen/Qwen-Image-Edit cache to ensure a fresh download."
echo "Temp: Deleting potentially corrupted Qwen/Qwen-Image and Qwen/Qwen-Image-Edit cache to ensure a fresh download. This is temporary"
rm -rf /hf_home/hub/models--Qwen--Qwen-Image
rm -rf /hf_home/hub/models--Qwen--Qwen-Image-Edit
rm -rf /hf_home/hub/models--Wan-AI--Wan2.2-I2V-A14B-Diffusers
rm -rf /hf_home/hub/models--Wan-AI--Wan2.2-TI2V-5B-Diffusers

- name: Run diffusion server tests
timeout-minutes: 60
Expand Down Expand Up @@ -1006,7 +1008,6 @@ jobs:
exit 1
fi
done

# If the loop completes, all jobs were successful
echo "All jobs completed successfully"
exit 0
7 changes: 5 additions & 2 deletions python/sglang/multimodal_gen/configs/pipelines/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ class ModelTaskType(Enum):
T2I = auto() # Text to Image
I2I = auto() # Image to Image

def is_image_task(self):
def is_image_gen(self):
    """Return True when this task type produces images (T2I or I2I), not video."""
    return self == ModelTaskType.T2I or self == ModelTaskType.I2I


Expand Down Expand Up @@ -134,12 +134,15 @@ def postprocess_image(self, image):
def slice_noise_pred(self, noise, latents):
return noise

def set_width_and_height(self, width, height, image):
def adjust_size(self, width, height, image):
    """Default hook: return (width, height) unchanged.

    Subclasses (e.g. Qwen image-edit config) override this to derive output
    dimensions from the input image.

    image: input image
    """
    return width, height

def adjust_num_frames(self, num_frames):
    # Default hook: no adjustment. Overridden by video pipelines (e.g. Wan I2V)
    # that must align num_frames with the VAE temporal scale factor.
    return num_frames

# called in ImageEncodingStage, preprocess the image
def preprocess_image(self, image, image_processor: VaeImageProcessor):
return image
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -273,7 +273,7 @@ def preprocess_image(self, image, image_processor):
image = image_processor.resize(image, calculated_height, calculated_width)
return image

def set_width_and_height(self, width, height, image):
def adjust_size(self, width, height, image):
image_size = image[0].size if isinstance(image, list) else image.size
calculated_width, calculated_height, _ = calculate_dimensions(
1024 * 1024, image_size[0] / image_size[1]
Expand Down
23 changes: 21 additions & 2 deletions python/sglang/multimodal_gen/configs/pipelines/wan.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,9 @@
)
from sglang.multimodal_gen.configs.models.vaes import WanVAEConfig
from sglang.multimodal_gen.configs.pipelines.base import ModelTaskType, PipelineConfig
from sglang.multimodal_gen.runtime.utils.logging_utils import init_logger

logger = init_logger(__name__)


def t5_postprocess_text(outputs: BaseEncoderOutput, _text_inputs) -> torch.Tensor:
Expand All @@ -33,6 +36,22 @@ def t5_postprocess_text(outputs: BaseEncoderOutput, _text_inputs) -> torch.Tenso
return prompt_embeds_tensor


@dataclass
class WanI2VCommonConfig(PipelineConfig):
    """Shared behavior for all Wan I2V pipeline configs.

    Provides the num_frames alignment required by the Wan temporal VAE.
    """

    def adjust_num_frames(self, num_frames):
        """Align ``num_frames`` so that ``num_frames - 1`` is divisible by the
        VAE temporal scale factor.

        Returns the adjusted (possibly reduced) frame count.
        """
        vae_scale_factor_temporal = self.vae_config.arch_config.scale_factor_temporal
        if num_frames % vae_scale_factor_temporal != 1:
            logger.warning(
                f"`num_frames - 1` has to be divisible by {vae_scale_factor_temporal}. Rounding to the nearest number."
            )
            # NOTE(review): despite the message, this rounds DOWN to the nearest
            # valid count of the form k * scale_factor + 1 (matches diffusers).
            num_frames = (
                num_frames // vae_scale_factor_temporal * vae_scale_factor_temporal + 1
            )
        # Single exit point: the original had a redundant duplicate
        # `return num_frames` both inside and after the `if`.
        return num_frames


@dataclass
class WanT2V480PConfig(PipelineConfig):
"""Base configuration for Wan T2V 1.3B pipeline architecture."""
Expand Down Expand Up @@ -81,7 +100,7 @@ class WanT2V720PConfig(WanT2V480PConfig):


@dataclass
class WanI2V480PConfig(WanT2V480PConfig):
class WanI2V480PConfig(WanT2V480PConfig, WanI2VCommonConfig):
"""Base configuration for Wan I2V 14B 480P pipeline architecture."""

# WanConfig-specific parameters with defaults
Expand Down Expand Up @@ -128,7 +147,7 @@ class FastWan2_1_T2V_480P_Config(WanT2V480PConfig):


@dataclass
class Wan2_2_TI2V_5B_Config(WanT2V480PConfig):
class Wan2_2_TI2V_5B_Config(WanT2V480PConfig, WanI2VCommonConfig):
flow_shift: float | None = 5.0
task_type: ModelTaskType = ModelTaskType.TI2V
expand_timesteps: bool = True
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -259,7 +259,7 @@ def generate(
# TODO: simplify
data_type = (
DataType.IMAGE
if self.server_args.pipeline_config.task_type.is_image_task()
if self.server_args.pipeline_config.task_type.is_image_gen()
or pretrained_sampling_params.num_frames == 1
else DataType.VIDEO
)
Expand Down
8 changes: 6 additions & 2 deletions python/sglang/multimodal_gen/runtime/entrypoints/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,12 +45,12 @@ def prepare_sampling_params(
# Validate dimensions
if sampling_params.num_frames <= 0:
raise ValueError(
f"Height, width, and num_frames must be positive integers, got "
f"height, width, and num_frames must be positive integers, got "
f"height={sampling_params.height}, width={sampling_params.width}, "
f"num_frames={sampling_params.num_frames}"
)

if pipeline_config.task_type.is_image_task():
if pipeline_config.task_type.is_image_gen():
# settle num_frames
logger.debug(f"Setting num_frames to 1 because this is a image-gen model")
sampling_params.num_frames = 1
Expand Down Expand Up @@ -104,6 +104,10 @@ def prepare_sampling_params(
)
sampling_params.num_frames = new_num_frames

sampling_params.num_frames = server_args.pipeline_config.adjust_num_frames(
sampling_params.num_frames
)

sampling_params.set_output_file_ext()
sampling_params.log(server_args=server_args)
return sampling_params
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -235,7 +235,7 @@ def __post_init__(self):

def set_width_and_height(self, server_args: ServerArgs):
if self.height is None or self.width is None:
width, height = server_args.pipeline_config.set_width_and_height(
width, height = server_args.pipeline_config.adjust_size(
self.width, self.height, self.pil_image
)
self.width = width
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -123,7 +123,7 @@ def forward(
if isinstance(server_args.pipeline_config, QwenImageEditPipelineConfig):
height = None if batch.height_not_provided else batch.height
width = None if batch.width_not_provided else batch.width
width, height = server_args.pipeline_config.set_width_and_height(
width, height = server_args.pipeline_config.adjust_size(
height, width, batch.pil_image
)
batch.width = width
Expand Down
2 changes: 1 addition & 1 deletion python/sglang/multimodal_gen/runtime/server_args.py
Original file line number Diff line number Diff line change
Expand Up @@ -803,7 +803,7 @@ def get_provided_args(

def check_server_sp_args(self):

if self.pipeline_config.task_type.is_image_task():
if self.pipeline_config.task_type.is_image_gen():
if (
(self.sp_degree and self.sp_degree > 1)
or (self.ulysses_degree and self.ulysses_degree > 1)
Expand Down
32 changes: 16 additions & 16 deletions python/sglang/multimodal_gen/test/server/diffusion_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -180,22 +180,22 @@ class PerformanceSummary:
startup_grace_seconds=30.0,
custom_validator="video",
),
# # === Image to Video (I2V) ===
# DiffusionCase(
# id="wan2_1_i2v_480p",
# model_path="Wan-AI/Wan2.1-I2V-14B-Diffusers",
# scenario_name="image_to_video",
# modality="video",
# prompt="generate", # passing in something since failing if no prompt is passed
# warmup_text=0, # warmups only for image gen models
# warmup_edit=0,
# output_size="1024x1536",
# image_edit_prompt="generate",
# image_edit_path="https://github.com/lm-sys/lm-sys.github.io/releases/download/test/TI2I_Qwen_Image_Edit_Input.jpg",
# startup_grace_seconds=30.0,
# custom_validator="video",
# seconds=4,
# ),
# === Image to Video (I2V) ===
DiffusionCase(
id="wan2_2_i2v",
model_path="Wan-AI/Wan2.2-I2V-A14B-Diffusers",
scenario_name="image_to_video",
modality="video",
prompt="generate", # passing in something since failing if no prompt is passed
warmup_text=0, # warmups only for image gen models
warmup_edit=0,
output_size="832x1104",
image_edit_prompt="generate",
image_edit_path="https://github.com/Wan-Video/Wan2.2/blob/990af50de458c19590c245151197326e208d7191/examples/i2v_input.JPG?raw=true",
startup_grace_seconds=30.0,
custom_validator="video",
seconds=1,
),
# === Text and Image to Video (TI2V) ===
DiffusionCase(
id="wan2_2_ti2v_5b",
Expand Down
Loading
Loading