Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 3 additions & 2 deletions .github/workflows/pr-test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -322,9 +322,11 @@ jobs:

- name: Clean Corrupted Hugging Face Model Cache
run: |
echo "Temp: Deleting potentially corrupted Qwen/Qwen-Image and Qwen/Qwen-Image-Edit cache to ensure a fresh download."
echo "Temp: Deleting potentially corrupted Qwen/Qwen-Image and Qwen/Qwen-Image-Edit cache to ensure a fresh download. This is temporary"
rm -rf /hf_home/hub/models--Qwen--Qwen-Image
rm -rf /hf_home/hub/models--Qwen--Qwen-Image-Edit
rm -rf /hf_home/hub/models--Wan-AI--Wan2.2-I2V-A14B-Diffusers
rm -rf /hf_home/hub/models--Wan-AI--Wan2.2-TI2V-5B-Diffusers

- name: Run diffusion server tests
timeout-minutes: 60
Expand Down Expand Up @@ -1006,7 +1008,6 @@ jobs:
exit 1
fi
done

# If the loop completes, all jobs were successful
echo "All jobs completed successfully"
exit 0
7 changes: 5 additions & 2 deletions python/sglang/multimodal_gen/configs/pipelines/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ class ModelTaskType(Enum):
T2I = auto() # Text to Image
I2I = auto() # Image to Image

def is_image_task(self):
def is_image_gen(self):
    """Return True when this task type produces images (T2I or I2I), not video."""
    return self == ModelTaskType.T2I or self == ModelTaskType.I2I


Expand Down Expand Up @@ -134,12 +134,15 @@ def postprocess_image(self, image):
def slice_noise_pred(self, noise, latents):
return noise

def set_width_and_height(self, width, height, image):
def adjust_size(self, width, height, image):
    """Default hook: return (width, height) unchanged.

    Subclasses (e.g. Qwen image-edit config) override this to derive output
    dimensions from the input image.

    image: input image
    """
    return width, height

def adjust_num_frames(self, num_frames):
    # Default hook: no adjustment. Overridden by video pipelines (e.g. Wan I2V)
    # that must align num_frames with the VAE temporal scale factor.
    return num_frames

# called in ImageEncodingStage, preprocess the image
def preprocess_image(self, image, image_processor: VaeImageProcessor):
return image
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -273,7 +273,7 @@ def preprocess_image(self, image, image_processor):
image = image_processor.resize(image, calculated_height, calculated_width)
return image

def set_width_and_height(self, width, height, image):
def adjust_size(self, width, height, image):
image_size = image[0].size if isinstance(image, list) else image.size
calculated_width, calculated_height, _ = calculate_dimensions(
1024 * 1024, image_size[0] / image_size[1]
Expand Down
23 changes: 21 additions & 2 deletions python/sglang/multimodal_gen/configs/pipelines/wan.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,9 @@
)
from sglang.multimodal_gen.configs.models.vaes import WanVAEConfig
from sglang.multimodal_gen.configs.pipelines.base import ModelTaskType, PipelineConfig
from sglang.multimodal_gen.runtime.utils.logging_utils import init_logger

logger = init_logger(__name__)


def t5_postprocess_text(outputs: BaseEncoderOutput, _text_inputs) -> torch.Tensor:
Expand All @@ -33,6 +36,22 @@ def t5_postprocess_text(outputs: BaseEncoderOutput, _text_inputs) -> torch.Tenso
return prompt_embeds_tensor


@dataclass
class WanI2VCommonConfig(PipelineConfig):
    """Shared behavior for all Wan I2V pipeline configs.

    Provides the num_frames alignment required by the Wan temporal VAE.
    """

    def adjust_num_frames(self, num_frames):
        """Align ``num_frames`` so that ``num_frames - 1`` is divisible by the
        VAE temporal scale factor.

        Returns the adjusted (possibly reduced) frame count.
        """
        vae_scale_factor_temporal = self.vae_config.arch_config.scale_factor_temporal
        if num_frames % vae_scale_factor_temporal != 1:
            logger.warning(
                f"`num_frames - 1` has to be divisible by {vae_scale_factor_temporal}. Rounding to the nearest number."
            )
            # NOTE(review): despite the message, this rounds DOWN to the nearest
            # valid count of the form k * scale_factor + 1 (matches diffusers).
            num_frames = (
                num_frames // vae_scale_factor_temporal * vae_scale_factor_temporal + 1
            )
        # Single exit point: the original had a redundant duplicate
        # `return num_frames` both inside and after the `if`.
        return num_frames


@dataclass
class WanT2V480PConfig(PipelineConfig):
"""Base configuration for Wan T2V 1.3B pipeline architecture."""
Expand Down Expand Up @@ -81,7 +100,7 @@ class WanT2V720PConfig(WanT2V480PConfig):


@dataclass
class WanI2V480PConfig(WanT2V480PConfig):
class WanI2V480PConfig(WanT2V480PConfig, WanI2VCommonConfig):
"""Base configuration for Wan I2V 14B 480P pipeline architecture."""

# WanConfig-specific parameters with defaults
Expand Down Expand Up @@ -128,7 +147,7 @@ class FastWan2_1_T2V_480P_Config(WanT2V480PConfig):


@dataclass
class Wan2_2_TI2V_5B_Config(WanT2V480PConfig):
class Wan2_2_TI2V_5B_Config(WanT2V480PConfig, WanI2VCommonConfig):
flow_shift: float | None = 5.0
task_type: ModelTaskType = ModelTaskType.TI2V
expand_timesteps: bool = True
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -259,7 +259,7 @@ def generate(
# TODO: simplify
data_type = (
DataType.IMAGE
if self.server_args.pipeline_config.task_type.is_image_task()
if self.server_args.pipeline_config.task_type.is_image_gen()
or pretrained_sampling_params.num_frames == 1
else DataType.VIDEO
)
Expand Down
8 changes: 6 additions & 2 deletions python/sglang/multimodal_gen/runtime/entrypoints/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,12 +45,12 @@ def prepare_sampling_params(
# Validate dimensions
if sampling_params.num_frames <= 0:
raise ValueError(
f"Height, width, and num_frames must be positive integers, got "
f"height, width, and num_frames must be positive integers, got "
f"height={sampling_params.height}, width={sampling_params.width}, "
f"num_frames={sampling_params.num_frames}"
)

if pipeline_config.task_type.is_image_task():
if pipeline_config.task_type.is_image_gen():
# settle num_frames
logger.debug(f"Setting num_frames to 1 because this is a image-gen model")
sampling_params.num_frames = 1
Expand Down Expand Up @@ -104,6 +104,10 @@ def prepare_sampling_params(
)
sampling_params.num_frames = new_num_frames

sampling_params.num_frames = server_args.pipeline_config.adjust_num_frames(
sampling_params.num_frames
)

sampling_params.set_output_file_ext()
sampling_params.log(server_args=server_args)
return sampling_params
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -235,7 +235,7 @@ def __post_init__(self):

def set_width_and_height(self, server_args: ServerArgs):
if self.height is None or self.width is None:
width, height = server_args.pipeline_config.set_width_and_height(
width, height = server_args.pipeline_config.adjust_size(
self.width, self.height, self.pil_image
)
self.width = width
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -123,7 +123,7 @@ def forward(
if isinstance(server_args.pipeline_config, QwenImageEditPipelineConfig):
height = None if batch.height_not_provided else batch.height
width = None if batch.width_not_provided else batch.width
width, height = server_args.pipeline_config.set_width_and_height(
width, height = server_args.pipeline_config.adjust_size(
height, width, batch.pil_image
)
batch.width = width
Expand Down
2 changes: 1 addition & 1 deletion python/sglang/multimodal_gen/runtime/server_args.py
Original file line number Diff line number Diff line change
Expand Up @@ -803,7 +803,7 @@ def get_provided_args(

def check_server_sp_args(self):

if self.pipeline_config.task_type.is_image_task():
if self.pipeline_config.task_type.is_image_gen():
if (
(self.sp_degree and self.sp_degree > 1)
or (self.ulysses_degree and self.ulysses_degree > 1)
Expand Down
32 changes: 16 additions & 16 deletions python/sglang/multimodal_gen/test/server/diffusion_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -180,22 +180,22 @@ class PerformanceSummary:
startup_grace_seconds=30.0,
custom_validator="video",
),
# # === Image to Video (I2V) ===
# DiffusionCase(
# id="wan2_1_i2v_480p",
# model_path="Wan-AI/Wan2.1-I2V-14B-Diffusers",
# scenario_name="image_to_video",
# modality="video",
# prompt="generate", # passing in something since failing if no prompt is passed
# warmup_text=0, # warmups only for image gen models
# warmup_edit=0,
# output_size="1024x1536",
# image_edit_prompt="generate",
# image_edit_path="https://github.com/lm-sys/lm-sys.github.io/releases/download/test/TI2I_Qwen_Image_Edit_Input.jpg",
# startup_grace_seconds=30.0,
# custom_validator="video",
# seconds=4,
# ),
# === Image to Video (I2V) ===
DiffusionCase(
id="wan2_2_i2v",
model_path="Wan-AI/Wan2.2-I2V-A14B-Diffusers",
scenario_name="image_to_video",
modality="video",
prompt="generate", # passing in something since failing if no prompt is passed
warmup_text=0, # warmups only for image gen models
warmup_edit=0,
output_size="832x1104",
image_edit_prompt="generate",
image_edit_path="https://github.com/Wan-Video/Wan2.2/blob/990af50de458c19590c245151197326e208d7191/examples/i2v_input.JPG?raw=true",
startup_grace_seconds=30.0,
custom_validator="video",
seconds=1,
),
# === Text and Image to Video (TI2V) ===
DiffusionCase(
id="wan2_2_ti2v_5b",
Expand Down
Loading
Loading