diff --git a/.github/workflows/pr-test.yml b/.github/workflows/pr-test.yml index 33f53fc0e150..9eff3a9de59d 100644 --- a/.github/workflows/pr-test.yml +++ b/.github/workflows/pr-test.yml @@ -322,9 +322,11 @@ jobs: - name: Clean Corrupted Hugging Face Model Cache run: | - echo "Temp: Deleting potentially corrupted Qwen/Qwen-Image and Qwen/Qwen-Image-Edit cache to ensure a fresh download." + echo "Temp: Deleting potentially corrupted Qwen/Qwen-Image and Qwen/Qwen-Image-Edit cache to ensure a fresh download. This is temporary." rm -rf /hf_home/hub/models--Qwen--Qwen-Image rm -rf /hf_home/hub/models--Qwen--Qwen-Image-Edit + rm -rf /hf_home/hub/models--Wan-AI--Wan2.2-I2V-A14B-Diffusers + rm -rf /hf_home/hub/models--Wan-AI--Wan2.2-TI2V-5B-Diffusers - name: Run diffusion server tests timeout-minutes: 60 @@ -1006,7 +1008,6 @@ jobs: exit 1 fi done - # If the loop completes, all jobs were successful echo "All jobs completed successfully" exit 0 diff --git a/python/sglang/multimodal_gen/configs/pipelines/base.py b/python/sglang/multimodal_gen/configs/pipelines/base.py index 8e6039bd7f1c..bbde1570ebde 100644 --- a/python/sglang/multimodal_gen/configs/pipelines/base.py +++ b/python/sglang/multimodal_gen/configs/pipelines/base.py @@ -37,7 +37,7 @@ class ModelTaskType(Enum): T2I = auto() # Text to Image I2I = auto() # Image to Image - def is_image_task(self): + def is_image_gen(self): return self == ModelTaskType.T2I or self == ModelTaskType.I2I @@ -134,12 +134,15 @@ def postprocess_image(self, image): def slice_noise_pred(self, noise, latents): return noise - def set_width_and_height(self, width, height, image): + def adjust_size(self, width, height, image): """ image: input image """ return width, height + def adjust_num_frames(self, num_frames): + return num_frames + # called in ImageEncodingStage, preprocess the image def preprocess_image(self, image, image_processor: VaeImageProcessor): return image diff --git a/python/sglang/multimodal_gen/configs/pipelines/qwen_image.py 
b/python/sglang/multimodal_gen/configs/pipelines/qwen_image.py index 48de3d41a19d..54c85bdcbdd9 100644 --- a/python/sglang/multimodal_gen/configs/pipelines/qwen_image.py +++ b/python/sglang/multimodal_gen/configs/pipelines/qwen_image.py @@ -273,7 +273,7 @@ def preprocess_image(self, image, image_processor): image = image_processor.resize(image, calculated_height, calculated_width) return image - def set_width_and_height(self, width, height, image): + def adjust_size(self, width, height, image): image_size = image[0].size if isinstance(image, list) else image.size calculated_width, calculated_height, _ = calculate_dimensions( 1024 * 1024, image_size[0] / image_size[1] diff --git a/python/sglang/multimodal_gen/configs/pipelines/wan.py b/python/sglang/multimodal_gen/configs/pipelines/wan.py index d5efa2d64f71..d7068fba7a05 100644 --- a/python/sglang/multimodal_gen/configs/pipelines/wan.py +++ b/python/sglang/multimodal_gen/configs/pipelines/wan.py @@ -15,6 +15,9 @@ ) from sglang.multimodal_gen.configs.models.vaes import WanVAEConfig from sglang.multimodal_gen.configs.pipelines.base import ModelTaskType, PipelineConfig +from sglang.multimodal_gen.runtime.utils.logging_utils import init_logger + +logger = init_logger(__name__) def t5_postprocess_text(outputs: BaseEncoderOutput, _text_inputs) -> torch.Tensor: @@ -33,6 +36,22 @@ def t5_postprocess_text(outputs: BaseEncoderOutput, _text_inputs) -> torch.Tenso return prompt_embeds_tensor +@dataclass +class WanI2VCommonConfig(PipelineConfig): + # for all wan i2v pipelines + def adjust_num_frames(self, num_frames): + vae_scale_factor_temporal = self.vae_config.arch_config.scale_factor_temporal + if num_frames % vae_scale_factor_temporal != 1: + logger.warning( + f"`num_frames - 1` has to be divisible by {vae_scale_factor_temporal}. Rounding to the nearest number." 
+ ) + num_frames = ( + num_frames // vae_scale_factor_temporal * vae_scale_factor_temporal + 1 + ) + return num_frames + return num_frames + + @dataclass class WanT2V480PConfig(PipelineConfig): """Base configuration for Wan T2V 1.3B pipeline architecture.""" @@ -81,7 +100,7 @@ class WanT2V720PConfig(WanT2V480PConfig): @dataclass -class WanI2V480PConfig(WanT2V480PConfig): +class WanI2V480PConfig(WanT2V480PConfig, WanI2VCommonConfig): """Base configuration for Wan I2V 14B 480P pipeline architecture.""" # WanConfig-specific parameters with defaults @@ -128,7 +147,7 @@ class FastWan2_1_T2V_480P_Config(WanT2V480PConfig): @dataclass -class Wan2_2_TI2V_5B_Config(WanT2V480PConfig): +class Wan2_2_TI2V_5B_Config(WanT2V480PConfig, WanI2VCommonConfig): flow_shift: float | None = 5.0 task_type: ModelTaskType = ModelTaskType.TI2V expand_timesteps: bool = True diff --git a/python/sglang/multimodal_gen/runtime/entrypoints/diffusion_generator.py b/python/sglang/multimodal_gen/runtime/entrypoints/diffusion_generator.py index 962a880ad452..1c376634a9c3 100644 --- a/python/sglang/multimodal_gen/runtime/entrypoints/diffusion_generator.py +++ b/python/sglang/multimodal_gen/runtime/entrypoints/diffusion_generator.py @@ -259,7 +259,7 @@ def generate( # TODO: simplify data_type = ( DataType.IMAGE - if self.server_args.pipeline_config.task_type.is_image_task() + if self.server_args.pipeline_config.task_type.is_image_gen() or pretrained_sampling_params.num_frames == 1 else DataType.VIDEO ) diff --git a/python/sglang/multimodal_gen/runtime/entrypoints/utils.py b/python/sglang/multimodal_gen/runtime/entrypoints/utils.py index 62e5c842d170..a4f8744697fb 100644 --- a/python/sglang/multimodal_gen/runtime/entrypoints/utils.py +++ b/python/sglang/multimodal_gen/runtime/entrypoints/utils.py @@ -45,12 +45,12 @@ def prepare_sampling_params( # Validate dimensions if sampling_params.num_frames <= 0: raise ValueError( - f"Height, width, and num_frames must be positive integers, got " + f"height, width, 
and num_frames must be positive integers, got " f"height={sampling_params.height}, width={sampling_params.width}, " f"num_frames={sampling_params.num_frames}" ) - if pipeline_config.task_type.is_image_task(): + if pipeline_config.task_type.is_image_gen(): # settle num_frames logger.debug(f"Setting num_frames to 1 because this is a image-gen model") sampling_params.num_frames = 1 @@ -104,6 +104,10 @@ def prepare_sampling_params( ) sampling_params.num_frames = new_num_frames + sampling_params.num_frames = server_args.pipeline_config.adjust_num_frames( + sampling_params.num_frames + ) + sampling_params.set_output_file_ext() sampling_params.log(server_args=server_args) return sampling_params diff --git a/python/sglang/multimodal_gen/runtime/pipelines/schedule_batch.py b/python/sglang/multimodal_gen/runtime/pipelines/schedule_batch.py index 06dded31bc3f..ef60d8f5ec7f 100644 --- a/python/sglang/multimodal_gen/runtime/pipelines/schedule_batch.py +++ b/python/sglang/multimodal_gen/runtime/pipelines/schedule_batch.py @@ -235,7 +235,7 @@ def __post_init__(self): def set_width_and_height(self, server_args: ServerArgs): if self.height is None or self.width is None: - width, height = server_args.pipeline_config.set_width_and_height( + width, height = server_args.pipeline_config.adjust_size( self.width, self.height, self.pil_image ) self.width = width diff --git a/python/sglang/multimodal_gen/runtime/pipelines/stages/input_validation.py b/python/sglang/multimodal_gen/runtime/pipelines/stages/input_validation.py index 49a618055261..b992c7791b74 100644 --- a/python/sglang/multimodal_gen/runtime/pipelines/stages/input_validation.py +++ b/python/sglang/multimodal_gen/runtime/pipelines/stages/input_validation.py @@ -123,7 +123,7 @@ def forward( if isinstance(server_args.pipeline_config, QwenImageEditPipelineConfig): height = None if batch.height_not_provided else batch.height width = None if batch.width_not_provided else batch.width - width, height = 
server_args.pipeline_config.set_width_and_height( + width, height = server_args.pipeline_config.adjust_size( height, width, batch.pil_image ) batch.width = width diff --git a/python/sglang/multimodal_gen/runtime/server_args.py b/python/sglang/multimodal_gen/runtime/server_args.py index 43ae2aff6539..71de931d32b5 100644 --- a/python/sglang/multimodal_gen/runtime/server_args.py +++ b/python/sglang/multimodal_gen/runtime/server_args.py @@ -803,7 +803,7 @@ def get_provided_args( def check_server_sp_args(self): - if self.pipeline_config.task_type.is_image_task(): + if self.pipeline_config.task_type.is_image_gen(): if ( (self.sp_degree and self.sp_degree > 1) or (self.ulysses_degree and self.ulysses_degree > 1) diff --git a/python/sglang/multimodal_gen/test/server/diffusion_config.py b/python/sglang/multimodal_gen/test/server/diffusion_config.py index 16381c608897..11131a2979ee 100644 --- a/python/sglang/multimodal_gen/test/server/diffusion_config.py +++ b/python/sglang/multimodal_gen/test/server/diffusion_config.py @@ -180,22 +180,22 @@ class PerformanceSummary: startup_grace_seconds=30.0, custom_validator="video", ), - # # === Image to Video (I2V) === - # DiffusionCase( - # id="wan2_1_i2v_480p", - # model_path="Wan-AI/Wan2.1-I2V-14B-Diffusers", - # scenario_name="image_to_video", - # modality="video", - # prompt="generate", # passing in something since failing if no prompt is passed - # warmup_text=0, # warmups only for image gen models - # warmup_edit=0, - # output_size="1024x1536", - # image_edit_prompt="generate", - # image_edit_path="https://github.com/lm-sys/lm-sys.github.io/releases/download/test/TI2I_Qwen_Image_Edit_Input.jpg", - # startup_grace_seconds=30.0, - # custom_validator="video", - # seconds=4, - # ), + # === Image to Video (I2V) === + DiffusionCase( + id="wan2_2_i2v", + model_path="Wan-AI/Wan2.2-I2V-A14B-Diffusers", + scenario_name="image_to_video", + modality="video", + prompt="generate", # passing in something since failing if no prompt is passed + 
warmup_text=0, # warmups only for image gen models + warmup_edit=0, + output_size="832x1104", + image_edit_prompt="generate", + image_edit_path="https://github.com/Wan-Video/Wan2.2/blob/990af50de458c19590c245151197326e208d7191/examples/i2v_input.JPG?raw=true", + startup_grace_seconds=30.0, + custom_validator="video", + seconds=1, + ), # === Text and Image to Video (TI2V) === DiffusionCase( id="wan2_2_ti2v_5b", diff --git a/python/sglang/multimodal_gen/test/server/perf_baselines.json b/python/sglang/multimodal_gen/test/server/perf_baselines.json index 473343c9dab1..cfb0c5ccbaf3 100644 --- a/python/sglang/multimodal_gen/test/server/perf_baselines.json +++ b/python/sglang/multimodal_gen/test/server/perf_baselines.json @@ -1,142 +1,155 @@ -{ - "metadata": { - "model": "Diffusion Server", - "hardware": "CI H100 80GB pool", - "description": "Reference numbers captured from the CI diffusion server baseline run" - }, - "tolerances": { - "e2e": 0.25, - "stage": 0.3, - "denoise_step": 0.1, - "denoise_agg": 0.1 - }, - "sampling": { - "step_fractions": [ - 0.0, - 0.2, - 0.4, - 0.6, - 0.8, - 1.0 - ], - "warmup_requests": { - "text": 1, - "image_edit": 0 - } - }, - "scenarios": { - "text_to_image": { - "notes": "Single-image generation using the default prompt", - "expected_e2e_ms": 74500.0, - "expected_avg_denoise_ms": 422.42, - "expected_median_denoise_ms": 410.62, - "stages_ms": { - "InputValidationStage": 0.1, - "TextEncodingStage": 834.2, - "ConditioningStage": 0.1, - "TimestepPreparationStage": 10.6, - "LatentPreparationStage": 9.0, - "DenoisingStage": 21202.6, - "DecodingStage": 476.12 - }, - "denoise_step_ms": { - "0": 1077.77, "1": 345.13, "2": 413.8, "3": 405.49, "4": 408.14, "5": 409.06, - "6": 408.85, "7": 410.53, "8": 407.51, "9": 409.44, "10": 408.65, "11": 410.14, - "12": 411.74, "13": 409.59, "14": 409.17, "15": 410.78, "16": 410.66, "17": 410.58, - "18": 411.27, "19": 410.51, "20": 409.03, "21": 410.16, "22": 409.42, "23": 411.03, - "24": 410.18, "25": 409.72, 
"26": 410.26, "27": 410.21, "28": 410.71, "29": 410.76, - "30": 411.06, "31": 410.1, "32": 410.55, "33": 410.77, "34": 410.74, "35": 411.75, - "36": 410.78, "37": 411.56, "38": 410.85, "39": 411.08, "40": 411.12, "41": 411.1, - "42": 411.09, "43": 410.87, "44": 411.37, "45": 411.68, "46": 411.0, "47": 410.09, - "48": 412.72, "49": 410.42 - } - }, - "image_edit": { - "notes": "single uploaded reference image, Qwen/Qwen-Image-Edit", - "expected_e2e_ms": 138500.0, - "expected_avg_denoise_ms": 720.0, - "expected_median_denoise_ms": 718.0, - "stages_ms": { - "InputValidationStage": 23, - "ImageEncodingStage": 990.0, - "ImageVAEEncodingStage": 340.0, - "ConditioningStage": 0.13, - "TimestepPreparationStage": 13.78, - "LatentPreparationStage": 10.0, - "DenoisingStage": 36000.0, - "DecodingStage": 645 - }, - "denoise_step_ms": { - "0": 720.0, "1": 720.0, "2": 720.0, "3": 720.0, "4": 720.0, "5": 720.0, - "6": 720.0, "7": 720.0, "8": 720.0, "9": 720.0, "10": 720.0, "11": 720.0, - "12": 720.0, "13": 720.0, "14": 720.0, "15": 720.0, "16": 720.0, "17": 720.0, - "18": 720.0, "19": 720.0, "20": 720.0, "21": 720.0, "22": 720.0, "23": 720.0, - "24": 720.0, "25": 720.0, "26": 720.0, "27": 720.0, "28": 720.0, "29": 720.0, - "30": 720.0, "31": 720.0, "32": 720.0, "33": 720.0, "34": 720.0, "35": 720.0, - "36": 720.0, "37": 720.0, "38": 720.0, "39": 720.0, "40": 720.0, "41": 720.0, - "42": 720.0, "43": 720.0, "44": 720.0, "45": 720.0, "46": 720.0, "47": 720.0, - "48": 720.0, "49": 720.0 - } - }, - "text_to_video": { - "notes": "Single-video generation using the default prompt", - "expected_e2e_ms": 95616.59, - "expected_avg_denoise_ms": 1798.77, - "expected_median_denoise_ms": 1786.78, - - "stages_ms": { - "InputValidationStage": 1.03, - "TextEncodingStage": 3450.0, - "ConditioningStage": 1.0, - "TimestepPreparationStage": 6.0, - "LatentPreparationStage": 15.0, - "DenoisingStage": 90100.0, - "DecodingStage": 3650.0 - }, - - "denoise_step_ms": { - "0": 3500.0, "10": 1800.0, "20": 1800.0, 
"29": 1800.0, "39": 1800.0, "49": 1800.0 - }, - "frames_per_second": 0.51, - "total_frames": 49, - "avg_frame_time_ms": 1951.36 - }, - "image_to_video": { - "notes": "Image-to-Video generation baseline placeholder: TODO(bug)", - "expected_e2e_ms": 1000000000.0, - "expected_avg_denoise_ms": 1000000000.0, - "expected_median_denoise_ms": 1000000000.0, - "stages_ms": {}, - "denoise_step_ms": {}, - "frames_per_second": null, - "total_frames": null, - "avg_frame_time_ms": null - }, - "text_image_to_video": { - "notes": "Text-and-Image-to-Video generation baseline for Wan2.2-TI2V-5B", - "expected_e2e_ms": 178300.0, - "expected_avg_denoise_ms": 3250.0, - "expected_median_denoise_ms": 3260.0, - "stages_ms": { - "InputValidationStage": 80.0, - "TextEncodingStage": 3000.0, - "ConditioningStage": 1.0, - "TimestepPreparationStage": 6.0, - "LatentPreparationStage": 30.0, - "DenoisingStage": 162900.0, - "DecodingStage": 13500.0 - }, - "denoise_step_ms": { - "0": 3700.0, - "10": 3300.0, - "20": 3300.0, - "29": 3300.0, - "39": 3300.0, - "49": 3300.0 - }, - "frames_per_second": null, - "total_frames": null, - "avg_frame_time_ms": null - } - } -} +{ + "metadata": { + "model": "Diffusion Server", + "hardware": "CI H100 80GB pool", + "description": "Reference numbers captured from the CI diffusion server baseline run" + }, + "tolerances": { + "e2e": 0.25, + "stage": 0.3, + "denoise_step": 0.2, + "denoise_agg": 0.1 + }, + "sampling": { + "step_fractions": [ + 0.0, + 0.2, + 0.4, + 0.6, + 0.8, + 1.0 + ], + "warmup_requests": { + "text": 1, + "image_edit": 0 + } + }, + "scenarios": { + "text_to_image": { + "notes": "Single-image generation using the default prompt", + "expected_e2e_ms": 74500.0, + "expected_avg_denoise_ms": 422.42, + "expected_median_denoise_ms": 410.62, + "stages_ms": { + "InputValidationStage": 0.1, + "TextEncodingStage": 834.2, + "ConditioningStage": 0.1, + "TimestepPreparationStage": 10.6, + "LatentPreparationStage": 9.0, + "DenoisingStage": 21202.6, + "DecodingStage": 
476.12 + }, + "denoise_step_ms": { + "0": 1077.77, "1": 345.13, "2": 413.8, "3": 405.49, "4": 408.14, "5": 409.06, + "6": 408.85, "7": 410.53, "8": 407.51, "9": 409.44, "10": 408.65, "11": 410.14, + "12": 411.74, "13": 409.59, "14": 409.17, "15": 410.78, "16": 410.66, "17": 410.58, + "18": 411.27, "19": 410.51, "20": 409.03, "21": 410.16, "22": 409.42, "23": 411.03, + "24": 410.18, "25": 409.72, "26": 410.26, "27": 410.21, "28": 410.71, "29": 410.76, + "30": 411.06, "31": 410.1, "32": 410.55, "33": 410.77, "34": 410.74, "35": 411.75, + "36": 410.78, "37": 411.56, "38": 410.85, "39": 411.08, "40": 411.12, "41": 411.1, + "42": 411.09, "43": 410.87, "44": 411.37, "45": 411.68, "46": 411.0, "47": 410.09, + "48": 412.72, "49": 410.42 + } + }, + "image_edit": { + "notes": "single uploaded reference image, Qwen/Qwen-Image-Edit", + "expected_e2e_ms": 138500.0, + "expected_avg_denoise_ms": 720.0, + "expected_median_denoise_ms": 718.0, + "stages_ms": { + "InputValidationStage": 23, + "ImageEncodingStage": 1350.0, + "ImageVAEEncodingStage": 340.0, + "ConditioningStage": 0.13, + "TimestepPreparationStage": 13.78, + "LatentPreparationStage": 10.0, + "DenoisingStage": 36000.0, + "DecodingStage": 850.0 + }, + "denoise_step_ms": { + "0": 720.0, "1": 720.0, "2": 720.0, "3": 720.0, "4": 720.0, "5": 720.0, + "6": 720.0, "7": 720.0, "8": 720.0, "9": 720.0, "10": 720.0, "11": 720.0, + "12": 720.0, "13": 720.0, "14": 720.0, "15": 720.0, "16": 720.0, "17": 720.0, + "18": 720.0, "19": 720.0, "20": 720.0, "21": 720.0, "22": 720.0, "23": 720.0, + "24": 720.0, "25": 720.0, "26": 720.0, "27": 720.0, "28": 720.0, "29": 720.0, + "30": 720.0, "31": 720.0, "32": 720.0, "33": 720.0, "34": 720.0, "35": 720.0, + "36": 720.0, "37": 720.0, "38": 720.0, "39": 720.0, "40": 720.0, "41": 720.0, + "42": 720.0, "43": 720.0, "44": 720.0, "45": 720.0, "46": 720.0, "47": 720.0, + "48": 720.0, "49": 720.0 + } + }, + "text_to_video": { + "notes": "Single-video generation using the default prompt", + 
"expected_e2e_ms": 95616.59, + "expected_avg_denoise_ms": 1798.77, + "expected_median_denoise_ms": 1786.78, + + "stages_ms": { + "InputValidationStage": 1.03, + "TextEncodingStage": 3450.0, + "ConditioningStage": 1.0, + "TimestepPreparationStage": 6.0, + "LatentPreparationStage": 15.0, + "DenoisingStage": 90100.0, + "DecodingStage": 3650.0 + }, + + "denoise_step_ms": { + "0": 3500.0, "10": 1800.0, "20": 1800.0, "29": 1800.0, "39": 1800.0, "49": 1800.0 + }, + "frames_per_second": 0.51, + "total_frames": 49, + "avg_frame_time_ms": 1951.36 + }, + "image_to_video": { + "notes": "Wan-AI/Wan2.2-I2V-A14B", + "expected_e2e_ms": 282500.0, + "expected_avg_denoise_ms": 7000.0, + "expected_median_denoise_ms": 7000.19, + "stages_ms": { + "InputValidationStage": 20.0, + "TextEncodingStage": 2100.0, + "ConditioningStage": 2.0, + "TimestepPreparationStage": 2.0, + "LatentPreparationStage": 10.0, + "ImageVAEEncodingStage": 1800.0, + "DenoisingStage": 278000.0, + "DecodingStage": 2700.0 + }, + "denoise_step_ms": { + "0": 24000.0, + "8": 7000.0, + "16": 7000.0, + "23": 7000.0, + "31": 7000.0, + "39": 7000.0 + } + }, + "text_image_to_video": { + "notes": "Text-and-Image-to-Video generation baseline for Wan2.2-TI2V-5B", + "expected_e2e_ms": 178300.0, + "expected_avg_denoise_ms": 3250.0, + "expected_median_denoise_ms": 3260.0, + "stages_ms": { + "InputValidationStage": 80.0, + "TextEncodingStage": 3000.0, + "ConditioningStage": 1.0, + "TimestepPreparationStage": 6.0, + "LatentPreparationStage": 30.0, + "DenoisingStage": 162900.0, + "DecodingStage": 13500.0 + }, + "denoise_step_ms": { + "0": 3700.0, + "10": 3300.0, + "20": 3300.0, + "29": 3300.0, + "39": 3300.0, + "49": 3300.0 + }, + "frames_per_second": null, + "total_frames": null, + "avg_frame_time_ms": null + } + } +}