diff --git a/examples/offline_inference/text_to_image/README.md b/examples/offline_inference/text_to_image/README.md index 235b710a68..4796a17692 100644 --- a/examples/offline_inference/text_to_image/README.md +++ b/examples/offline_inference/text_to_image/README.md @@ -33,6 +33,7 @@ This folder provides several entrypoints for experimenting with text-to-image di | `black-forest-labs/FLUX.2-klein-4B` | 1024 x 1024 | 72.7 | 14.9 | | `black-forest-labs/FLUX.2-klein-9B` | 1024 x 1024 | 37.1 | 32.3 | | `black-forest-labs/FLUX.2-dev` | 1024 x 1024 | 65.7 | >80 (CPU offload required) | +| `HunyuanImage-3.0` | 1024 x 1024 | 80.0 (TP≥3) | 160 | !!! info *Peak VRAM: based on basic single-card usage, batch size =1, without any acceleration/optimization features. FLUX.2-dev requires `--enable-cpu-offload` on a single 80 GiB GPU. @@ -90,6 +91,8 @@ python text_to_image.py \ | `--enable-cpu-offload` | flag | off | Enable CPU offloading for diffusion models | | `--lora-path` | str | — | Path to PEFT LoRA adapter folder | | `--lora-scale` | float | `1.0` | Scale factor for LoRA weights | +| `--use-system-prompt` | str | `None` | System prompt preset: `en_unified`, `en_vanilla`, `en_recaption`, `en_think_recaption`, `dynamic`, `None`, or `custom` (pass the text via `--system-prompt`). Recommended: `en_unified`. Only for HunyuanImage-3.0.| +| `--system-prompt` | str | `None` | Custom system prompt text. Only used when `--use-system-prompt` is set to `custom`. 
Only for HunyuanImage-3.0.| **NextStep-1.1 specific arguments:** diff --git a/examples/offline_inference/text_to_image/text_to_image.py b/examples/offline_inference/text_to_image/text_to_image.py index 927b0f0b08..42e44abb89 100644 --- a/examples/offline_inference/text_to_image/text_to_image.py +++ b/examples/offline_inference/text_to_image/text_to_image.py @@ -242,6 +242,19 @@ def parse_args() -> argparse.Namespace: action="store_true", help="Enable logging of diffusion pipeline stats.", ) + parser.add_argument( + "--use-system-prompt", + type=str, + default=None, + choices=["None", "dynamic", "en_vanilla", "en_recaption", "en_think_recaption", "en_unified", "custom"], + help="System prompt preset for generation. Recommended: en_unified.", + ) + parser.add_argument( + "--system-prompt", + type=str, + default=None, + help=("Custom system prompt. Used when --use-system-prompt is custom. "), + ) return parser.parse_args() @@ -382,13 +395,13 @@ def main(): ) generation_start = time.perf_counter() - extra_args = { "timesteps_shift": args.timesteps_shift, "cfg_schedule": args.cfg_schedule, "use_norm": args.use_norm, + "use_system_prompt": args.use_system_prompt, + "system_prompt": args.system_prompt, } - if lora_request: extra_args["lora_request"] = lora_request extra_args["lora_scale"] = args.lora_scale diff --git a/examples/online_serving/text_to_image/README.md b/examples/online_serving/text_to_image/README.md index 87b6a56438..17d377ea3e 100644 --- a/examples/online_serving/text_to_image/README.md +++ b/examples/online_serving/text_to_image/README.md @@ -231,6 +231,8 @@ count, use `size` and `n` rather than `height`, `width`, or | `seed` | int | None | Random seed (reproducible) | | `negative_prompt` | str | None | Negative prompt | | `num_outputs_per_prompt` | int | 1 | Number of images to generate | +| `use_system_prompt` | str | None | System prompt preset: `en_unified`, `en_vanilla`, `en_recaption`, `en_think_recaption`, `dynamic`, `None`, or custom text string. 
Only for HunyuanImage-3.0. | +| `system_prompt` | str | None | Custom system prompt text. Only used when `use_system_prompt` is set to `custom`. Only for HunyuanImage-3.0. | ## Response Format diff --git a/examples/online_serving/text_to_image/openai_chat_client.py b/examples/online_serving/text_to_image/openai_chat_client.py index 828827aba2..f3c43086a1 100644 --- a/examples/online_serving/text_to_image/openai_chat_client.py +++ b/examples/online_serving/text_to_image/openai_chat_client.py @@ -28,6 +28,8 @@ def generate_image( lora_name: str | None = None, lora_scale: float | None = None, lora_int_id: int | None = None, + use_system_prompt: str | None = None, + system_prompt: str | None = None, ) -> bytes | None: """Generate an image using the images generation API. @@ -45,6 +47,8 @@ def generate_image( lora_name: LoRA name (optional, defaults to path stem) lora_scale: LoRA scale factor (default: 1.0) lora_int_id: LoRA integer ID (optional, derived from path if not provided) + use_system_prompt: System prompt for generation. + system_prompt: Custom system prompt. Returns: Image bytes or None if failed @@ -70,7 +74,10 @@ def generate_image( payload["negative_prompt"] = negative_prompt if seed is not None: payload["seed"] = seed - + if use_system_prompt is not None: + payload["use_system_prompt"] = use_system_prompt + if system_prompt is not None: + payload["system_prompt"] = system_prompt # Add LoRA if provided if lora_path: lora_body: dict = { @@ -128,9 +135,21 @@ def main(): default=None, help="LoRA integer id (cache key). If omitted, the server derives a stable id from lora_path.", ) - + parser.add_argument( + "--use-system-prompt", + type=str, + default=None, + help=( + "System prompt for generation. Use predefined types: 'en_unified', 'en_vanilla', 'en_recaption', 'en_think_recaption', 'dynamic', or 'None'; Or provide custom text string directly. Recommended en_unified. 
" + ), + ) + parser.add_argument( + "--system-prompt", + type=str, + default=None, + help=("Custom system prompt. Used when --use-system-prompt is custom. "), + ) args = parser.parse_args() - print(f"Generating image for: {args.prompt}") image_bytes = generate_image( @@ -146,6 +165,8 @@ def main(): lora_name=args.lora_name, lora_scale=args.lora_scale if args.lora_path else None, lora_int_id=args.lora_int_id if args.lora_path else None, + use_system_prompt=args.use_system_prompt, + system_prompt=args.system_prompt, ) if image_bytes: diff --git a/tests/e2e/offline_inference/test_hunyuanimage3_text2img.py b/tests/e2e/offline_inference/test_hunyuanimage3_text2img.py new file mode 100644 index 0000000000..5522f33eaa --- /dev/null +++ b/tests/e2e/offline_inference/test_hunyuanimage3_text2img.py @@ -0,0 +1,347 @@ +# ruff: noqa: E501 +from collections.abc import Generator +from pathlib import Path + +import pytest +import torch +import torch.nn.functional as F +from PIL import Image +from transformers import CLIPModel, CLIPProcessor + +from vllm_omni import Omni +from vllm_omni.inputs.data import OmniDiffusionSamplingParams +from vllm_omni.platforms import current_omni_platform + +PROMPT = "A brown and white dog is running on the grass" +MODEL_NAME = "tencent/HunyuanImage-3.0" +LOCAL_CLIP_PATH = "openai/clip-vit-base-patch32" +REPO_ROOT = Path(__file__).resolve().parents[3] +STAGE_CONFIG_PATH = REPO_ROOT / "vllm_omni" / "model_executor" / "stage_configs" / "hunyuan_image_3_moe.yaml" + +pytestmark = [pytest.mark.advanced_model, pytest.mark.diffusion] + +# System prompt type. Options: None, dynamic, en_vanilla, en_recaption, en_think_recaption, en_unified +# Below are the CLIP embedding tensors from the official HunyuanImage model (seed=1234, prompt: "A brown and white dog is running on the grass"). +# SEED_1234 denotes the output without system prompt, while the remaining entries correspond to outputs generated with different system prompts. 
+# fmt: off +SEED_1234 = torch.tensor( + [ + 0.027797, 0.028964, -0.005051, 0.001059, 0.017021, -0.034029, 0.021989, 0.033318, -0.000308, 0.016179, 0.010504, -0.034201, 0.050230, -0.021170, 0.083530, -0.003621, + 0.040758, 0.039913, 0.044305, -0.019285, -0.058387, -0.001099, 0.042782, -0.036136, -0.014955, 0.002147, 0.009439, 0.012943, -0.028732, -0.018349, 0.002861, 0.013019, + 0.014362, -0.038833, 0.029413, 0.020724, 0.002714, 0.010416, -0.020527, 0.050266, -0.081026, -0.006814, -0.007457, -0.032333, 0.008417, -0.122455, -0.006085, -0.025610, + 0.012614, 0.025817, -0.005419, 0.038657, 0.000789, 0.067111, 0.002818, 0.028696, 0.047305, -0.009993, -0.019508, 0.038604, 0.099657, 0.026728, 0.012361, 0.013626, + 0.023164, -0.037186, 0.007535, 0.054645, -0.009012, -0.019383, -0.005234, -0.018715, -0.000346, 0.051317, -0.028744, 0.029933, -0.006382, -0.018414, -0.033906, -0.028892, + -0.015301, -0.004276, 0.014626, -0.008505, 0.013717, -0.027323, -0.001332, -0.040227, 0.047021, -0.019082, -0.037260, -0.029780, -0.594026, 0.016573, -0.010523, 0.042616, + -0.013136, 0.030540, -0.151685, -0.005367, 0.016209, -0.034183, 0.009852, 0.038452, 0.005494, -0.017887, -0.007167, 0.017262, -0.038980, 0.011995, 0.021952, -0.031660, + 0.020507, -0.035880, 0.035183, -0.026975, -0.050788, -0.002553, 0.037774, -0.020082, -0.015403, 0.045022, 0.072167, -0.029237, 0.003895, -0.051250, 0.008581, 0.023545, + -0.026827, 0.020895, 0.041780, -0.040766, -0.008146, 0.080630, 0.000404, 0.032003, -0.005279, -0.090707, -0.013813, 0.010204, -0.001513, 0.016394, -0.001321, 0.020535, + -0.038645, 0.024858, 0.024378, 0.018717, -0.056314, 0.024402, 0.018694, 0.029009, -0.008502, -0.014694, -0.028345, 0.005202, 0.046116, -0.032166, -0.030706, -0.038738, + -0.031356, -0.009683, 0.040069, 0.001596, -0.012621, 0.018590, -0.024138, 0.035330, 0.011546, 0.015791, -0.026932, 0.004531, 0.022455, -0.012871, 0.013915, -0.009567, + -0.010976, 0.013497, 0.042590, 0.002072, -0.052718, -0.045494, 0.013036, -0.005403, 
-0.005947, -0.003437, 0.016653, -0.016805, -0.040291, 0.007927, 0.001296, -0.008319, + 0.021514, -0.001452, -0.121998, 0.015396, -0.022594, -0.006977, -0.040108, -0.035550, -0.021872, -0.014721, 0.019799, 0.036556, 0.015072, -0.057988, -0.011684, -0.045220, + -0.026295, 0.052647, 0.013741, -0.013428, 0.061794, 0.021431, -0.011316, -0.009963, 0.008198, 0.027746, 0.074219, -0.019499, 0.042673, 0.016028, 0.007214, -0.010650, + -0.019682, 0.001902, 0.038867, -0.007333, 0.031749, 0.004391, 0.018688, 0.044654, 0.030615, -0.027816, 0.031711, -0.056952, -0.033499, -0.039368, 0.025801, -0.027610, + -0.009329, -0.001799, 0.024061, -0.012593, -0.050266, -0.012512, 0.019528, -0.083434, 0.018238, 0.034138, -0.020120, -0.009910, -0.002280, 0.035325, 0.034440, -0.055205, + -0.017698, -0.000439, -0.034703, 0.013356, -0.037287, 0.048494, -0.018570, 0.028069, 0.019269, -0.007263, -0.008521, 0.000426, -0.016677, 0.056162, -0.011944, 0.017322, + 0.022219, -0.014266, -0.009292, -0.009979, 0.014973, 0.011623, -0.017799, 0.032925, -0.024668, 0.007312, -0.025035, -0.008967, -0.026827, 0.011889, -0.138517, -0.009608, + -0.020592, -0.001272, 0.015676, -0.025706, 0.031775, -0.004195, 0.026876, -0.014748, -0.025966, -0.008741, 0.035437, 0.017139, -0.005140, -0.007101, -0.012510, -0.023600, + 0.032969, -0.005510, 0.020010, 0.032567, 0.015558, 0.004265, -0.036300, 0.048210, 0.080424, -0.052820, -0.002063, -0.020875, 0.052530, -0.001638, -0.020299, -0.035202, + 0.087818, 0.034614, -0.032735, 0.033201, -0.001751, 0.029574, 0.009926, 0.011619, -0.001267, -0.020149, -0.003826, -0.029860, 0.011437, -0.051276, 0.024344, 0.003096, + -0.011573, 0.038228, -0.005730, -0.052328, 0.001909, -0.025877, 0.019976, -0.010160, 0.023892, 0.049161, -0.028978, 0.018700, -0.026460, 0.001090, -0.072128, -0.008406, + 0.010828, 0.020621, -0.005706, 0.023797, 0.036231, -0.112069, 0.017601, 0.007496, 0.045999, 0.016771, 0.021977, 0.022305, 0.018377, 0.002036, -0.029815, -0.082922, + -0.012710, -0.026355, 0.003790, 
0.017472, -0.023148, -0.002901, -0.057854, 0.028393, 0.230866, -0.023486, 0.051094, 0.047508, 0.018957, -0.037130, 0.001054, -0.026126, + 0.021970, -0.046915, -0.019419, -0.014077, 0.002502, -0.079454, -0.057149, -0.081701, 0.041979, -0.043074, -0.009425, -0.035776, -0.021794, -0.004826, -0.057263, -0.072940, + 0.037651, -0.013991, -0.043863, -0.020581, 0.034319, -0.052566, -0.010355, -0.022963, 0.027144, -0.017339, 0.088930, -0.000670, -0.026547, -0.026586, -0.032531, 0.040314, + 0.010148, 0.021104, 0.009228, -0.073227, 0.036650, -0.019337, 0.010211, -0.089620, -0.024676, -0.020729, -0.004070, 0.000784, -0.110561, 0.015390, 0.027151, -0.003228, + -0.066704, -0.004797, -0.026117, -0.018131, -0.090114, 0.020659, -0.007157, 0.013608, -0.022324, 0.027487, 0.018873, 0.027854, 0.045085, -0.039992, -0.017829, 0.011071, + -0.011393, -0.004454, -0.037189, -0.030299, 0.059668, 0.005064, 0.024655, -0.037239, 0.046882, -0.010356, -0.009690, 0.061909, -0.024736, 0.016849, 0.000784, 0.000201, + 0.066165, 0.010234, -0.012134, -0.002823, -0.060847, 0.008953, 0.010348, 0.022292, -0.044602, -0.020981, 0.038839, 0.006616, -0.016836, -0.043995, -0.005463, -0.036413, + 0.034895, -0.018008, -0.009543, -0.025080, -0.035243, 0.042696, -0.028911, -0.030676, -0.038542, -0.027798, -0.026607, 0.019467, 0.070629, -0.037356, -0.042648, -0.000284, + 0.033095, 0.077781, -0.052930, 0.022515, -0.029926, -0.033821, -0.003277, -0.000038, -0.026871, 0.018223, -0.004221, 0.023454, -0.030611, -0.006396, -0.009873, -0.008402, + ], + dtype=torch.float32, +) +SYSTEM_PROMPT_DYNAMIC = torch.tensor( + [ + 0.010809, 0.021177, -0.017600, -0.016814, 0.012351, -0.024554, 0.018299, 0.039305, 0.003331, 0.030473, 0.005557, -0.040898, 0.047294, -0.016136, 0.076989, -0.002723, + 0.017622, 0.042330, 0.058266, -0.016232, -0.029502, 0.004529, 0.033543, -0.041481, -0.017631, 0.002727, 0.018874, 0.019932, -0.030052, -0.009997, 0.004582, 0.002135, + -0.003720, -0.030923, 0.021174, 0.034033, -0.007096, 0.011522, -0.009518, 
0.055688, -0.092351, -0.003914, 0.004589, -0.032635, 0.012479, -0.140607, -0.014141, -0.031821, + 0.001396, 0.026780, -0.007623, 0.039957, 0.006434, 0.047516, 0.014377, 0.015237, 0.034212, 0.003576, -0.027357, 0.038888, 0.087272, 0.020248, 0.015165, 0.016002, + 0.020781, -0.040509, -0.008929, 0.080857, -0.002642, -0.009738, -0.005683, -0.000615, -0.012801, 0.046457, -0.045004, 0.024689, 0.002498, -0.017333, -0.027366, -0.023231, + -0.006064, -0.021505, 0.007405, -0.021249, 0.026252, -0.018690, 0.020093, -0.036954, 0.037510, -0.032027, -0.030871, -0.011173, -0.618627, 0.021213, -0.004366, 0.029555, + -0.004324, 0.020221, -0.143832, -0.021386, 0.010482, -0.042113, 0.016164, 0.040350, 0.014627, -0.011778, -0.018102, 0.035380, -0.020305, 0.010590, 0.009227, -0.011415, + 0.018623, -0.036384, 0.031003, -0.017073, -0.056456, -0.010423, 0.033029, -0.023511, -0.008717, 0.045716, 0.068273, -0.027886, 0.009665, -0.039801, 0.001465, 0.024361, + -0.015039, 0.022903, 0.033362, -0.022804, 0.008631, 0.076518, 0.000619, 0.022786, -0.015435, -0.095242, -0.006092, 0.015496, -0.009081, 0.015740, 0.004280, 0.013103, + -0.031836, 0.034241, 0.031836, 0.032636, -0.053721, 0.034370, 0.019172, 0.018383, 0.006907, -0.036039, -0.027927, 0.008646, 0.040496, -0.060314, -0.039116, -0.021488, + -0.031682, -0.005077, 0.034920, 0.002148, -0.008087, 0.002024, -0.008480, 0.041096, 0.011401, 0.020380, -0.025078, 0.005002, 0.022252, -0.014577, 0.008051, -0.014476, + -0.007078, 0.021075, 0.036965, 0.005343, -0.038671, -0.037222, 0.014052, -0.009952, -0.003958, -0.001878, 0.017848, -0.016608, -0.030813, 0.010921, 0.001068, 0.003095, + 0.007076, -0.001936, -0.102996, 0.006838, -0.005243, -0.009140, -0.043796, -0.027227, -0.008426, -0.013177, 0.015602, 0.021036, 0.025484, -0.064836, -0.003593, -0.038036, + -0.023102, 0.064053, 0.007850, 0.000771, 0.039297, 0.011903, -0.015866, -0.017612, 0.006308, 0.024342, 0.086761, -0.016705, 0.039239, 0.025079, -0.006452, 0.003174, + -0.010146, 0.010787, 0.035932, 
-0.015346, 0.037191, 0.010990, 0.011573, 0.044958, 0.035560, -0.017339, 0.018878, -0.025394, -0.044339, -0.029852, 0.015951, -0.032248, + -0.012019, 0.013497, 0.012224, -0.001284, -0.034041, -0.015768, 0.000230, -0.086076, 0.024878, 0.031929, -0.016668, -0.019815, -0.001325, 0.007944, 0.017674, -0.036097, + -0.019651, -0.001272, -0.032842, 0.002056, -0.037140, 0.043191, -0.003710, 0.011767, 0.020313, -0.018396, -0.015935, 0.010228, -0.017349, 0.049363, -0.010007, 0.019533, + 0.018076, 0.016608, -0.005523, -0.007793, 0.016868, 0.019341, -0.008236, 0.026765, -0.025324, -0.007849, -0.023648, -0.007791, -0.018508, 0.015357, -0.166499, -0.003718, + -0.035447, -0.005229, 0.019327, -0.014207, 0.028433, -0.002619, 0.013888, -0.033146, -0.017015, 0.004677, 0.039554, 0.003803, -0.014592, -0.018886, -0.023868, -0.022708, + 0.033661, 0.008626, 0.015687, 0.046395, 0.014173, 0.015083, -0.025994, 0.039120, 0.076334, -0.061165, 0.001791, -0.017579, 0.067567, -0.002415, -0.032495, -0.025576, + 0.079027, 0.036370, -0.013303, 0.030510, -0.009061, 0.019135, 0.015627, 0.024864, 0.015093, -0.017066, -0.014075, -0.021907, 0.017388, -0.033492, 0.013317, -0.000040, + 0.003396, 0.044030, -0.009194, -0.049524, -0.005015, -0.040007, 0.009104, 0.000580, 0.005603, 0.035891, -0.038913, 0.023239, -0.017022, -0.002695, -0.095759, 0.018503, + 0.017365, 0.011104, -0.003433, 0.024113, 0.052609, -0.085274, 0.027565, -0.005833, 0.020700, 0.015842, 0.019148, 0.020203, -0.000698, -0.005337, -0.037400, -0.060144, + -0.031893, -0.038396, -0.001949, 0.018901, -0.014268, -0.004721, -0.055913, 0.013814, 0.215024, -0.011357, 0.057530, 0.050092, 0.016513, -0.059254, 0.001494, -0.031472, + 0.032190, -0.047512, -0.020501, -0.002571, 0.007844, -0.063630, -0.043938, -0.079595, 0.032820, -0.021659, -0.003738, -0.035267, -0.013794, -0.021172, -0.046356, -0.077079, + 0.021526, -0.007447, -0.050276, -0.029743, 0.022208, -0.039137, -0.021426, -0.029825, 0.029390, -0.002943, 0.073158, -0.000435, -0.032029, -0.038524, 
-0.029886, 0.017473, + 0.013513, 0.022738, 0.000632, -0.073718, 0.029219, -0.018896, 0.007302, -0.116122, -0.013324, -0.012214, -0.005960, -0.003720, -0.155869, 0.019896, 0.016919, -0.021133, + -0.066911, -0.000926, -0.020871, -0.015295, -0.086108, 0.014918, -0.009284, 0.001689, -0.038155, 0.039163, 0.015988, 0.014413, 0.034205, -0.053273, 0.001687, 0.012227, + -0.007341, -0.006123, -0.005731, -0.026863, 0.060196, 0.028929, 0.019328, -0.033709, 0.038789, -0.015624, 0.013323, 0.053821, -0.015538, -0.001610, 0.012959, -0.013897, + 0.082010, 0.012866, -0.017269, 0.000017, -0.059458, 0.015870, 0.028455, 0.025234, -0.051163, -0.022976, 0.011866, -0.005613, -0.008738, -0.047658, -0.002155, -0.029432, + 0.039242, -0.013491, -0.001641, -0.024210, -0.019187, 0.026716, -0.025698, -0.027591, -0.034678, -0.002473, -0.019391, 0.017597, 0.064385, -0.029104, -0.034501, -0.004955, + 0.015008, 0.060749, -0.051693, 0.020279, -0.027170, -0.027003, 0.000254, 0.011352, -0.028116, 0.028938, -0.007224, 0.019978, -0.025379, -0.004874, -0.019361, -0.020278, + ], + dtype=torch.float32, +) +SYSTEM_EN_RECAPTION = torch.tensor( + [ + 0.007721, 0.015421, -0.019305, -0.000920, 0.016031, -0.019730, 0.029683, 0.026810, -0.010510, 0.021463, 0.008833, -0.040851, 0.043260, -0.007042, 0.057224, 0.011995, + 0.007818, 0.046369, 0.059838, -0.028548, -0.047399, -0.000983, 0.024343, -0.052259, -0.013638, 0.006856, 0.009186, 0.014235, -0.031497, -0.008644, -0.009349, 0.018900, + 0.002913, -0.022475, 0.039518, 0.019052, -0.007600, 0.010634, -0.011830, 0.075675, -0.071738, -0.014947, 0.004995, -0.025804, -0.002553, -0.093262, 0.002881, -0.033744, + -0.007234, 0.013659, 0.009897, 0.039185, -0.005366, 0.041534, -0.005924, 0.019786, 0.048566, -0.009356, -0.027360, 0.042557, 0.091286, 0.009286, 0.015410, 0.028166, + 0.022476, -0.025162, 0.012144, 0.084603, -0.003150, -0.008549, -0.002099, -0.014987, -0.019480, 0.046843, -0.030613, 0.015557, -0.008965, -0.008798, -0.027032, -0.014112, + 0.018703, -0.014749, 
-0.000928, -0.024660, 0.024004, 0.004560, 0.028156, -0.028467, 0.025444, -0.038699, -0.014927, -0.031593, -0.648498, 0.018529, 0.003378, 0.030188, + -0.002314, 0.014950, -0.146615, -0.009005, 0.016579, -0.037867, 0.020907, 0.033160, 0.007877, -0.026345, -0.056428, 0.031255, -0.018404, 0.013334, 0.009988, -0.022790, + 0.020803, -0.036862, 0.036222, -0.006646, -0.058084, -0.012036, 0.044199, -0.027665, -0.015779, 0.051554, 0.059970, -0.025977, 0.003967, -0.035247, -0.000488, 0.023182, + 0.000468, 0.019190, 0.047268, -0.032279, -0.005302, 0.078669, -0.001915, 0.024918, -0.014952, -0.078905, -0.018333, 0.001362, -0.015115, 0.005435, 0.002313, 0.018766, + -0.032773, 0.037344, 0.024061, 0.012143, -0.057106, 0.029490, 0.019537, 0.009099, 0.026064, -0.015927, -0.037047, 0.006002, 0.025191, -0.035318, -0.032245, -0.047822, + -0.023568, -0.004533, 0.025100, 0.002758, -0.002649, -0.012287, -0.012139, 0.043080, 0.003295, 0.024667, -0.021050, 0.006752, 0.025315, -0.011127, 0.009800, -0.021343, + -0.024866, 0.010098, 0.026954, 0.012467, -0.035866, -0.031780, 0.007479, -0.003388, -0.012619, -0.012099, 0.014974, -0.001908, -0.032700, 0.004703, 0.003238, -0.007498, + 0.023241, 0.002715, -0.111739, 0.003317, 0.006475, -0.019792, -0.046558, -0.032593, -0.020762, -0.005059, 0.016934, 0.029195, 0.028744, -0.050633, 0.001907, -0.028791, + -0.016695, 0.052143, 0.010439, 0.007204, 0.028502, 0.012607, -0.012414, -0.031238, 0.007305, 0.032309, 0.087924, -0.010530, 0.029925, 0.032666, -0.002202, 0.017539, + -0.009091, -0.001631, 0.024906, -0.013102, 0.031772, 0.018465, 0.012035, 0.031460, 0.030193, 0.005289, 0.025859, -0.038971, -0.046577, -0.025852, 0.035235, -0.038514, + 0.001042, 0.013012, 0.023701, -0.014630, -0.029269, -0.011981, 0.008219, -0.067347, -0.003456, 0.028198, -0.008657, -0.017773, 0.010540, 0.023964, 0.021012, -0.034465, + -0.023748, 0.004065, -0.021598, 0.008440, -0.031533, 0.038390, -0.007680, -0.003852, 0.016136, -0.017906, -0.008927, 0.006300, -0.001251, 0.029337, 
-0.008632, 0.020568, + 0.021560, -0.007222, 0.005313, -0.013089, 0.012299, 0.031303, -0.013951, 0.016547, -0.024771, -0.008753, -0.030908, -0.014421, -0.017656, 0.014044, -0.114986, 0.000956, + -0.035588, 0.003756, 0.015383, -0.013358, 0.009385, -0.001359, 0.012623, -0.028724, 0.001607, 0.012809, 0.032668, 0.011834, -0.015587, -0.007170, -0.021344, -0.019664, + 0.017690, -0.014538, 0.016511, 0.038037, 0.029919, 0.020907, -0.018565, 0.032964, 0.078548, -0.050386, -0.003012, -0.016965, 0.064131, 0.008077, -0.025879, -0.035820, + 0.095075, 0.019901, -0.019114, 0.022832, 0.003741, 0.027148, 0.018231, 0.027741, 0.020328, 0.001700, -0.006939, -0.024154, 0.018523, -0.029819, 0.008050, -0.004477, + 0.006087, 0.056878, -0.009083, -0.061537, -0.011531, -0.037551, 0.000434, -0.005843, 0.024739, 0.032020, -0.053119, 0.020704, -0.012385, -0.002726, -0.082489, 0.009072, + 0.013341, 0.000316, 0.001899, 0.022868, 0.034407, -0.066857, 0.020589, 0.012195, 0.023211, -0.001520, 0.000897, 0.029670, -0.015930, 0.006509, -0.035172, -0.061215, + -0.014099, -0.038584, -0.012213, 0.018613, -0.012365, -0.002777, -0.055184, 0.017146, 0.214358, -0.015750, 0.052488, 0.045205, 0.025334, -0.054615, 0.002117, -0.038122, + 0.012402, -0.053418, -0.025405, 0.007235, 0.013208, -0.092481, -0.048700, -0.085186, 0.029039, -0.036767, -0.000777, -0.017625, -0.012556, -0.004887, -0.033660, -0.082310, + 0.013387, -0.003256, -0.062981, -0.019886, 0.017624, -0.037421, -0.020743, -0.020894, 0.041974, -0.008502, 0.088413, -0.018697, -0.029398, -0.029389, -0.043721, 0.013872, + 0.003944, 0.030361, 0.005355, -0.081355, 0.041843, -0.016395, 0.011954, -0.060440, -0.000966, -0.019101, 0.006803, -0.011310, -0.148581, 0.020342, 0.012795, -0.016473, + -0.053300, -0.012340, -0.016640, -0.029834, -0.082405, 0.011859, -0.004255, -0.004396, -0.012515, 0.031962, 0.030438, 0.013792, 0.031557, -0.047200, 0.006485, 0.024815, + -0.019376, -0.011454, -0.034184, -0.021329, 0.050115, 0.021720, 0.002874, -0.047163, 0.044031, 
-0.014663, 0.020534, 0.056017, 0.007017, 0.003323, 0.005734, -0.002777, + 0.082836, 0.012048, -0.023236, -0.007401, -0.071598, 0.016760, 0.017282, 0.028306, -0.026220, -0.008016, -0.000202, -0.020271, -0.019828, -0.046986, -0.005805, -0.039647, + 0.042879, -0.004463, 0.007753, -0.028916, -0.020612, 0.028833, -0.039839, -0.052447, -0.013275, -0.002407, -0.018937, 0.033216, 0.075535, -0.045026, -0.009901, 0.016637, + -0.000322, 0.073925, -0.055701, 0.014912, -0.045671, -0.021189, 0.006761, -0.002015, -0.027410, 0.018250, -0.015916, 0.016254, -0.044964, 0.029261, -0.029319, -0.005222, + ], + dtype=torch.float32, +) +SYSTEM_EN_THINK_RECAPTION = torch.tensor( + [ + 0.011004, 0.017341, -0.019959, -0.018314, 0.016520, -0.027395, 0.017946, 0.039665, 0.000645, 0.035903, 0.002499, -0.045664, 0.039472, -0.013479, 0.081302, 0.000182, + 0.006947, 0.042845, 0.059741, -0.010796, -0.035240, 0.004176, 0.029557, -0.043467, -0.017271, 0.006896, 0.010997, 0.022498, -0.023308, -0.013046, -0.000742, 0.016209, + -0.007152, -0.029868, 0.028747, 0.033743, -0.000227, 0.018419, -0.015023, 0.050376, -0.098475, -0.002375, 0.007897, -0.023936, 0.007843, -0.122463, -0.011680, -0.027267, + -0.007270, 0.021869, -0.011415, 0.043770, 0.000551, 0.048573, 0.003132, 0.014233, 0.037080, -0.004818, -0.028738, 0.044468, 0.073843, 0.016947, 0.014484, 0.021931, + 0.020110, -0.032309, -0.003811, 0.095704, -0.006950, -0.007237, -0.005529, -0.020573, -0.016259, 0.041909, -0.038748, 0.018029, 0.005066, -0.021186, -0.020102, -0.019719, + 0.006239, -0.021284, 0.004213, -0.024963, 0.032345, -0.012557, 0.037268, -0.038075, 0.040998, -0.032766, -0.023509, -0.016426, -0.627412, 0.022675, 0.000101, 0.023162, + -0.002081, 0.015922, -0.138671, -0.027995, 0.011579, -0.042859, 0.019935, 0.038077, 0.012640, -0.017377, -0.027456, 0.035151, -0.015756, 0.018530, 0.004646, -0.002589, + 0.019645, -0.043736, 0.034947, -0.010166, -0.061165, -0.019195, 0.028909, -0.019415, -0.009485, 0.049566, 0.068621, -0.038644, 0.011278, 
-0.036133, 0.000564, 0.022611, + -0.013612, 0.020854, 0.030614, -0.025578, 0.005673, 0.076526, -0.004887, 0.027769, -0.022605, -0.092657, -0.013218, 0.008081, -0.015227, 0.018031, -0.005145, 0.015028, + -0.027193, 0.034767, 0.028710, 0.032007, -0.053175, 0.033528, 0.019437, 0.011517, 0.012107, -0.027679, -0.026937, 0.008612, 0.036909, -0.051484, -0.039971, -0.034372, + -0.023825, -0.003025, 0.033648, -0.001852, 0.007309, 0.000714, -0.001075, 0.038534, 0.007586, 0.016213, -0.025223, -0.001099, 0.015852, -0.011477, 0.020635, -0.010696, + -0.019634, 0.025613, 0.034374, 0.007169, -0.035000, -0.032268, 0.015114, -0.014217, -0.005229, -0.005495, 0.018189, -0.011360, -0.026755, 0.007036, -0.002333, -0.001174, + 0.014729, 0.001739, -0.108591, 0.004699, 0.002048, -0.014801, -0.042855, -0.028846, -0.009609, -0.004500, 0.019466, 0.021848, 0.022140, -0.063035, -0.004272, -0.030798, + -0.018452, 0.055169, 0.012240, -0.003555, 0.038293, 0.008503, -0.016608, -0.021309, 0.000690, 0.027093, 0.088054, -0.008881, 0.034087, 0.030647, 0.003284, 0.005038, + -0.008359, 0.006311, 0.032462, -0.009699, 0.035283, 0.015261, 0.012827, 0.038169, 0.033959, -0.018048, 0.018122, -0.025259, -0.040084, -0.030879, 0.019853, -0.042558, + -0.011938, 0.019602, 0.016537, -0.003378, -0.027890, -0.014909, -0.005464, -0.071862, 0.012335, 0.021899, -0.017008, -0.023228, 0.003263, 0.004571, 0.016447, -0.029446, + -0.022645, -0.001261, -0.018573, 0.007431, -0.027587, 0.035362, -0.006785, -0.000614, 0.026044, -0.009056, -0.009843, 0.010467, -0.011929, 0.042025, -0.014068, 0.023113, + 0.023880, 0.014948, 0.004370, -0.005262, 0.012587, 0.021608, -0.001783, 0.023697, -0.024945, -0.011533, -0.020953, -0.007205, -0.024693, 0.012961, -0.168760, 0.001767, + -0.041265, -0.007044, 0.015021, -0.008407, 0.029642, -0.000956, 0.008607, -0.035365, -0.012187, 0.011744, 0.032612, 0.006226, -0.015891, -0.017747, -0.022565, -0.024505, + 0.031279, 0.004188, 0.011939, 0.038032, 0.008798, 0.012314, -0.024830, 0.034484, 0.076395, 
-0.060108, 0.001019, -0.016138, 0.067729, 0.003899, -0.029845, -0.019960, + 0.086663, 0.040965, -0.010458, 0.027808, -0.006394, 0.017343, 0.014788, 0.024756, 0.016446, -0.012537, -0.008406, -0.028109, 0.013369, -0.033571, 0.012170, -0.002199, + 0.005263, 0.052280, -0.018171, -0.047898, -0.010087, -0.038632, 0.006773, -0.000838, 0.011197, 0.038187, -0.049525, 0.021689, -0.007385, -0.005987, -0.094551, 0.019019, + 0.012760, 0.009617, -0.002262, 0.030228, 0.047823, -0.079764, 0.023391, -0.005561, 0.018866, 0.012817, 0.020878, 0.027037, -0.013905, -0.002874, -0.035522, -0.046266, + -0.032448, -0.036010, -0.007776, 0.016512, -0.012279, -0.005665, -0.057974, 0.016967, 0.202836, -0.009066, 0.066093, 0.045689, 0.018319, -0.048465, 0.000242, -0.040874, + 0.027824, -0.049045, -0.015616, -0.000307, 0.009163, -0.072975, -0.042979, -0.082254, 0.040549, -0.027049, 0.000725, -0.034118, -0.019604, -0.019097, -0.042483, -0.075446, + 0.019387, -0.005218, -0.053573, -0.029975, 0.008195, -0.036608, -0.018920, -0.025610, 0.028426, -0.002688, 0.074996, -0.003423, -0.032505, -0.030565, -0.028142, 0.014437, + 0.013359, 0.019376, 0.008356, -0.069731, 0.031824, -0.011103, 0.019327, -0.117090, -0.009352, -0.010290, -0.002129, -0.009198, -0.172915, 0.021232, 0.017274, -0.030060, + -0.061449, -0.006598, -0.013069, -0.012857, -0.081220, 0.019058, -0.004841, 0.003066, -0.037741, 0.041806, 0.018281, 0.009458, 0.036761, -0.044987, 0.003557, 0.008890, + -0.008011, -0.004063, -0.013474, -0.022090, 0.055398, 0.037475, 0.006991, -0.035962, 0.045503, -0.017162, 0.022391, 0.052754, -0.005924, -0.005936, 0.012673, -0.017922, + 0.084548, 0.014695, -0.013817, 0.000421, -0.065167, 0.018269, 0.023317, 0.023523, -0.034229, -0.019588, 0.007911, -0.002426, -0.017109, -0.050870, 0.002848, -0.033077, + 0.043451, -0.010609, -0.000375, -0.023206, -0.018155, 0.027102, -0.036006, -0.035115, -0.023922, 0.005989, -0.015372, 0.027123, 0.075210, -0.035302, -0.029799, 0.003642, + 0.007714, 0.063498, -0.053234, 0.015699, 
-0.040459, -0.027354, -0.002433, 0.010923, -0.020134, 0.029292, -0.010176, 0.013508, -0.032403, 0.004323, -0.017504, -0.015237, + ], + dtype=torch.float32, +) +SYSTEM_EN_VANILLA = torch.tensor( + [ + 0.010809, 0.021177, -0.017600, -0.016814, 0.012351, -0.024554, 0.018299, 0.039305, 0.003331, 0.030473, 0.005557, -0.040898, 0.047294, -0.016136, 0.076989, -0.002723, + 0.017622, 0.042330, 0.058266, -0.016232, -0.029502, 0.004529, 0.033543, -0.041481, -0.017631, 0.002727, 0.018874, 0.019932, -0.030052, -0.009997, 0.004582, 0.002135, + -0.003720, -0.030923, 0.021174, 0.034033, -0.007096, 0.011522, -0.009518, 0.055688, -0.092351, -0.003914, 0.004589, -0.032635, 0.012479, -0.140607, -0.014141, -0.031821, + 0.001396, 0.026780, -0.007623, 0.039957, 0.006434, 0.047516, 0.014377, 0.015237, 0.034212, 0.003576, -0.027357, 0.038888, 0.087272, 0.020248, 0.015165, 0.016002, + 0.020781, -0.040509, -0.008929, 0.080857, -0.002642, -0.009738, -0.005683, -0.000615, -0.012801, 0.046457, -0.045004, 0.024689, 0.002498, -0.017333, -0.027366, -0.023231, + -0.006064, -0.021505, 0.007405, -0.021249, 0.026252, -0.018690, 0.020093, -0.036954, 0.037510, -0.032027, -0.030871, -0.011173, -0.618627, 0.021213, -0.004366, 0.029555, + -0.004324, 0.020221, -0.143832, -0.021386, 0.010482, -0.042113, 0.016164, 0.040350, 0.014627, -0.011778, -0.018102, 0.035380, -0.020305, 0.010590, 0.009227, -0.011415, + 0.018623, -0.036384, 0.031003, -0.017073, -0.056456, -0.010423, 0.033029, -0.023511, -0.008717, 0.045716, 0.068273, -0.027886, 0.009665, -0.039801, 0.001465, 0.024361, + -0.015039, 0.022903, 0.033362, -0.022804, 0.008631, 0.076518, 0.000619, 0.022786, -0.015435, -0.095242, -0.006092, 0.015496, -0.009081, 0.015740, 0.004280, 0.013103, + -0.031836, 0.034241, 0.031836, 0.032636, -0.053721, 0.034370, 0.019172, 0.018383, 0.006907, -0.036039, -0.027927, 0.008646, 0.040496, -0.060314, -0.039116, -0.021488, + -0.031682, -0.005077, 0.034920, 0.002148, -0.008087, 0.002024, -0.008480, 0.041096, 0.011401, 0.020380, 
-0.025078, 0.005002, 0.022252, -0.014577, 0.008051, -0.014476, + -0.007078, 0.021075, 0.036965, 0.005343, -0.038671, -0.037222, 0.014052, -0.009952, -0.003958, -0.001878, 0.017848, -0.016608, -0.030813, 0.010921, 0.001068, 0.003095, + 0.007076, -0.001936, -0.102996, 0.006838, -0.005243, -0.009140, -0.043796, -0.027227, -0.008426, -0.013177, 0.015602, 0.021036, 0.025484, -0.064836, -0.003593, -0.038036, + -0.023102, 0.064053, 0.007850, 0.000771, 0.039297, 0.011903, -0.015866, -0.017612, 0.006308, 0.024342, 0.086761, -0.016705, 0.039239, 0.025079, -0.006452, 0.003174, + -0.010146, 0.010787, 0.035932, -0.015346, 0.037191, 0.010990, 0.011573, 0.044958, 0.035560, -0.017339, 0.018878, -0.025394, -0.044339, -0.029852, 0.015951, -0.032248, + -0.012019, 0.013497, 0.012224, -0.001284, -0.034041, -0.015768, 0.000230, -0.086076, 0.024878, 0.031929, -0.016668, -0.019815, -0.001325, 0.007944, 0.017674, -0.036097, + -0.019651, -0.001272, -0.032842, 0.002056, -0.037140, 0.043191, -0.003710, 0.011767, 0.020313, -0.018396, -0.015935, 0.010228, -0.017349, 0.049363, -0.010007, 0.019533, + 0.018076, 0.016608, -0.005523, -0.007793, 0.016868, 0.019341, -0.008236, 0.026765, -0.025324, -0.007849, -0.023648, -0.007791, -0.018508, 0.015357, -0.166499, -0.003718, + -0.035447, -0.005229, 0.019327, -0.014207, 0.028433, -0.002619, 0.013888, -0.033146, -0.017015, 0.004677, 0.039554, 0.003803, -0.014592, -0.018886, -0.023868, -0.022708, + 0.033661, 0.008626, 0.015687, 0.046395, 0.014173, 0.015083, -0.025994, 0.039120, 0.076334, -0.061165, 0.001791, -0.017579, 0.067567, -0.002415, -0.032495, -0.025576, + 0.079027, 0.036370, -0.013303, 0.030510, -0.009061, 0.019135, 0.015627, 0.024864, 0.015093, -0.017066, -0.014075, -0.021907, 0.017388, -0.033492, 0.013317, -0.000040, + 0.003396, 0.044030, -0.009194, -0.049524, -0.005015, -0.040007, 0.009104, 0.000580, 0.005603, 0.035891, -0.038913, 0.023239, -0.017022, -0.002695, -0.095759, 0.018503, + 0.017365, 0.011104, -0.003433, 0.024113, 0.052609, -0.085274, 
0.027565, -0.005833, 0.020700, 0.015842, 0.019148, 0.020203, -0.000698, -0.005337, -0.037400, -0.060144, + -0.031893, -0.038396, -0.001949, 0.018901, -0.014268, -0.004721, -0.055913, 0.013814, 0.215024, -0.011357, 0.057530, 0.050092, 0.016513, -0.059254, 0.001494, -0.031472, + 0.032190, -0.047512, -0.020501, -0.002571, 0.007844, -0.063630, -0.043938, -0.079595, 0.032820, -0.021659, -0.003738, -0.035267, -0.013794, -0.021172, -0.046356, -0.077079, + 0.021526, -0.007447, -0.050276, -0.029743, 0.022208, -0.039137, -0.021426, -0.029825, 0.029390, -0.002943, 0.073158, -0.000435, -0.032029, -0.038524, -0.029886, 0.017473, + 0.013513, 0.022738, 0.000632, -0.073718, 0.029219, -0.018896, 0.007302, -0.116122, -0.013324, -0.012214, -0.005960, -0.003720, -0.155869, 0.019896, 0.016919, -0.021133, + -0.066911, -0.000926, -0.020871, -0.015295, -0.086108, 0.014918, -0.009284, 0.001689, -0.038155, 0.039163, 0.015988, 0.014413, 0.034205, -0.053273, 0.001687, 0.012227, + -0.007341, -0.006123, -0.005731, -0.026863, 0.060196, 0.028929, 0.019328, -0.033709, 0.038789, -0.015624, 0.013323, 0.053821, -0.015538, -0.001610, 0.012959, -0.013897, + 0.082010, 0.012866, -0.017269, 0.000017, -0.059458, 0.015870, 0.028455, 0.025234, -0.051163, -0.022976, 0.011866, -0.005613, -0.008738, -0.047658, -0.002155, -0.029432, + 0.039242, -0.013491, -0.001641, -0.024210, -0.019187, 0.026716, -0.025698, -0.027591, -0.034678, -0.002473, -0.019391, 0.017597, 0.064385, -0.029104, -0.034501, -0.004955, + 0.015008, 0.060749, -0.051693, 0.020279, -0.027170, -0.027003, 0.000254, 0.011352, -0.028116, 0.028938, -0.007224, 0.019978, -0.025379, -0.004874, -0.019361, -0.020278, + ], + dtype=torch.float32, +) +SYSTEM_EN_UNIFIED = torch.tensor( + [ + 0.011409, 0.014191, -0.023163, -0.020119, 0.019190, -0.029559, 0.019616, 0.035872, 0.010434, 0.028709, 0.011616, -0.039422, 0.038369, -0.004631, 0.081177, 0.007400, + 0.008903, 0.040408, 0.055323, -0.011950, -0.026940, 0.004916, 0.028101, -0.046200, -0.016732, 0.005115, 
0.012100, 0.016136, -0.026057, -0.013827, -0.004914, 0.015261, + -0.010824, -0.028188, 0.022934, 0.026204, -0.003855, 0.013797, -0.014518, 0.050289, -0.100077, -0.002962, 0.009050, -0.028205, 0.016294, -0.128956, -0.012730, -0.023647, + -0.009306, 0.020066, 0.000033, 0.043619, 0.003250, 0.053425, 0.005889, 0.021529, 0.036032, -0.003254, -0.029715, 0.048345, 0.077978, 0.010674, 0.019296, 0.018721, + 0.019244, -0.040115, -0.004245, 0.085214, -0.005280, -0.010746, -0.000164, -0.023405, -0.015641, 0.040193, -0.038735, 0.018966, -0.004031, -0.017879, -0.023017, -0.030379, + 0.006468, -0.015959, 0.000532, -0.026530, 0.042640, -0.006095, 0.037899, -0.043658, 0.040965, -0.034682, -0.023729, -0.019291, -0.630840, 0.029658, 0.005462, 0.026650, + -0.000292, 0.013954, -0.149594, -0.019405, 0.015321, -0.045104, 0.030332, 0.031727, 0.012349, -0.009553, -0.022371, 0.034043, -0.014838, 0.015398, -0.003657, 0.000477, + 0.021084, -0.041406, 0.029946, -0.013832, -0.057358, -0.018086, 0.031598, -0.031835, -0.006697, 0.040866, 0.068602, -0.042203, 0.007362, -0.036959, 0.003794, 0.026533, + -0.011873, 0.017343, 0.028333, -0.021804, 0.004007, 0.075133, 0.003340, 0.025326, -0.015068, -0.092280, -0.011514, 0.006827, -0.008254, 0.021181, -0.005035, 0.022263, + -0.022443, 0.043919, 0.026637, 0.028568, -0.056881, 0.036740, 0.024430, 0.015891, 0.012257, -0.031126, -0.030108, 0.007229, 0.026998, -0.051685, -0.033003, -0.031170, + -0.024021, 0.004235, 0.030164, 0.002674, 0.008018, 0.005532, 0.001621, 0.044790, 0.006413, 0.027160, -0.015022, 0.000911, 0.019723, -0.016244, 0.020077, -0.006847, + -0.014110, 0.022461, 0.031656, 0.002760, -0.039078, -0.026893, 0.006628, -0.011775, -0.000240, -0.005908, 0.014943, -0.012131, -0.021755, 0.004732, -0.005297, -0.002922, + 0.014631, -0.002010, -0.112400, 0.000842, -0.002732, -0.014861, -0.052099, -0.034167, -0.011613, -0.006101, 0.013278, 0.018867, 0.026530, -0.068150, -0.003306, -0.032801, + -0.018523, 0.050875, 0.005488, -0.007241, 0.045707, 0.023119, 
-0.021519, -0.022683, 0.004806, 0.024827, 0.091371, -0.014424, 0.043836, 0.033094, 0.002390, 0.005450, + -0.004893, 0.013608, 0.031272, -0.002449, 0.031607, 0.014646, 0.014146, 0.043995, 0.028826, -0.012219, 0.021008, -0.020911, -0.036967, -0.036256, 0.013328, -0.038382, + -0.012084, 0.018183, 0.018782, -0.004697, -0.024284, -0.015474, -0.001463, -0.076015, 0.013923, 0.022125, -0.018765, -0.010793, 0.008409, 0.002067, 0.017961, -0.029716, + -0.020915, -0.001779, -0.009217, -0.001933, -0.036081, 0.042577, 0.000118, -0.013920, 0.014901, -0.016486, -0.010278, -0.000449, -0.017234, 0.042453, -0.009893, 0.021087, + 0.017671, 0.009861, -0.004210, 0.004944, 0.015627, 0.014370, -0.001128, 0.030247, -0.019552, -0.014017, -0.020859, -0.002614, -0.024405, 0.016532, -0.173204, -0.001196, + -0.037415, -0.010990, 0.010449, -0.006124, 0.019211, 0.003695, 0.011679, -0.031852, -0.009764, 0.005773, 0.035793, 0.003455, -0.011772, -0.020532, -0.027434, -0.024761, + 0.027483, -0.001554, 0.010411, 0.037888, 0.015619, 0.019186, -0.021204, 0.038158, 0.074991, -0.064521, -0.002503, -0.014499, 0.068165, 0.006145, -0.032891, -0.021540, + 0.091385, 0.047584, -0.009590, 0.028004, -0.002962, 0.021061, 0.014854, 0.025840, 0.016068, -0.014364, -0.016418, -0.033454, 0.011734, -0.036518, 0.013015, -0.003966, + 0.000855, 0.051373, -0.010960, -0.047078, -0.011048, -0.042015, 0.006818, 0.005483, 0.010251, 0.034951, -0.046162, 0.021258, -0.013397, -0.005259, -0.093775, 0.019974, + 0.014992, 0.004043, -0.005931, 0.035662, 0.050723, -0.083293, 0.028047, -0.008042, 0.020763, 0.016763, 0.022913, 0.027129, -0.014314, -0.009854, -0.039019, -0.044870, + -0.028101, -0.038026, -0.006294, 0.018265, -0.015425, -0.007866, -0.052784, 0.010470, 0.200260, -0.007798, 0.064482, 0.046612, 0.025353, -0.059695, -0.001831, -0.039643, + 0.025148, -0.042752, -0.014928, -0.010216, 0.014195, -0.069149, -0.041424, -0.078360, 0.036999, -0.021357, 0.011032, -0.026564, -0.016214, -0.023440, -0.044723, -0.064498, + 0.018283, 
-0.007165, -0.051802, -0.026299, 0.005867, -0.034691, -0.020621, -0.030512, 0.024458, -0.011330, 0.066558, -0.004069, -0.031624, -0.030639, -0.037451, 0.013079, + 0.015152, 0.008058, 0.009223, -0.069514, 0.030702, -0.009681, 0.014826, -0.115441, -0.005514, -0.011925, 0.001046, -0.007148, -0.164128, 0.018043, 0.017001, -0.026352, + -0.049691, -0.011637, -0.013045, -0.014851, -0.079469, 0.017692, -0.006575, 0.001063, -0.028299, 0.038777, 0.019930, 0.010641, 0.036955, -0.039004, -0.006477, 0.004278, + -0.001006, -0.002514, -0.017242, -0.023927, 0.049113, 0.038393, 0.011633, -0.031537, 0.041725, -0.012146, 0.023445, 0.049999, -0.008538, 0.001319, 0.012732, -0.021170, + 0.082096, 0.009610, -0.025717, 0.002566, -0.060849, 0.017403, 0.032650, 0.018658, -0.030629, -0.025032, 0.005555, 0.000522, -0.009667, -0.043099, 0.005939, -0.027156, + 0.045634, -0.011986, 0.002713, -0.032225, -0.015494, 0.028734, -0.036528, -0.033101, -0.027174, 0.009490, -0.016537, 0.029435, 0.065709, -0.037711, -0.020497, -0.005578, + 0.011768, 0.061035, -0.044676, 0.016113, -0.042945, -0.022579, 0.002430, 0.012474, -0.018198, 0.030468, -0.016646, 0.019020, -0.035804, 0.001175, -0.018312, -0.010760, + ], + dtype=torch.float32, +) +# fmt: on +SYSTEM_PROMPT_CASES = [ + pytest.param("none", None, SEED_1234, id="none"), + pytest.param("dynamic", "dynamic", SYSTEM_PROMPT_DYNAMIC, id="dynamic"), + pytest.param("en_vanilla", "en_vanilla", SYSTEM_EN_VANILLA, id="en_vanilla"), + pytest.param("en_recaption", "en_recaption", SYSTEM_EN_RECAPTION, id="en_recaption"), + pytest.param("en_think_recaption", "en_think_recaption", SYSTEM_EN_THINK_RECAPTION, id="en_think_recaption"), + pytest.param("en_unified", "en_unified", SYSTEM_EN_UNIFIED, id="en_unified"), +] + + +@pytest.fixture(scope="session") +def clip_bundle() -> tuple[CLIPModel, CLIPProcessor]: + try: + model = CLIPModel.from_pretrained(LOCAL_CLIP_PATH, local_files_only=True) + processor = CLIPProcessor.from_pretrained(LOCAL_CLIP_PATH, local_files_only=True) 
def extract_embedding(image: Image.Image, clip_model: CLIPModel, clip_processor: CLIPProcessor) -> torch.Tensor:
    """Return the L2-normalised CLIP image embedding for *image*.

    The image is converted to RGB, run through ``clip_processor`` (requesting
    PyTorch tensors) and then through ``clip_model.get_image_features``. The
    features are normalised along the last dimension and dimension 0 is
    squeezed away before returning.
    """
    rgb_image = image.convert("RGB")
    model_inputs = clip_processor(images=rgb_image, return_tensors="pt")
    # Inference mode: no autograd bookkeeping is needed for a pure forward pass.
    with torch.inference_mode():
        raw_features = clip_model.get_image_features(**model_inputs)
        unit_features = F.normalize(raw_features, p=2, dim=-1)
    return unit_features.squeeze(0)
@pytest.mark.skipif(torch.cuda.device_count() < 8, reason="Need at least 8 CUDA GPUs for this test.")
@pytest.mark.parametrize("system_prompt_name,use_system_prompt,expected_embedding", SYSTEM_PROMPT_CASES)
def test_system_prompt_scores(
    omni: Omni,
    clip_bundle: tuple[CLIPModel, CLIPProcessor],
    system_prompt_name: str,
    use_system_prompt: str | None,
    expected_embedding: torch.Tensor,
) -> None:
    """Check that each system-prompt preset yields an image close to its reference.

    An image is generated with the given ``use_system_prompt`` value, embedded
    with CLIP, and compared (cosine similarity) against the recorded reference
    embedding for that preset. The score is printed for diagnostics and must
    reach a minimum similarity, otherwise the test fails.
    """
    # NOTE(review): the original test only printed the score and therefore could
    # never fail. The floor below is a conservative starting point, not a
    # measured value; tighten it once scores from the target CI hardware
    # are known.
    min_similarity = 0.8

    clip_model, clip_processor = clip_bundle
    generated_image = _generate_image(omni, use_system_prompt)
    score = compare_semantic(expected_embedding, generated_image, clip_model, clip_processor)

    print(f"{system_prompt_name}: CLIP cosine similarity = {score:.6f}")
    assert score >= min_similarity, (
        f"{system_prompt_name}: CLIP cosine similarity {score:.6f} is below the required {min_similarity:.2f}"
    )
else (p.get("prompt") or "") for p in req.prompts] or prompt generator = req.sampling_params.generator or generator height = req.sampling_params.height or height diff --git a/vllm_omni/diffusion/models/hunyuan_image_3/system_prompt.py b/vllm_omni/diffusion/models/hunyuan_image_3/system_prompt.py new file mode 100644 index 0000000000..29494fad41 --- /dev/null +++ b/vllm_omni/diffusion/models/hunyuan_image_3/system_prompt.py @@ -0,0 +1,215 @@ +# ruff: noqa: E501 +# Licensed under the TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://github.com/Tencent-Hunyuan/HunyuanImage-3.0/blob/main/LICENSE +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +t2i_system_prompt_en_vanilla = """ +You are an advanced AI text-to-image generation system. Given a detailed text prompt, your task is to create a high-quality, visually compelling image that accurately represents the described scene, characters, or objects. Pay careful attention to style, color, lighting, perspective, and any specific instructions provided. +""" + +# 775 +t2i_system_prompt_en_recaption = """ +You are a world-class image generation prompt expert. Your task is to rewrite a user's simple description into a **structured, objective, and detail-rich** professional-level prompt. + +The final output must be wrapped in `` tags. + +### **Universal Core Principles** + +When rewriting the prompt (inside the `` tags), you must adhere to the following principles: + +1. **Absolute Objectivity**: Describe only what is visually present. 
Avoid subjective words like "beautiful" or "sad". Convey aesthetic qualities through specific descriptions of color, light, shadow, and composition. +2. **Physical and Logical Consistency**: All scene elements (e.g., gravity, light, shadows, reflections, spatial relationships, object proportions) must strictly adhere to real-world physics and common sense. For example, tennis players must be on opposite sides of the net; objects cannot float without a cause. +3. **Structured Description**: Strictly follow a logical order: from general to specific, background to foreground, and primary to secondary elements. Use directional terms like "foreground," "mid-ground," "background," and "left side of the frame" to clearly define the spatial layout. +4. **Use Present Tense**: Describe the scene from an observer's perspective using the present tense, such as "A man stands..." or "Light shines on..." +5. **Use Rich and Specific Descriptive Language**: Use precise adjectives to describe the quantity, size, shape, color, and other attributes of objects, subjects, and text. Vague expressions are strictly prohibited. + +If the user specifies a style (e.g., oil painting, anime, UI design, text rendering), strictly adhere to that style. Otherwise, first infer a suitable style from the user's input. If there is no clear stylistic preference, default to an **ultra-realistic photographic style**. Then, generate the detailed rewritten prompt according to the **Style-Specific Creation Guide** below: + +### **Style-Specific Creation Guide** + +Based on the determined artistic style, apply the corresponding professional knowledge. + +**1. Photography and Realism Style** +* Utilize professional photography terms (e.g., lighting, lens, composition) and meticulously detail material textures, physical attributes of subjects, and environmental details. + +**2. 
Illustration and Painting Style** +* Clearly specify the artistic school (e.g., Japanese Cel Shading, Impasto Oil Painting) and focus on describing its unique medium characteristics, such as line quality, brushstroke texture, or paint properties. + +**3. Graphic/UI/APP Design Style** +* Objectively describe the final product, clearly defining the layout, elements, and color palette. All text on the interface must be enclosed in double quotes `""` to specify its exact content (e.g., "Login"). Vague descriptions are strictly forbidden. + +**4. Typographic Art** +* The text must be described as a complete physical object. The description must begin with the text itself. Use a straightforward front-on or top-down perspective to ensure the entire text is visible without cropping. + +### **Final Output Requirements** + +1. **Output the Final Prompt Only**: Do not show any thought process, Markdown formatting, or line breaks. +2. **Adhere to the Input**: You must retain the core concepts, attributes, and any specified text from the user's input. +3. **Style Reinforcement**: Mention the core style 3-5 times within the prompt and conclude with a style declaration sentence. +4. **Avoid Self-Reference**: Describe the image content directly. Remove redundant phrases like "This image shows..." or "The scene depicts..." +5. **The final output must be wrapped in `xxxx` tags.** + +The user will now provide an input prompt. You will provide the expanded prompt. +""" + +# 890 +t2i_system_prompt_en_think_recaption = """ +You will act as a top-tier Text-to-Image AI. Your core task is to deeply analyze the user's text input and transform it into a detailed, artistic, and fully user-intent-compliant image. + +Your workflow is divided into two phases: + +1. Thinking Phase (): In the tag, you need to conduct a structured thinking process, progressively breaking down and enriching the constituent elements of the image. 
This process must include, but is not limited to, the following dimensions: + +Subject: Clearly define the core character(s) or object(s) in the scene, including their appearance, posture, expression, and emotion. +Composition: Set the camera angle and layout, such as close-up, long shot, bird's-eye view, golden ratio composition, etc. +Environment/Background: Describe the scene where the subject is located, including the location, time of day, weather, and other elements in the background. +Lighting: Define the type, direction, and quality of the light source, such as soft afternoon sunlight, cool tones of neon lights, dramatic Rembrandt lighting, etc., to create a specific atmosphere. +Color Palette: Set the main color tone and color scheme of the image, such as vibrant and saturated, low-saturation Morandi colors, black and white, etc. +Quality/Style: Determine the artistic style and technical details of the image. This includes user-specified styles (e.g., anime, oil painting) or the default realistic style, as well as camera parameters (e.g., focal length, aperture, depth of field). +Details: Add minute elements that enhance the realism and narrative quality of the image, such as a character's accessories, the texture of a surface, dust particles in the air, etc. + + +2. Recaption Phase (): In the tag, merge all the key details from the thinking process into a coherent, precise, and visually evocative final description. This description is the direct instruction for generating the image, so it must be clear, unambiguous, and organized in a way that is most suitable for an image generation engine to understand. + +Absolutely Objective: Describe only what is visually present. Avoid subjective words like "beautiful" or "sad." Convey aesthetic sense through concrete descriptions of colors, light, shadow, and composition. 
+ +Physical and Logical Consistency: All scene elements (e.g., gravity, light and shadow, reflections, spatial relationships, object proportions) must strictly adhere to the physical laws of the real world and common sense. For example, in a tennis match, players must be on opposite sides of the net; objects cannot float without reason. + +Structured Description: Strictly follow a logical order: from whole to part, background to foreground, and primary to secondary. Use directional words like "foreground," "mid-ground," "background," "left side of the frame" to clearly define the spatial layout. + +Use Present Tense: Describe from an observer's perspective using the present tense, such as "a man stands," "light shines on..." +Use Rich and Specific Descriptive Language: Use precise adjectives to describe the quantity, size, shape, color, and other attributes of objects/characters/text. Absolutely avoid any vague expressions. + + +Output Format: +Thinking processRefined image descriptionGenerate Image + + +You must strictly adhere to the following rules: + +1. Faithful to Intent, Reasonable Expansion: You can creatively add details to the user's description to enhance the image's realism and artistic quality. However, all additions must be highly consistent with the user's core intent and never introduce irrelevant or conflicting elements. +2. Style Handling: When the user does not specify a style, you must default to an "Ultra-realistic, Photorealistic" style. If the user explicitly specifies a style (e.g., anime, watercolor, oil painting, cyberpunk, etc.), both your thinking process and final description must strictly follow and reflect that specified style. +3. Text Rendering: If specific text needs to appear in the image (such as words on a sign, a book title), you must enclose this text in English double quotes (""). Descriptive text must not use double quotes. +4. 
Design-related Images: You need to specify all text and graphical elements that appear in the image and clearly describe their design details, including font, color, size, position, arrangement, visual effects, etc. +""" + +t2i_system_prompts = { + "en_vanilla": [t2i_system_prompt_en_vanilla], + "en_recaption": [t2i_system_prompt_en_recaption], + "en_think_recaption": [t2i_system_prompt_en_think_recaption], +} + + +unified_system_prompt_en = """You are an advanced multimodal model whose core mission is to analyze user intent and generate high-quality text and images. + +#### Four Core Capabilities +1. **Text-to-Text (T2T):** Generate coherent text responses from text prompts. +2. **Text-to-Image (T2I):** Generate high-quality images from text prompts. +3. **Text & Image to Text (TI2T):** Generate accurate text responses based on a combination of images and text. +4. **Text & Image to Image (TI2I):** Generate modified images based on a reference image and editing instructions. + +--- +### Image Generation Protocol (for T2I & TI2I) +You will operate in one of two modes, determined by the user's starting tag: +#### ** Mode (Prompt Rewriting)**: +* **Trigger:** Input begins with ``. +* **Task:** Immediately rewrite the user's text into a structured, objective, and detail-rich professional-grade prompt. +* **Output:** Output only the rewritten prompt within `` tags: `Rewritten professional-grade prompt` + +#### ** Mode (Think + Rewrite)**: +* **Trigger:** Input begins with ``. +* **Task:** First, conduct a structured analysis of the request within `` tags. Then, output the professional prompt, rewritten based on the analysis, within `` tags. 
+* **Output:** Strictly adhere to the format: `Analysis processRewritten prompt` + +--- +### Execution Standards and Guidelines +#### **`` Phase: Analysis Guidelines** +**For T2I (New Image Generation):** +Deconstruct the user's request into the following core visual components: +* **Subject:** Key features of the main character/object, including appearance, pose, expression, and emotion. +* **Composition:** Camera angle, lens type, and layout. +* **Environment/Background:** The setting, time of day, weather, and background elements. +* **Lighting:** Technical details such as light source type, direction, and quality. +* **Color Palette:** The dominant hues and overall color scheme. +* **Style/Quality:** The artistic style, clarity, depth of field, and other technical details. +* **Text:** Identify any text to be rendered in the image, including its content, style, and position. +* **Details:** Small elements that add narrative depth and realism. + +**For TI2I (Image Editing):** +Adopt a task-diagnostic approach: +1. **Diagnose Task:** Identify the edit type and analyze key requirements. +2. **Prioritize Analysis:** + * **Adding:** Analyze the new element's position and appearance, ensuring seamless integration with the original image's lighting, shadows, and style. + * **Removing:** Identify the target for removal and determine how to logically fill the resulting space using surrounding textures and lighting. + * **Modifying:** Analyze what to change and what it should become, while emphasizing which elements must remain unchanged. + * **Style Transfer:** Deconstruct the target style into specific features (e.g., brushstrokes, color palette) and apply them to the original image. + * **Text Editing:** Ensure correct content and format. Consider the text's visual style (e.g., font, color, material) and how it adapts to the surface's perspective, curvature, and lighting. 
+ * **Reference Editing:** Extract specific visual elements (e.g., appearance, posture, composition, lines, depth) from the reference image to generate an image that aligns with the text description while also incorporating the referenced content. + * **Inferential Editing:** Identify vague requests (e.g., "make it more professional") and translate them into concrete visual descriptions. + +#### `` Phase: Professional-Grade Prompt Generation Rules +**General Rewriting Principles (for T2I & TI2I):** +1. **Structure & Logic:** Start with a global description. Use positional words (e.g., "foreground", "background") to define the layout. +2. **Absolute Objectivity:** Avoid subjective terms. Convey aesthetics through precise descriptions of color, light, shadow, and materials. +3. **Physical & Logical Consistency:** Ensure all descriptions adhere to the laws of physics and common sense. +4. **Fidelity to User Intent:** Preserve the user's core concepts, subjects, and attributes. Text to be rendered in the image **must be enclosed in double quotes ("")**. +5. **Camera & Resolution:** Translate camera parameters into descriptions of visual effects. Convert resolution information into natural language. + +**T2I-Specific Guidelines:** +* **Style Adherence & Inference:** Strictly follow the specified style. If none is given, infer the most appropriate style and detail it using professional terminology. +* **Style Detailing:** + * **Photography/Realism:** Use professional photography terms to describe lighting, lens effects, and material textures. + * **Painting/Illustration:** Specify the art movement or medium's characteristics. + * **UI/Design:** Objectively describe the final product. Define layout, elements, and typography. Text content must be specific and unambiguous. + +**TI2I-Specific Guidelines:** +* **Preserve Unchanged Elements:** Emphasize elements that **remain unchanged**. 
def get_system_prompt(sys_type, bot_task, system_prompt=None):
    """Resolve a system-prompt selector to the prompt text to use.

    Args:
        sys_type: Selector for the prompt. One of ``"None"`` (or a real
            ``None``), ``"en_unified"``, ``"en_vanilla"``, ``"en_recaption"``,
            ``"en_think_recaption"``, ``"dynamic"`` or ``"custom"``.
        bot_task: Only consulted when ``sys_type`` is ``"dynamic"``:
            ``"think"``, ``"recaption"`` and ``"image"`` pick the matching
            predefined prompt; any other value falls back to ``system_prompt``.
        system_prompt: Caller-supplied prompt text, returned as-is for
            ``"custom"`` and as the fallback in ``"dynamic"`` mode.

    Returns:
        The selected prompt string, or ``None`` when no system prompt applies
        (which includes ``"custom"`` with no ``system_prompt`` supplied).

    Raises:
        NotImplementedError: If ``sys_type`` is not a recognised selector.
    """
    # A real None means "no system prompt", the same as the "None" sentinel
    # string; previously it fell through to NotImplementedError.
    if sys_type is None or sys_type == "None":
        return None

    # Unified English prompt (combined T2I and TI2I guidelines).
    if sys_type == "en_unified":
        return unified_system_prompt_en

    # Predefined English text-to-image prompts.
    if sys_type in ("en_vanilla", "en_recaption", "en_think_recaption"):
        return t2i_system_prompts[sys_type][0]

    # Dynamic mode: pick the prompt that matches the task being run.
    if sys_type == "dynamic":
        if bot_task == "think":
            return t2i_system_prompts["en_think_recaption"][0]
        if bot_task == "recaption":
            return t2i_system_prompts["en_recaption"][0]
        if bot_task == "image":
            # Only this branch strips the surrounding newlines of the literal.
            return t2i_system_prompts["en_vanilla"][0].strip("\n")
        # Unknown task: fall back to whatever the caller supplied.
        return system_prompt

    # Custom mode: hand back the caller-supplied text untouched.
    if sys_type == "custom":
        return system_prompt

    raise NotImplementedError(f"Unsupported system prompt type: {sys_type}")
@field_validator("use_system_prompt")
@classmethod
def validate_use_system_prompt(cls, v):
    """Validate the ``use_system_prompt`` selector of an image request.

    Accepts ``None`` (field omitted / JSON ``null``), the string ``"None"``,
    and the preset names ``dynamic``, ``en_vanilla``, ``en_recaption``,
    ``en_think_recaption``, ``en_unified`` and ``custom``.

    Returns:
        The value unchanged when it is valid.

    Raises:
        ValueError: If the value is not one of the accepted selectors.
    """
    # "None" (the string) is accepted as well as a real None: the offline CLI
    # exposes it as a choice and this field's description lists it as an
    # option, so rejecting it here made the HTTP API inconsistent with both.
    valid_types = [
        None,
        "None",
        "dynamic",
        "en_vanilla",
        "en_recaption",
        "en_think_recaption",
        "en_unified",
        "custom",
    ]
    if v not in valid_types:
        raise ValueError(f"Invalid use_system_prompt type: {v}. Must be one of: {valid_types[1:] + [None]}")
    return v