Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
76 commits
Select commit Hold shift + click to select a range
92e4a99
Implement EPD disaggregation
gty111 Oct 21, 2025
8c57bea
Fix model weights loading
gty111 Oct 22, 2025
104f763
Use zmq for transmitting embedding
gty111 Oct 22, 2025
2cfccfe
Fix MM embedding index
gty111 Oct 23, 2025
5738fd1
Fix split mm items
gty111 Oct 23, 2025
ad2c155
Fix MM embedding index
gty111 Oct 24, 2025
8cda7bf
Fix encoder OOM
gty111 Oct 24, 2025
93205ff
Clean up code
gty111 Oct 24, 2025
eaa3f17
Fix EP colocate and add prefill server ip arg
gty111 Oct 24, 2025
66090ad
Fix race condition
gty111 Oct 24, 2025
90f8507
Batch MM items again (OOM fixed by core binding)
gty111 Oct 27, 2025
dcab59a
Enable assign prefill IP on the fly
gty111 Oct 27, 2025
3a064cd
Fix rebase
gty111 Oct 27, 2025
f17fcb3
Fix image resize for Qwen
gty111 Oct 28, 2025
ec16b9b
Format
gty111 Oct 28, 2025
c56adb8
Format
gty111 Oct 28, 2025
1d68c52
Support qwen3_vl
gty111 Oct 29, 2025
25e8568
Support qwen3_vl_moe
gty111 Oct 29, 2025
714bad3
Support [E]+[PD colocate]
gty111 Oct 31, 2025
2c452f0
Support mooncake for transmission
gty111 Nov 4, 2025
88b74bd
Fix embedding shape
gty111 Nov 5, 2025
12a9451
Fix ib_device,mm_embedding shape,format
gty111 Nov 5, 2025
bf71817
Fix router args
gty111 Nov 5, 2025
c93c13d
Add mm_transfer_backend (zmq or mooncake)
gty111 Nov 5, 2025
b8eccf7
Fix embedding_port
gty111 Nov 6, 2025
da52835
Fix Qwen3-Omni
gty111 Nov 6, 2025
0321a09
Add params check and health check
gty111 Nov 6, 2025
d258499
Fix gpu_id, send_to_prefill_sockets
gty111 Nov 6, 2025
ecdb6ca
Support dots.vlm
gty111 Nov 6, 2025
a542377
Save prefill preprocess time for E disaggregation
gty111 Nov 7, 2025
7c1500e
feat: remove image URLs from prefill requests in EPD mode to reduce c…
liusy58 Nov 7, 2025
82258df
lint
gty111 Nov 11, 2025
a96594d
Rebase and clean up code
gty111 Nov 19, 2025
6b21c4f
Fix import
gty111 Nov 19, 2025
3a77446
Fix rebase
gty111 Nov 19, 2025
b7bed53
Revert changes
gty111 Nov 19, 2025
88c2884
Fix qwen2_5_vl
gty111 Nov 20, 2025
b69aaee
Get rid of the dependency on minlb
gty111 Nov 20, 2025
59d3c02
[feat] use `--random-image-count` to generate requests containing images…
liusy58 Nov 20, 2025
c1851b6
fix typo
liusy58 Nov 20, 2025
dabc6b3
Fix OOM for qwen3
gty111 Nov 20, 2025
43b32b3
Fix import
gty111 Nov 20, 2025
8377781
Remove async lock
gty111 Nov 21, 2025
88cdbb5
Support TP encoder
gty111 Nov 21, 2025
685cae1
lint
gty111 Nov 21, 2025
cbe0124
support encoder sending mmdata to scheduler directly.
liusy58 Nov 23, 2025
7c90f78
[fix] support --dist-init-addr
liusy58 Nov 24, 2025
484c647
Refactor and fix bugs (Add zmq_s zmq_t and mooncake backend)
gty111 Nov 25, 2025
868909a
Fix import
gty111 Nov 25, 2025
bb562bf
feat: support prefix_mm_cache
ZhengWG Nov 25, 2025
dec13c1
Lint
gty111 Nov 26, 2025
23c3293
Avoid time-consuming CPU concat
gty111 Nov 27, 2025
725318b
Rename zmq_s and zmq_t; Fix get_local_ip_by_remote
gty111 Nov 27, 2025
d3de248
Add waiting list for zmq_to_scheduler
gty111 Nov 27, 2025
4df4996
Fix duplicate disagg_mode
gty111 Nov 27, 2025
43101d6
use thread for receiving mm data (#4)
liusy58 Dec 1, 2025
4381201
Fix port and transfer backend (#5)
gty111 Dec 1, 2025
355e6e8
Fix comments (#6)
gty111 Dec 2, 2025
eaaf840
feat: add health/health_generate (#7)
ZhengWG Dec 3, 2025
02d5dfc
clean code for mm_cache && add para-check (#8)
ZhengWG Dec 3, 2025
4c74efc
Fix comments (#10)
gty111 Dec 5, 2025
51afd45
fix: skip init embed_tokens && skip send_encode without mm_inputs (#11)
ZhengWG Dec 6, 2025
c47bfce
[CI]: add EPD disaggregation integration tests (#9)
ZhengWG Dec 7, 2025
b6a5872
Merge branch 'main' into epd_rebase
ShangmingCai Dec 8, 2025
079f5d9
Merge branch 'main' into epd_rebase
gty111 Dec 9, 2025
67157bd
Merge branch 'main' into epd_rebase
liusy58 Dec 9, 2025
f990b42
Merge branch 'main' into epd_rebase
liusy58 Dec 9, 2025
eeadb68
Merge branch 'main' into epd_rebase
gty111 Dec 10, 2025
f3ceb9f
Lint
gty111 Dec 10, 2025
0b08b79
Merge branch 'main' into epd_rebase
gty111 Dec 10, 2025
b7b1ed8
Merge branch 'main' into epd_rebase
yhyang201 Dec 11, 2025
f2195db
ut: speed up epd_dis ut (#12)
ZhengWG Dec 12, 2025
b105ec1
Fix port allocation and other lints (#13)
gty111 Dec 13, 2025
05a0fbb
Merge branch 'main' into epd_rebase
gty111 Dec 13, 2025
6b5ded7
Fix merge
gty111 Dec 13, 2025
c9126ed
fix test
liusy58 Dec 14, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
40 changes: 35 additions & 5 deletions python/sglang/bench_serving.py
Original file line number Diff line number Diff line change
Expand Up @@ -820,6 +820,7 @@ def get_dataset(args, tokenizer, model_id=None):
image_format=args.image_format,
image_resolution=args.image_resolution,
backend=args.backend,
random_image_count=args.random_image_count,
)
elif args.dataset_name == "generated-shared-prefix":
assert not tokenize_prompt
Expand Down Expand Up @@ -1474,10 +1475,12 @@ def sample_image_requests(
image_format: str,
image_resolution: str,
backend: str,
random_image_count: bool = False,
) -> List[DatasetRow]:
"""Generate requests with images.

- Each request includes ``image_count`` images.
- If ``random_image_count`` is True, each request includes a random number of images between 1 and ``image_count``.
- If ``random_image_count`` is False, each request includes exactly ``image_count`` images.
- Supported resolutions: 4k (3840x2160), 1080p (1920x1080), 720p (1280x720), 360p (640x360),
or custom 'heightxwidth' (e.g., 1080x1920).
- Text lengths follow the 'random' dataset sampling rule. ``prompt_len``
Expand All @@ -1487,10 +1490,20 @@ def sample_image_requests(
# Parse resolution (supports presets and 'heightxwidth')
width, height = parse_image_resolution(image_resolution)

# Determine image counts for each request
if random_image_count:
# Random number of images per request
image_counts = np.random.randint(1, image_count + 1, size=num_requests)
total_images = np.sum(image_counts)
else:
# Fixed number of images per request
image_counts = np.full(num_requests, image_count)
total_images = image_count * num_requests

# Check for potentially problematic combinations and warn user
if width * height >= 1920 * 1080 and image_count * num_requests >= 100:
if width * height >= 1920 * 1080 and total_images >= 100:
warnings.warn(
f"High resolution ({width}x{height}) with {image_count * num_requests} total images "
f"High resolution ({width}x{height}) with {total_images} total images "
f"may take a long time. Consider reducing resolution or image count.",
UserWarning,
stacklevel=2,
Expand Down Expand Up @@ -1528,6 +1541,9 @@ def _gen_random_image_data_uri(
dataset: List[DatasetRow] = []
total_image_bytes = 0
for i in range(num_requests):
# Get the number of images for this request
request_image_count = int(image_counts[i])

# Generate text prompt
text_prompt = gen_mm_prompt(
processor.tokenizer,
Expand All @@ -1537,7 +1553,7 @@ def _gen_random_image_data_uri(

# Generate image list
images, images_base64, images_bytes = zip(
*[_gen_random_image_data_uri() for _ in range(image_count)]
*[_gen_random_image_data_uri() for _ in range(request_image_count)]
)
total_image_bytes += sum(list(images_bytes))

Expand All @@ -1549,11 +1565,20 @@ def _gen_random_image_data_uri(
processor,
backend,
)

dataset.append(data_row)

# Print statistics
print(f"#Input tokens: {np.sum([x.prompt_len for x in dataset])}")
print(f"#Output tokens: {np.sum([x.output_len for x in dataset])}")
print(f"#Total images: {total_images}")

if random_image_count:
print(
f"#Images per request: min={np.min(image_counts)}, max={np.max(image_counts)}, mean={np.mean(image_counts):.2f}"
)
else:
print(f"#Images per request: {image_count} (fixed)")

print(
f"\nCreated {len(dataset)} {image_content} {image_format} images with average {total_image_bytes // num_requests} bytes per request"
)
Expand Down Expand Up @@ -2700,6 +2725,11 @@ def __call__(self, parser, namespace, values, option_string=None):
"Supports presets 4k/1080p/720p/360p or custom 'heightxwidth' (e.g., 1080x1920)."
),
)
parser.add_argument(
"--random-image-count",
action="store_true",
help="Enable Random Image Count",
)
parser.add_argument(
"--image-format",
type=str,
Expand Down
6 changes: 5 additions & 1 deletion python/sglang/launch_server.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,11 +9,15 @@


def run_server(server_args):
"""Run the server based on server_args.grpc_mode."""
"""Run the server based on server_args.grpc_mode and server_args.encoder_only."""
if server_args.grpc_mode:
from sglang.srt.entrypoints.grpc_server import serve_grpc

asyncio.run(serve_grpc(server_args))
elif server_args.encoder_only:
from sglang.srt.disaggregation.encode_server import launch_server

launch_server(server_args)
else:
# Default mode: HTTP mode.
from sglang.srt.entrypoints.http_server import launch_server
Expand Down
7 changes: 7 additions & 0 deletions python/sglang/srt/configs/model_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -100,6 +100,8 @@ def __init__(
model_impl: Union[str, ModelImpl] = ModelImpl.AUTO,
sampling_defaults: str = "openai",
quantize_and_serve: bool = False,
encoder_only: bool = False,
language_only: bool = False,
) -> None:
# Parse args
self.model_path = model_path
Expand Down Expand Up @@ -216,6 +218,9 @@ def __init__(
self.hf_config, "image_token_id", None
) or getattr(self.hf_config, "image_token_index", None)

self.hf_config.encoder_only = encoder_only
self.hf_config.language_only = language_only

# matryoshka embeddings
self.matryoshka_dimensions = getattr(
self.hf_config, "matryoshka_dimensions", None
Expand Down Expand Up @@ -246,6 +251,8 @@ def from_server_args(
sampling_defaults=server_args.sampling_defaults,
quantize_and_serve=server_args.quantize_and_serve,
override_config_file=server_args.decrypted_config_file,
language_only=server_args.language_only,
encoder_only=server_args.encoder_only,
**kwargs,
)

Expand Down
Loading
Loading