Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
76 commits
Select commit Hold shift + click to select a range
92e4a99
Implement EPD disaggregation
gty111 Oct 21, 2025
8c57bea
Fix model weights loading
gty111 Oct 22, 2025
104f763
Use zmq for transmitting embedding
gty111 Oct 22, 2025
2cfccfe
Fix MM embedding index
gty111 Oct 23, 2025
5738fd1
Fix split mm items
gty111 Oct 23, 2025
ad2c155
Fix MM embedding index
gty111 Oct 24, 2025
8cda7bf
Fix encoder OOM
gty111 Oct 24, 2025
93205ff
Clean up code
gty111 Oct 24, 2025
eaa3f17
Fix EP colocate and add prefill server ip arg
gty111 Oct 24, 2025
66090ad
Fix race condition
gty111 Oct 24, 2025
90f8507
Batch MM items again (OOM fixed by core binding)
gty111 Oct 27, 2025
dcab59a
Enable assign prefill IP on the fly
gty111 Oct 27, 2025
3a064cd
Fix rebase
gty111 Oct 27, 2025
f17fcb3
Fix image resize for Qwen
gty111 Oct 28, 2025
ec16b9b
Format
gty111 Oct 28, 2025
c56adb8
Format
gty111 Oct 28, 2025
1d68c52
Support qwen3_vl
gty111 Oct 29, 2025
25e8568
Support qwen3_vl_moe
gty111 Oct 29, 2025
714bad3
Support [E]+[PD colocate]
gty111 Oct 31, 2025
2c452f0
Support mooncake for transmission
gty111 Nov 4, 2025
88b74bd
Fix embedding shape
gty111 Nov 5, 2025
12a9451
Fix ib_device,mm_embedding shape,format
gty111 Nov 5, 2025
bf71817
Fix router args
gty111 Nov 5, 2025
c93c13d
Add mm_transfer_backend (zmq or mooncake)
gty111 Nov 5, 2025
b8eccf7
Fix embedding_port
gty111 Nov 6, 2025
da52835
Fix Qwen3-Omni
gty111 Nov 6, 2025
0321a09
Add params check and health check
gty111 Nov 6, 2025
d258499
Fix gpu_id, send_to_prefill_sockets
gty111 Nov 6, 2025
ecdb6ca
Support dots.vlm
gty111 Nov 6, 2025
a542377
Save prefill preprocess time for E disaggregation
gty111 Nov 7, 2025
7c1500e
feat: remove image URLs from prefill requests in EPD mode to reduce c…
liusy58 Nov 7, 2025
82258df
lint
gty111 Nov 11, 2025
a96594d
Rebase and clean up code
gty111 Nov 19, 2025
6b21c4f
Fix import
gty111 Nov 19, 2025
3a77446
Fix rebase
gty111 Nov 19, 2025
b7bed53
Revert changes
gty111 Nov 19, 2025
88c2884
Fix qwen2_5_vl
gty111 Nov 20, 2025
b69aaee
Get rid of the dependency on minlb
gty111 Nov 20, 2025
59d3c02
[feat] use `--random-image-count` to generate requests containing images…
liusy58 Nov 20, 2025
c1851b6
fix typo
liusy58 Nov 20, 2025
dabc6b3
Fix OOM for qwen3
gty111 Nov 20, 2025
43b32b3
Fix import
gty111 Nov 20, 2025
8377781
Remove async lock
gty111 Nov 21, 2025
88cdbb5
Support TP encoder
gty111 Nov 21, 2025
685cae1
lint
gty111 Nov 21, 2025
cbe0124
support encoder sending mmdata to scheduler directly.
liusy58 Nov 23, 2025
7c90f78
[fix] support --dist-init-addr
liusy58 Nov 24, 2025
484c647
Refactor and fix bugs (Add zmq_s zmq_t and mooncake backend)
gty111 Nov 25, 2025
868909a
Fix import
gty111 Nov 25, 2025
bb562bf
feat: support prefix_mm_cache
ZhengWG Nov 25, 2025
dec13c1
Lint
gty111 Nov 26, 2025
23c3293
Avoid time-consuming CPU concat
gty111 Nov 27, 2025
725318b
Rename zmq_s and zmq_t; Fix get_local_ip_by_remote
gty111 Nov 27, 2025
d3de248
Add waiting list for zmq_to_scheduler
gty111 Nov 27, 2025
4df4996
Fix duplicate disagg_mode
gty111 Nov 27, 2025
43101d6
use thread for receiving mm data (#4)
liusy58 Dec 1, 2025
4381201
Fix port and transfer backend (#5)
gty111 Dec 1, 2025
355e6e8
Fix comments (#6)
gty111 Dec 2, 2025
eaaf840
feat: add health/health_generate (#7)
ZhengWG Dec 3, 2025
02d5dfc
clean code for mm_cache && add para-check (#8)
ZhengWG Dec 3, 2025
4c74efc
Fix comments (#10)
gty111 Dec 5, 2025
51afd45
fix: skip init embed_tokens && skip send_encode without mm_inputs (#11)
ZhengWG Dec 6, 2025
c47bfce
[CI]: add EPD disaggregation integration tests (#9)
ZhengWG Dec 7, 2025
b6a5872
Merge branch 'main' into epd_rebase
ShangmingCai Dec 8, 2025
079f5d9
Merge branch 'main' into epd_rebase
gty111 Dec 9, 2025
67157bd
Merge branch 'main' into epd_rebase
liusy58 Dec 9, 2025
f990b42
Merge branch 'main' into epd_rebase
liusy58 Dec 9, 2025
eeadb68
Merge branch 'main' into epd_rebase
gty111 Dec 10, 2025
f3ceb9f
Lint
gty111 Dec 10, 2025
0b08b79
Merge branch 'main' into epd_rebase
gty111 Dec 10, 2025
b7b1ed8
Merge branch 'main' into epd_rebase
yhyang201 Dec 11, 2025
f2195db
ut: speed up epd_dis ut (#12)
ZhengWG Dec 12, 2025
b105ec1
Fix port allocation and other lints (#13)
gty111 Dec 13, 2025
05a0fbb
Merge branch 'main' into epd_rebase
gty111 Dec 13, 2025
6b5ded7
Fix merge
gty111 Dec 13, 2025
c9126ed
fix test
liusy58 Dec 14, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
40 changes: 35 additions & 5 deletions python/sglang/bench_serving.py
Original file line number Diff line number Diff line change
Expand Up @@ -820,6 +820,7 @@ def get_dataset(args, tokenizer, model_id=None):
image_format=args.image_format,
image_resolution=args.image_resolution,
backend=args.backend,
random_image_count=args.random_image_count,
)
elif args.dataset_name == "generated-shared-prefix":
assert not tokenize_prompt
Expand Down Expand Up @@ -1474,10 +1475,12 @@ def sample_image_requests(
image_format: str,
image_resolution: str,
backend: str,
random_image_count: bool = False,
) -> List[DatasetRow]:
"""Generate requests with images.

- Each request includes ``image_count`` images.
- If ``random_image_count`` is True, each request includes a random number of images between 1 and ``image_count``.
- If ``random_image_count`` is False, each request includes exactly ``image_count`` images.
- Supported resolutions: 4k (3840x2160), 1080p (1920x1080), 720p (1280x720), 360p (640x360),
or custom 'heightxwidth' (e.g., 1080x1920).
- Text lengths follow the 'random' dataset sampling rule. ``prompt_len``
Expand All @@ -1487,10 +1490,20 @@ def sample_image_requests(
# Parse resolution (supports presets and 'heightxwidth')
width, height = parse_image_resolution(image_resolution)

# Determine image counts for each request
if random_image_count:
# Random number of images per request
image_counts = np.random.randint(1, image_count + 1, size=num_requests)
total_images = np.sum(image_counts)
else:
# Fixed number of images per request
image_counts = np.full(num_requests, image_count)
total_images = image_count * num_requests

# Check for potentially problematic combinations and warn user
if width * height >= 1920 * 1080 and image_count * num_requests >= 100:
if width * height >= 1920 * 1080 and total_images >= 100:
warnings.warn(
f"High resolution ({width}x{height}) with {image_count * num_requests} total images "
f"High resolution ({width}x{height}) with {total_images} total images "
f"may take a long time. Consider reducing resolution or image count.",
UserWarning,
stacklevel=2,
Expand Down Expand Up @@ -1528,6 +1541,9 @@ def _gen_random_image_data_uri(
dataset: List[DatasetRow] = []
total_image_bytes = 0
for i in range(num_requests):
# Get the number of images for this request
request_image_count = int(image_counts[i])

# Generate text prompt
text_prompt = gen_mm_prompt(
processor.tokenizer,
Expand All @@ -1537,7 +1553,7 @@ def _gen_random_image_data_uri(

# Generate image list
images, images_base64, images_bytes = zip(
*[_gen_random_image_data_uri() for _ in range(image_count)]
*[_gen_random_image_data_uri() for _ in range(request_image_count)]
)
total_image_bytes += sum(list(images_bytes))

Expand All @@ -1549,11 +1565,20 @@ def _gen_random_image_data_uri(
processor,
backend,
)

dataset.append(data_row)

# Print statistics
print(f"#Input tokens: {np.sum([x.prompt_len for x in dataset])}")
print(f"#Output tokens: {np.sum([x.output_len for x in dataset])}")
print(f"#Total images: {total_images}")

if random_image_count:
print(
f"#Images per request: min={np.min(image_counts)}, max={np.max(image_counts)}, mean={np.mean(image_counts):.2f}"
)
else:
print(f"#Images per request: {image_count} (fixed)")

print(
f"\nCreated {len(dataset)} {image_content} {image_format} images with average {total_image_bytes // num_requests} bytes per request"
)
Expand Down Expand Up @@ -2700,6 +2725,11 @@ def __call__(self, parser, namespace, values, option_string=None):
"Supports presets 4k/1080p/720p/360p or custom 'heightxwidth' (e.g., 1080x1920)."
),
)
parser.add_argument(
"--random-image-count",
action="store_true",
help="Enable Random Image Count",
)
parser.add_argument(
"--image-format",
type=str,
Expand Down
6 changes: 5 additions & 1 deletion python/sglang/launch_server.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,11 +9,15 @@


def run_server(server_args):
"""Run the server based on server_args.grpc_mode."""
"""Run the server based on server_args.grpc_mode and server_args.encoder_only."""
if server_args.grpc_mode:
from sglang.srt.entrypoints.grpc_server import serve_grpc

asyncio.run(serve_grpc(server_args))
elif server_args.encoder_only:
from sglang.srt.disaggregation.encode_server import launch_server

launch_server(server_args)
else:
# Default mode: HTTP mode.
from sglang.srt.entrypoints.http_server import launch_server
Expand Down
7 changes: 7 additions & 0 deletions python/sglang/srt/configs/model_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -100,6 +100,8 @@ def __init__(
model_impl: Union[str, ModelImpl] = ModelImpl.AUTO,
sampling_defaults: str = "openai",
quantize_and_serve: bool = False,
encoder_only: bool = False,
language_only: bool = False,
) -> None:
# Parse args
self.model_path = model_path
Expand Down Expand Up @@ -216,6 +218,9 @@ def __init__(
self.hf_config, "image_token_id", None
) or getattr(self.hf_config, "image_token_index", None)

self.hf_config.encoder_only = encoder_only
self.hf_config.language_only = language_only

# matryoshka embeddings
self.matryoshka_dimensions = getattr(
self.hf_config, "matryoshka_dimensions", None
Expand Down Expand Up @@ -246,6 +251,8 @@ def from_server_args(
sampling_defaults=server_args.sampling_defaults,
quantize_and_serve=server_args.quantize_and_serve,
override_config_file=server_args.decrypted_config_file,
language_only=server_args.language_only,
encoder_only=server_args.encoder_only,
**kwargs,
)

Expand Down
Loading
Loading