Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
148 changes: 148 additions & 0 deletions benchmark/prefill_only/bench_embeddings.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,148 @@
"""
SGLang Embeddings Benchmark Script

This script benchmarks SGLang's /v1/embeddings API performance using HTTP requests.

Features:
- HTTP-only implementation
- Uses /v1/embeddings API endpoint directly
- Configurable RPS, duration, and batch sizes
- Progress tracking and detailed metrics
- Poisson and constant request distributions

Usage:
- Update configuration variables at the top of the file
- Ensure SGLang server is running on the configured HTTP_URL
- Run: python bench_embeddings.py
"""

import asyncio
import logging

from transformers import AutoTokenizer
from util import (
BenchmarkConfig,
generate_text_with_token_count,
run_benchmark_main,
run_generic_benchmark,
)

# Configure logging
logging.basicConfig(
    level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s"
)
logger = logging.getLogger(__name__)

###############################################################################
# CONFIG
###############################################################################
# Create benchmark configuration
config = BenchmarkConfig()
config.rps_values = [500]  # target requests-per-second values to sweep
config.duration_secs_values = [60]  # duration (seconds) of each benchmark run
config.num_unique_requests = 100  # distinct pre-built requests cycled during the run
config.distribution = "POISSON"  # request arrival pattern (Poisson or constant)
config.profile = False
config.freeze_gc = True  # Enable GC freeze functionality
# Profiler output directory - by default uses present working directory (pwd)
# Uncomment and customize the line below to override the default location:
# config.profiler_dir = "/sglang-oss-trace"

# HTTP Configuration
HTTP_URL = "http://localhost:30000/v1/embeddings"

# Embeddings API Config
# NOTE(review): the leading "/" makes this an absolute filesystem path; if a
# HuggingFace hub id was intended (as in bench_score.py's "Qwen/Qwen3-0.6B"),
# it should be "Qwen/Qwen3-Embedding-0.6B" — confirm which is intended.
EMBEDDINGS_MODEL_PATH = "/Qwen/Qwen3-Embedding-0.6B"
BATCH_SIZE = [1]  # Number of items per request (batch size)

# Configurable input token length
EMBEDDINGS_INPUT_TOKENS = 500  # Default token length

# Load tokenizer once for embeddings text generation
# (runs at import time; requires the model path above to be reachable)
print("Loading tokenizer for embeddings input generation...")
embeddings_tokenizer = AutoTokenizer.from_pretrained(EMBEDDINGS_MODEL_PATH)

# Generate input text with the specified token length using pre-loaded tokenizer
EMBEDDINGS_INPUT_TEXT = generate_text_with_token_count(
    EMBEDDINGS_MODEL_PATH,
    EMBEDDINGS_INPUT_TOKENS,
    config.special_replicated_token,
    tokenizer=embeddings_tokenizer,
)


###############################################################################
# REQUEST GENERATION (in parallel)
###############################################################################
def build_embeddings_request(index: int, item_count: int) -> tuple:
    """Construct one /v1/embeddings request payload.

    Returns ``(index, payload_dict)`` on success, or ``(index, None)`` if
    anything goes wrong while building the payload.
    """
    try:
        # The embeddings API accepts either a bare string or a list of
        # strings; send a bare string when the batch holds a single item.
        if item_count == 1:
            payload_input = EMBEDDINGS_INPUT_TEXT
        else:
            payload_input = [EMBEDDINGS_INPUT_TEXT] * item_count
        payload = {
            "input": payload_input,
            "model": EMBEDDINGS_MODEL_PATH,
        }
        return (index, payload)
    except Exception as e:
        logger.error(f"Error building request {index}: {e}")
        return (index, None)


def validate_embeddings_response(response_data: dict) -> bool:
    """Return True iff the embeddings API response carries a ``data`` field."""
    has_data = "data" in response_data
    return has_data


def build_warmup_embeddings_request() -> dict:
    """Build a single-input warmup payload for the embeddings API."""
    # A single string input keeps the warmup request as light as possible.
    warmup_payload = {
        "input": EMBEDDINGS_INPUT_TEXT,
        "model": EMBEDDINGS_MODEL_PATH,
    }
    return warmup_payload


###############################################################################
# MAIN
###############################################################################
async def run_benchmark(rps, duration_secs, item_count):
    """Execute one embeddings benchmark pass at the given load settings.

    Delegates to the shared generic driver, wiring in the embeddings-specific
    request builder and response validator.
    """
    benchmark_kwargs = dict(
        rps=rps,
        duration_secs=duration_secs,
        item_count=item_count,
        config=config,
        http_url=HTTP_URL,
        build_request_func=build_embeddings_request,
        response_validator=validate_embeddings_response,
        api_name="EMBEDDINGS",
        request_description="embeddings requests",
    )
    return await run_generic_benchmark(**benchmark_kwargs)


async def main():
    """Prepare run metadata and hand control to the shared benchmark driver."""
    # Truncate the preview so long generated inputs don't flood the report.
    preview_limit = 100
    if len(EMBEDDINGS_INPUT_TEXT) > preview_limit:
        input_preview = EMBEDDINGS_INPUT_TEXT[:preview_limit] + "..."
    else:
        input_preview = EMBEDDINGS_INPUT_TEXT

    additional_info = {
        "Input text length": f"{EMBEDDINGS_INPUT_TOKENS} tokens",
        "Input text preview": input_preview,
    }

    await run_benchmark_main(
        config,
        run_benchmark,
        "EMBEDDINGS",
        HTTP_URL,
        BATCH_SIZE,
        additional_info,
        build_warmup_embeddings_request,
    )


# Script entry point: run the async benchmark driver to completion.
if __name__ == "__main__":
    asyncio.run(main())
192 changes: 192 additions & 0 deletions benchmark/prefill_only/bench_score.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,192 @@
"""
SGLang Scoring Benchmark Script

This script benchmarks SGLang's scoring API performance using HTTP requests.

Current Features:
- HTTP-only implementation (open source compatible)
- Uses /v1/score API endpoint directly
- Single item scoring with batching support
- Configurable RPS, duration, and batch sizes
- Progress tracking and detailed metrics
- Poisson and constant request distributions

Usage:
- Update configuration variables at the top of the file
- Ensure SGLang server is running on the configured HTTP_URL
- Run: python bench_score.py
- Each request will contain ITEM_COUNT_VALUES items for batch scoring

"""

import asyncio

from transformers import AutoTokenizer
from util import (
BenchmarkConfig,
generate_text_with_token_count,
run_benchmark_main,
run_generic_benchmark,
)

###############################################################################
# CONFIG
###############################################################################
# Create benchmark configuration
config = BenchmarkConfig()
config.rps_values = [160]  # target requests-per-second values to sweep
config.duration_secs_values = [60]  # duration (seconds) of each benchmark run
config.num_unique_requests = 100  # distinct pre-built requests cycled during the run
config.distribution = "POISSON"  # request arrival pattern (Poisson or constant)
config.profile = False
config.freeze_gc = True  # Enable GC freeze functionality
# Profiler output directory - by default uses present working directory (pwd)
# Uncomment and customize the line below to override the default location:
# config.profiler_dir = "/sglang-oss-trace"

# HTTP Configuration
HTTP_URL = "http://localhost:30000/v1/score"  # Use score API directly

# Score API Config
# ITEM_COUNT_VALUES determines number of items per score request (batch size)
SCORE_QUERY_TOKENS = 120  # tokens in the query portion of each request
SCORE_ITEM_TOKENS = 180  # tokens in each scored item
SCORE_MODEL_PATH = "Qwen/Qwen3-0.6B"
SCORE_LABEL_TOKEN_IDS = [9454, 2753]  # Yes/No token IDs
ITEM_COUNT_VALUES = [10]  # Number of items per request

# Special token to replicate for precise token counting
# NOTE(review): this constant appears unused — the code below reads
# config.special_replicated_token instead; confirm and remove if dead.
SPECIAL_REPLICATED_TOKEN = "<|im_start|>"


###############################################################################
# REQUEST GENERATION (in parallel)
###############################################################################
def create_score_request_builder():
    """Return a request-builder closure sharing one loaded tokenizer.

    The tokenizer is loaded once here so every request built by the returned
    closure reuses it instead of re-loading per request.
    """
    print("Loading tokenizer...")
    tokenizer = AutoTokenizer.from_pretrained(SCORE_MODEL_PATH)

    # Sanity-report how many tokens the replicated special token encodes to;
    # precise token counting relies on it being a single token.
    encoded_special = tokenizer.encode(
        config.special_replicated_token, add_special_tokens=False
    )
    print(
        f"Special token '{config.special_replicated_token}' produces "
        f"{len(encoded_special)} token(s)"
    )

    def _text_of(num_toks):
        """Generate text with a precise token count via the replicated token."""
        return generate_text_with_token_count(
            SCORE_MODEL_PATH,
            num_toks,
            config.special_replicated_token,
            tokenizer=tokenizer,
        )

    def build_score_request(index: int, item_count: int) -> tuple:
        """Build one score payload; returns (index, dict) or (index, None)."""
        try:
            # Query is generated first, then one item per requested batch slot.
            payload = {
                "query": _text_of(SCORE_QUERY_TOKENS),
                "items": [_text_of(SCORE_ITEM_TOKENS) for _ in range(item_count)],
                "label_token_ids": SCORE_LABEL_TOKEN_IDS,
                "model": SCORE_MODEL_PATH,
            }
            return (index, payload)
        except Exception as e:
            print(f"Error building request {index}: {e}")
            return (index, None)

    return build_score_request


def validate_score_response(response_data: dict) -> bool:
    """Return True iff the response carries a ``scores`` or ``logprobs`` field."""
    return any(key in response_data for key in ("scores", "logprobs"))


def build_warmup_score_request() -> dict:
    """Build a warmup request for the score API.

    Loads its own tokenizer: warmup runs once before the benchmark, so the
    extra load cost is acceptable and keeps this function self-contained.
    """
    tokenizer = AutoTokenizer.from_pretrained(SCORE_MODEL_PATH)

    def _gen(num_toks):
        # Generate text with a precise token count using the shared helper.
        return generate_text_with_token_count(
            SCORE_MODEL_PATH,
            num_toks,
            config.special_replicated_token,
            tokenizer=tokenizer,
        )

    return {
        "query": _gen(SCORE_QUERY_TOKENS),
        "items": [_gen(SCORE_ITEM_TOKENS) for _ in range(3)],
        "label_token_ids": SCORE_LABEL_TOKEN_IDS,
        "model": SCORE_MODEL_PATH,
        # Add missing parameters for consistency with the original warmup
        "apply_softmax": True,
        "item_first": False,
    }


###############################################################################
# MAIN
###############################################################################
async def run_benchmark(rps, duration_secs, item_count):
    """Execute one scoring benchmark pass at the given load settings.

    Builds a fresh request-builder closure (loading the tokenizer once per
    run) and delegates to the shared generic driver.
    """
    request_builder = create_score_request_builder()

    benchmark_kwargs = dict(
        rps=rps,
        duration_secs=duration_secs,
        item_count=item_count,
        config=config,
        http_url=HTTP_URL,
        build_request_func=request_builder,
        response_validator=validate_score_response,
        api_name="SINGLE_ITEM_SCORING",
        request_description="score requests",
    )
    return await run_generic_benchmark(**benchmark_kwargs)


async def main():
    """Run the scoring benchmark across every configured load point."""
    # Metadata echoed into the benchmark report alongside the results.
    run_details = {
        "Query tokens per request": SCORE_QUERY_TOKENS,
        "Item tokens per item": SCORE_ITEM_TOKENS,
    }

    await run_benchmark_main(
        config,
        run_benchmark,
        "SINGLE_ITEM_SCORING",
        HTTP_URL,
        ITEM_COUNT_VALUES,
        run_details,
        build_warmup_score_request,
    )


# Script entry point: run the async benchmark driver to completion.
if __name__ == "__main__":
    asyncio.run(main())
Loading
Loading