Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
103 changes: 54 additions & 49 deletions tests/models/language/generation/test_common.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,62 +9,66 @@


def launch_lm_eval(eval_config):
trust_remote_code = eval_config.get('trust_remote_code', False)
dtype = eval_config.get('dtype', 'bfloat16')
max_num_seqs = eval_config.get('max_num_seqs', 128)
tp_size = int(os.environ.get('TP_SIZE', '1'))
enable_apc = os.environ.get('ENABLE_APC', 'True').lower() in ['true', '1']
enforce_eager = os.environ.get('ENFORCE_EAGER', 'False').lower() in ['true', '1']
kv_cache_dtype = os.environ.get('KV_CACHE_DTYPE', None)
task = eval_config.get('tasks', 'gsm8k')
async_scheduling = os.environ.get('ASYNC_SCHEDULING', 'False').lower() in ['true', '1']
max_model_len = eval_config.get('max_model_len', 4096)
batch_size = eval_config.get('batch_size', 'auto')
trust_remote_code = eval_config.get("trust_remote_code", False)
dtype = eval_config.get("dtype", "bfloat16")
max_num_seqs = eval_config.get("max_num_seqs", 128)
tp_size = int(os.environ.get("TP_SIZE", "1"))
enable_apc = os.environ.get("ENABLE_APC", "True").lower() in ["true", "1"]
enforce_eager = eval_config.get("enforce_eager", False)
if "ENFORCE_EAGER" in os.environ:
enforce_eager = os.environ["ENFORCE_EAGER"].lower() in ["true", "1"]
kv_cache_dtype = os.environ.get("KV_CACHE_DTYPE", None)
task = eval_config.get("tasks", "gsm8k")
async_scheduling = os.environ.get("ASYNC_SCHEDULING", "False").lower() in ["true", "1"]
max_model_len = eval_config.get("max_model_len", 4096)
batch_size = eval_config.get("batch_size", "auto")
model_args = {
'pretrained': eval_config['model_name'],
'tensor_parallel_size': tp_size,
'async_scheduling': async_scheduling,
'enforce_eager': enforce_eager,
'enable_prefix_caching': enable_apc,
'dtype': dtype,
'max_model_len': max_model_len,
'max_num_seqs': max_num_seqs,
'trust_remote_code': trust_remote_code,
'batch_size': batch_size,
'enable_expert_parallel': eval_config.get('enable_expert_parallel', False),
'chat_template_args': eval_config.get('chat_template_args', {}),
'seed': eval_config.get('seed', 42),
"pretrained": eval_config["model_name"],
"tensor_parallel_size": tp_size,
"async_scheduling": async_scheduling,
"enforce_eager": enforce_eager,
"enable_prefix_caching": enable_apc,
"dtype": dtype,
"max_model_len": max_model_len,
"max_num_seqs": max_num_seqs,
"trust_remote_code": trust_remote_code,
"batch_size": batch_size,
"enable_expert_parallel": eval_config.get("enable_expert_parallel", False),
"chat_template_args": eval_config.get("chat_template_args", {}),
"seed": eval_config.get("seed", 42),
}
if kv_cache_dtype is not None:
model_args['kv_cache_dtype'] = kv_cache_dtype
model_args["kv_cache_dtype"] = kv_cache_dtype

if eval_config.get('gpu_memory_utilization') is not None:
model_args['gpu_memory_utilization'] = eval_config['gpu_memory_utilization']
if eval_config.get('reasoning_parser') is not None:
model_args['reasoning_parser'] = eval_config['reasoning_parser']
if eval_config.get('max_num_batched_tokens') is not None:
model_args['max_num_batched_tokens'] = eval_config['max_num_batched_tokens']
if eval_config.get("gpu_memory_utilization") is not None:
model_args["gpu_memory_utilization"] = eval_config["gpu_memory_utilization"]
if eval_config.get("reasoning_parser") is not None:
model_args["reasoning_parser"] = eval_config["reasoning_parser"]
if eval_config.get("max_num_batched_tokens") is not None:
model_args["max_num_batched_tokens"] = eval_config["max_num_batched_tokens"]

if eval_config.get("inc"):
assert os.environ.get('QUANT_CONFIG', None), "must set QUANT_CONFIG environment variable for using INC"
model_args['quantization'] = 'inc' # for both calibration and quantization
assert os.environ.get("QUANT_CONFIG", None), "must set QUANT_CONFIG environment variable for using INC"
model_args["quantization"] = "inc" # for both calibration and quantization
if eval_config.get("fp8"): # for quantization in fp8
model_args['kv_cache_dtype'] = 'fp8_inc'
model_args["kv_cache_dtype"] = "fp8_inc"

kwargs = {}
if 'fewshot_as_multiturn' in eval_config:
kwargs['fewshot_as_multiturn'] = eval_config['fewshot_as_multiturn']
if 'apply_chat_template' in eval_config:
kwargs['apply_chat_template'] = eval_config['apply_chat_template']
if eval_config.get('max_gen_toks') is not None:
kwargs['gen_kwargs'] = f"max_gen_toks={eval_config['max_gen_toks']}"
if "fewshot_as_multiturn" in eval_config:
kwargs["fewshot_as_multiturn"] = eval_config["fewshot_as_multiturn"]
if "apply_chat_template" in eval_config:
kwargs["apply_chat_template"] = eval_config["apply_chat_template"]
if eval_config.get("max_gen_toks") is not None:
kwargs["gen_kwargs"] = f"max_gen_toks={eval_config['max_gen_toks']}"
llm = VLLM(**model_args)
results = lm_eval.simple_evaluate(model=llm,
tasks=[task],
num_fewshot=eval_config["num_fewshot"],
limit=eval_config["limit"],
batch_size="auto",
**kwargs)
results = lm_eval.simple_evaluate(
model=llm,
tasks=[task],
num_fewshot=eval_config["num_fewshot"],
limit=eval_config["limit"],
batch_size="auto",
**kwargs,
)
del llm
gc.collect()

Expand All @@ -75,11 +79,11 @@ def test_models(model_card_path, monkeypatch) -> None:
with open(model_card_path) as f:
model_card = yaml.safe_load(f)
print(f"{model_card=}")
model_config = model_card['model_card']
model_config = model_card["model_card"]
results = launch_lm_eval(model_config)
RTOL = 0.03
metric = model_card['metrics']
task = model_config['tasks']
metric = model_card["metrics"]
task = model_config["tasks"]
try:
measured_value = results["results"][task][metric["name"]]
except KeyError as e:
Expand All @@ -100,6 +104,7 @@ def __main__(args):

if __name__ == "__main__":
import argparse

parser = argparse.ArgumentParser(description="Test vLLM models with lm-eval")
parser.add_argument("--model_card_path", type=str, required=True, help="Path to the model card YAML file.")
args = parser.parse_args()
Expand Down
37 changes: 27 additions & 10 deletions vllm_gaudi/entrypoints/openai/multi_model_api_server.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
from contextlib import asynccontextmanager
from collections.abc import AsyncIterator
from dataclasses import dataclass
from typing import NamedTuple
from typing import Any, NamedTuple

import uvloop
import yaml
Expand Down Expand Up @@ -149,6 +149,19 @@ async def sleep(self, level: int = 1, mode: str = "abort") -> None:
async def wake_up(self, tags: list[str] | None = None) -> None:
await self._engine.wake_up(tags=tags)

async def notify_kv_transfer_request_rejected(
self,
request_id: str,
kv_transfer_params: dict[str, Any],
*,
data_parallel_rank: int | None = None,
) -> None:
await self._engine.notify_kv_transfer_request_rejected(
request_id=request_id,
kv_transfer_params=kv_transfer_params,
data_parallel_rank=data_parallel_rank,
)

async def is_sleeping(self) -> bool:
return await self._engine.is_sleeping()

Expand Down Expand Up @@ -299,7 +312,7 @@ def _resolve_frontend_settings(
if model_overrides.enable_auto_tool_choice is not None else args.enable_auto_tool_choice)
tool_call_parser = (model_overrides.tool_call_parser
if model_overrides.tool_call_parser is not None else args.tool_call_parser)
chat_template = (model_overrides.chat_template if model_overrides.chat_template is not None else args.chat_template)
chat_template = model_overrides.chat_template if model_overrides.chat_template is not None else args.chat_template
return FrontendSettings(
enable_auto_tool_choice=enable_auto_tool_choice,
tool_call_parser=tool_call_parser,
Expand All @@ -321,7 +334,7 @@ def _validate_model_frontend_overrides(
if effective_enable_auto and not effective_tool_parser:
raise ValueError(f"Model '{model_name}' enables auto tool choice but no tool_call_parser is set.")

if (effective_enable_auto and effective_tool_parser and effective_tool_parser not in valid_tool_parsers):
if effective_enable_auto and effective_tool_parser and effective_tool_parser not in valid_tool_parsers:
raise ValueError(f"Model '{model_name}' has invalid tool_call_parser='{effective_tool_parser}'. "
f"Valid options: {valid_tool_parsers}")

Expand Down Expand Up @@ -392,8 +405,7 @@ def _load_multi_model_config(path: str, ) -> MultiModelConfigLoadResult:
if default_model is None:
default_model = next(iter(model_configs.keys()))
if default_model not in model_configs:
raise ValueError(f"Default model '{default_model}' not found in config models: "
f"{list(model_configs.keys())}")
raise ValueError(f"Default model '{default_model}' not found in config models: {list(model_configs.keys())}")

return MultiModelConfigLoadResult(
model_configs=model_configs,
Expand All @@ -418,8 +430,13 @@ async def build_multi_model_engine_client(
args: Namespace,
*,
usage_context: UsageContext = UsageContext.OPENAI_API_SERVER,
) -> AsyncIterator[tuple[MultiModelEngineClient, MultiModelAsyncLLM, dict[str, BaseModelPath], dict[str, int], dict[
str, ModelFrontendOverrides]]]:
) -> AsyncIterator[tuple[
MultiModelEngineClient,
MultiModelAsyncLLM,
dict[str, BaseModelPath],
dict[str, int],
dict[str, ModelFrontendOverrides],
]]:
config_path = _resolve_multi_model_config_path()
if not config_path:
raise ValueError("A multi-model config path must be set when multi-model mode is enabled. "
Expand Down Expand Up @@ -504,7 +521,8 @@ async def _init_multi_model_state(
chat_template=resolved_chat_template,
chat_template_content_format=args.chat_template_content_format,
default_chat_template_kwargs=args.default_chat_template_kwargs,
trust_request_chat_template=args.trust_request_chat_template)
trust_request_chat_template=args.trust_request_chat_template,
)

if "generate" in supported_tasks:
from vllm.entrypoints.openai.generate.api_router import init_generate_state
Expand Down Expand Up @@ -537,8 +555,7 @@ async def _init_multi_model_state(

def _attach_multi_model_router(app: FastAPI) -> None:
if not envs.VLLM_SERVER_DEV_MODE:
logger.warning("The /v1/models/switch endpoint is disabled. "
"Set VLLM_SERVER_DEV_MODE=1 to enable it.")
logger.warning("The /v1/models/switch endpoint is disabled. Set VLLM_SERVER_DEV_MODE=1 to enable it.")
return

router = APIRouter()
Expand Down
12 changes: 1 addition & 11 deletions vllm_gaudi/ops/hpu_fused_moe.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,8 +22,6 @@
FusedTopKRouter, )
from vllm.model_executor.layers.fused_moe.router.grouped_topk_router import (
GroupedTopKRouter, )
from vllm.model_executor.layers.fused_moe.router.router_factory import (
EMPTY_EPLB_STATE, )
from vllm.model_executor.layers.fused_moe.router.routing_simulator_router import (
RoutingSimulatorRouter, )
from vllm.model_executor.layers.fused_moe.router.zero_expert_router import (
Expand Down Expand Up @@ -388,8 +386,7 @@ def create_fused_moe_router(
# custom routing parameters
custom_routing_function: Callable | None = None,
# eplb parameters
enable_eplb: bool = False,
eplb_state: EplbLayerState = EMPTY_EPLB_STATE,
eplb_state: EplbLayerState | None = None,
# zero expert parameters
zero_expert_type: str | None = None,
num_logical_experts: int | None = None,
Expand Down Expand Up @@ -428,7 +425,6 @@ def create_fused_moe_router(
custom_routing_function: Optional custom routing function

EPLB arguments:
enable_eplb: Whether EPLB is enabled
eplb_state: EPLB (Expert Parallelism Load Balancing) state

Zero expert arguments:
Expand All @@ -451,7 +447,6 @@ def create_fused_moe_router(
top_k=top_k,
global_num_experts=global_num_experts,
eplb_state=eplb_state,
enable_eplb=enable_eplb,
indices_type_getter=indices_type_getter,
)

Expand All @@ -468,7 +463,6 @@ def create_fused_moe_router(
scoring_func=scoring_func,
renormalize=renormalize,
routed_scaling_factor=routed_scaling_factor,
enable_eplb=enable_eplb,
indices_type_getter=indices_type_getter,
)

Expand All @@ -487,7 +481,6 @@ def create_fused_moe_router(
routed_scaling_factor=routed_scaling_factor,
e_score_correction_bias=e_score_correction_bias,
num_fused_shared_experts=num_fused_shared_experts,
enable_eplb=enable_eplb,
indices_type_getter=indices_type_getter,
)
return grouped_topk_router
Expand All @@ -499,7 +492,6 @@ def create_fused_moe_router(
eplb_state=eplb_state,
custom_routing_function=custom_routing_function,
renormalize=renormalize,
enable_eplb=enable_eplb,
indices_type_getter=indices_type_getter,
)

Expand All @@ -514,7 +506,6 @@ def create_fused_moe_router(
scoring_func=scoring_func,
renormalize=renormalize,
routed_scaling_factor=routed_scaling_factor,
enable_eplb=enable_eplb,
indices_type_getter=indices_type_getter,
hash_indices_table=hash_indices_table,
)
Expand All @@ -525,7 +516,6 @@ def create_fused_moe_router(
eplb_state=eplb_state,
renormalize=renormalize,
scoring_func=scoring_func,
enable_eplb=enable_eplb,
indices_type_getter=indices_type_getter,
)

Expand Down
Loading