4 changes: 4 additions & 0 deletions vllm_mlx/cli.py
@@ -44,6 +44,9 @@ def serve_command(args):
)
sys.exit(1)

+ # Pass alias info to server (for /v1/models)
+ server._model_alias = getattr(args, "_original_alias", None)
+
# Configure server security settings
server._api_key = args.api_key
server._default_timeout = args.timeout
@@ -1124,6 +1127,7 @@ def main():
resolved = resolve_model(args.model)
if resolved != args.model:
print(f" Alias: {args.model} → {resolved}")
+ args._original_alias = args.model
args.model = resolved

if args.command == "serve":
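Taken together, the cli.py changes thread the user-typed alias through to the server. A rough, self-contained sketch of that flow; the alias, model id, and the stand-in resolver are made up for illustration and are not part of the PR:

from argparse import Namespace

# Stand-in for the real resolver used in cli.py (mapping is hypothetical).
def resolve_model(name: str) -> str:
    aliases = {"qwen-7b": "mlx-community/Qwen2.5-7B-Instruct-4bit"}
    return aliases.get(name, name)

args = Namespace(model="qwen-7b")      # hypothetical alias typed by the user
resolved = resolve_model(args.model)
if resolved != args.model:
    args._original_alias = args.model  # remember the alias for /v1/models
args.model = resolved                  # serve under the canonical model id

# serve_command() then forwards it to the server:
# server._model_alias = getattr(args, "_original_alias", None)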
41 changes: 23 additions & 18 deletions vllm_mlx/server.py
@@ -126,6 +126,7 @@
# Global engine instance
_engine: BaseEngine | None = None
_model_name: str | None = None
+ _model_alias: str | None = None # Short alias used to start the model (if any)
_default_max_tokens: int = 32768
_default_timeout: float = 300.0 # Default request timeout in seconds (5 minutes)
_default_temperature: float | None = None # Set via --default-temperature
@@ -436,11 +437,10 @@ async def verify_api_key(credentials: HTTPAuthorizationCredentials = Depends(sec
global _auth_warning_logged

if _api_key is None:
- # Log warning once about running without authentication
+ # Log once at debug level — local inference rarely needs auth
if not _auth_warning_logged:
- logger.warning(
- "SECURITY WARNING: Server running without API key authentication. "
- "Anyone can access the API. Use --api-key to enable authentication."
+ logger.debug(
+ "No API key configured. Use --api-key to enable authentication."
)
_auth_warning_logged = True
return True # No auth required
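The downgrade only changes logging; authentication itself works as before, and clients still send a Bearer token whenever the server was started with --api-key. A client-side sketch, where the URL, key, and model name are assumptions rather than values from the PR:

import requests

# Assumed local endpoint and key; omit the header entirely if the server
# was started without --api-key (it then just logs a debug message).
url = "http://localhost:8000/v1/chat/completions"
headers = {"Authorization": "Bearer my-secret-key"}
payload = {
    "model": "qwen-7b",  # hypothetical; see the /v1/models and response-model changes below
    "messages": [{"role": "user", "content": "Hello"}],
}
resp = requests.post(url, json=payload, headers=headers, timeout=60)
print(resp.json())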
@@ -995,6 +995,9 @@ async def list_models() -> ModelsResponse:
models = []
if _model_name:
models.append(ModelInfo(id=_model_name))
+ # Also list the alias if the model was loaded via one
+ if _model_alias and _model_alias != _model_name:
+ models.append(ModelInfo(id=_model_alias))
return ModelsResponse(data=models)
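With the alias registered, /v1/models should now return two entries: the canonical model id and the alias the model was started under. A quick check, assuming a local server and the hypothetical names used in the sketches above:

import requests

models = requests.get("http://localhost:8000/v1/models", timeout=10).json()
print([m["id"] for m in models["data"]])
# Expected shape (illustrative values):
# ['mlx-community/Qwen2.5-7B-Instruct-4bit', 'qwen-7b']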


@@ -1611,7 +1614,7 @@ async def create_completion(request: CompletionRequest, raw_request: Request):
)

comp_response = CompletionResponse(
- model=request.model,
+ model=_model_name or request.model,
choices=choices,
usage=Usage(
prompt_tokens=total_prompt_tokens,
@@ -2043,7 +2046,7 @@ async def create_chat_completion(request: ChatCompletionRequest, raw_request: Request):
choice_logprobs = ChoiceLogProbs(content=token_logprobs_list)

chat_response = ChatCompletionResponse(
- model=request.model,
+ model=_model_name or request.model,
choices=[
ChatCompletionChoice(
message=AssistantMessage(
@@ -2211,7 +2214,7 @@ async def create_anthropic_message(

# Build OpenAI response to convert
openai_response = ChatCompletionResponse(
- model=openai_request.model,
+ model=_model_name or openai_request.model,
choices=[
ChatCompletionChoice(
message=AssistantMessage(
@@ -2229,7 +2232,9 @@
)

# Convert to Anthropic response
- anthropic_response = openai_to_anthropic(openai_response, anthropic_request.model)
+ anthropic_response = openai_to_anthropic(
+ openai_response, _model_name or anthropic_request.model
+ )
return Response(
content=anthropic_response.model_dump_json(exclude_none=True),
media_type="application/json",
@@ -2345,7 +2350,7 @@ async def _stream_anthropic_messages(
"id": msg_id,
"type": "message",
"role": "assistant",
- "model": anthropic_request.model,
+ "model": _model_name or anthropic_request.model,
"content": [],
"stop_reason": None,
"stop_sequence": None,
@@ -2473,7 +2478,7 @@ async def stream_completion(
"id": f"cmpl-{uuid.uuid4().hex[:8]}",
"object": "text_completion",
"created": int(time.time()),
- "model": request.model,
+ "model": _model_name or request.model,
"choices": [
{
"index": 0,
@@ -2524,7 +2529,7 @@ def _build_chunk_logprobs(output: GenerationOutput) -> ChoiceLogProbs | None:
# Pre-compute SSE template parts that don't change per-token.
# This avoids repeated f-string interpolation and time.time() syscalls.
_sse_created = int(time.time())
- _model_escaped = json.dumps(request.model) # Properly escape quotes/newlines
+ _model_escaped = json.dumps(_model_name or request.model)
_sse_prefix = (
f'data: {{"id":"{response_id}","object":"chat.completion.chunk",'
f'"created":{_sse_created},"model":{_model_escaped},'
@@ -2670,7 +2675,7 @@ def _fast_sse_chunk(
tool_calls_detected = True
chunk = ChatCompletionChunk(
id=response_id,
- model=request.model,
+ model=_model_name or request.model,
choices=[
ChatCompletionChunkChoice(
delta=ChatCompletionChunkDelta(
@@ -2724,7 +2729,7 @@ def _fast_sse_chunk(

chunk = ChatCompletionChunk(
id=response_id,
- model=request.model,
+ model=_model_name or request.model,
choices=[
ChatCompletionChunkChoice(
delta=ChatCompletionChunkDelta(
@@ -2780,7 +2785,7 @@ def _fast_sse_chunk(
tool_calls_detected = True
chunk = ChatCompletionChunk(
id=response_id,
- model=request.model,
+ model=_model_name or request.model,
choices=[
ChatCompletionChunkChoice(
delta=ChatCompletionChunkDelta(
@@ -2837,7 +2842,7 @@ def _fast_sse_chunk(

chunk = ChatCompletionChunk(
id=response_id,
- model=request.model,
+ model=_model_name or request.model,
choices=[
ChatCompletionChunkChoice(
delta=ChatCompletionChunkDelta(
@@ -2858,7 +2863,7 @@
if correction and correction.content:
correction_chunk = ChatCompletionChunk(
id=response_id,
- model=request.model,
+ model=_model_name or request.model,
choices=[
ChatCompletionChunkChoice(
delta=ChatCompletionChunkDelta(
@@ -2888,7 +2893,7 @@ def _fast_sse_chunk(
if result.tools_called:
tool_chunk = ChatCompletionChunk(
id=response_id,
- model=request.model,
+ model=_model_name or request.model,
choices=[
ChatCompletionChunkChoice(
delta=ChatCompletionChunkDelta(
@@ -2924,7 +2929,7 @@ def _fast_sse_chunk(
if include_usage:
usage_chunk = ChatCompletionChunk(
id=response_id,
- model=request.model,
+ model=_model_name or request.model,
choices=[], # Empty choices for usage-only chunk
usage=Usage(
prompt_tokens=prompt_tokens,
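The repeated model=_model_name or request.model substitutions all have the same effect: responses and stream chunks report the model actually loaded on the server, and only echo the client-supplied name when no model is registered. A minimal illustration of that fallback, using hypothetical names:

# What the client asked for vs. what the server has loaded (hypothetical).
request_model = "qwen-7b"
_model_name = "mlx-community/Qwen2.5-7B-Instruct-4bit"

# Before this PR the response echoed request_model; now the canonical
# name wins whenever it is known.
print(_model_name or request_model)   # -> mlx-community/Qwen2.5-7B-Instruct-4bit

# With no model registered, the old behaviour is preserved:
_model_name = None
print(_model_name or request_model)   # -> qwen-7b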