Merged

19 commits (changes shown from 14 commits)
a193c9f
Add OpenAI-Compatible models, completions, chat/completions endpoints
bbrowning Apr 8, 2025
92fdf6d
Use our own pydantic models for OpenAI Server APIs
bbrowning Apr 8, 2025
5bc5fed
Clean up some more usage of direct OpenAI types
bbrowning Apr 8, 2025
1dbdff1
ollama OpenAI-compatible completions and chat completions
bbrowning Apr 8, 2025
00c4493
OpenAI-compatible completions and chats for litellm and together
bbrowning Apr 8, 2025
15d37fd
Add unsupported OpenAI mixin to all remaining inference providers
bbrowning Apr 8, 2025
de01b14
Passthrough inference support for OpenAI-compatible APIs
bbrowning Apr 9, 2025
24cfa1e
Mark inline vllm as OpenAI unsupported inference
bbrowning Apr 9, 2025
a6cf8fa
OpenAI completion prompt can also be an array
bbrowning Apr 9, 2025
fcdeb3d
OpenAI completion prompt can also include tokens
bbrowning Apr 9, 2025
a1e9cff
Update spec with latest changes as well
bbrowning Apr 9, 2025
52b4766
Start some integration tests with an OpenAI client
bbrowning Apr 9, 2025
ef684ff
Fix openai_completion tests for ollama
bbrowning Apr 9, 2025
ac5dc8f
Add prompt_logprobs and guided_choice to OpenAI completions
bbrowning Apr 9, 2025
8d10556
Add basic tests for OpenAI Chat Completions API
bbrowning Apr 9, 2025
8f5cd49
vllm prompt_logprobs can also be 0
bbrowning Apr 9, 2025
a5827f7
Nvidia provider support for OpenAI API endpoints
bbrowning Apr 10, 2025
ffae192
Bug fixes for together.ai OpenAI endpoints
bbrowning Apr 10, 2025
31181c0
Fireworks provider support for OpenAI API endpoints
bbrowning Apr 10, 2025
932 changes: 932 additions & 0 deletions docs/_static/llama-stack-spec.html

Large diffs are not rendered by default.

665 changes: 665 additions & 0 deletions docs/_static/llama-stack-spec.yaml

Large diffs are not rendered by default.

313 changes: 313 additions & 0 deletions llama_stack/apis/inference/inference.py
@@ -442,6 +442,217 @@ class EmbeddingsResponse(BaseModel):
embeddings: List[List[float]]


@json_schema_type
class OpenAIUserMessageParam(BaseModel):
"""A message from the user in an OpenAI-compatible chat completion request.

:param role: Must be "user" to identify this as a user message
:param content: The content of the message, which can include text and other media
:param name: (Optional) The name of the user message participant.
"""

role: Literal["user"] = "user"
content: InterleavedContent
name: Optional[str] = None


@json_schema_type
class OpenAISystemMessageParam(BaseModel):
"""A system message providing instructions or context to the model.

:param role: Must be "system" to identify this as a system message
:param content: The content of the "system prompt". If multiple system messages are provided, they are concatenated. The underlying Llama Stack code may also add other system messages (for example, for formatting tool definitions).
:param name: (Optional) The name of the system message participant.
"""

role: Literal["system"] = "system"
content: InterleavedContent
name: Optional[str] = None


@json_schema_type
class OpenAIAssistantMessageParam(BaseModel):
"""A message containing the model's (assistant) response in an OpenAI-compatible chat completion request.

:param role: Must be "assistant" to identify this as the model's response
:param content: The content of the model's response
:param name: (Optional) The name of the assistant message participant.
:param tool_calls: List of tool calls. Each tool call is a ToolCall object.
"""

role: Literal["assistant"] = "assistant"
content: InterleavedContent
name: Optional[str] = None
tool_calls: Optional[List[ToolCall]] = Field(default_factory=list)


@json_schema_type
class OpenAIToolMessageParam(BaseModel):
"""A message representing the result of a tool invocation in an OpenAI-compatible chat completion request.

:param role: Must be "tool" to identify this as a tool response
:param tool_call_id: Unique identifier for the tool call this response is for
:param content: The response content from the tool
"""

role: Literal["tool"] = "tool"
tool_call_id: str
content: InterleavedContent


@json_schema_type
class OpenAIDeveloperMessageParam(BaseModel):
"""A message from the developer in an OpenAI-compatible chat completion request.

:param role: Must be "developer" to identify this as a developer message
:param content: The content of the developer message
:param name: (Optional) The name of the developer message participant.
"""

role: Literal["developer"] = "developer"
content: InterleavedContent
name: Optional[str] = None


OpenAIMessageParam = Annotated[
Union[
OpenAIUserMessageParam,
OpenAISystemMessageParam,
OpenAIAssistantMessageParam,
OpenAIToolMessageParam,
OpenAIDeveloperMessageParam,
],
Field(discriminator="role"),
]
register_schema(OpenAIMessageParam, name="OpenAIMessageParam")
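
As a quick illustration (not part of this diff), the role discriminator lets raw OpenAI-style dicts resolve to the right message class; a minimal sketch assuming Pydantic v2's TypeAdapter:

from pydantic import TypeAdapter

# Validate plain dicts against the discriminated union; the "role" field picks the class.
adapter = TypeAdapter(OpenAIMessageParam)

user_msg = adapter.validate_python({"role": "user", "content": "Hello!"})
assert isinstance(user_msg, OpenAIUserMessageParam)

tool_msg = adapter.validate_python(
    {"role": "tool", "tool_call_id": "call_123", "content": "42"}
)
assert isinstance(tool_msg, OpenAIToolMessageParam)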


@json_schema_type
class OpenAITopLogProb(BaseModel):
"""The top log probability for a token from an OpenAI-compatible chat completion response.

:param token: The token
:param bytes: (Optional) The bytes for the token
:param logprob: The log probability of the token
"""

token: str
bytes: Optional[List[int]] = None
logprob: float


@json_schema_type
class OpenAITokenLogProb(BaseModel):
"""The log probability for a token from an OpenAI-compatible chat completion response.

:param token: The token
:param bytes: (Optional) The bytes for the token
:param logprob: The log probability of the token
:param top_logprobs: The top log probabilities for the token
"""

token: str
bytes: Optional[List[int]] = None
logprob: float
top_logprobs: List[OpenAITopLogProb]


@json_schema_type
class OpenAIChoiceLogprobs(BaseModel):
"""The log probabilities for the tokens in the message from an OpenAI-compatible chat completion response.

:param content: (Optional) The log probabilities for the tokens in the message
:param refusal: (Optional) The log probabilities for the tokens in the refusal, if any
"""

content: Optional[List[OpenAITokenLogProb]] = None
refusal: Optional[List[OpenAITokenLogProb]] = None


@json_schema_type
class OpenAIChoice(BaseModel):
"""A choice from an OpenAI-compatible chat completion response.

:param message: The message from the model
:param finish_reason: The reason the model stopped generating
:param index: The index of the choice
:param logprobs: (Optional) The log probabilities for the tokens in the message
"""

message: OpenAIMessageParam
finish_reason: str
index: int
logprobs: Optional[OpenAIChoiceLogprobs] = None


@json_schema_type
class OpenAIChatCompletion(BaseModel):
"""Response from an OpenAI-compatible chat completion request.

:param id: The ID of the chat completion
:param choices: List of choices
:param object: The object type, which will be "chat.completion"
:param created: The Unix timestamp in seconds when the chat completion was created
:param model: The model that was used to generate the chat completion
"""

id: str
choices: List[OpenAIChoice]
object: Literal["chat.completion"] = "chat.completion"
created: int
model: str
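
For context (illustrative only, with made-up field values), a provider translating an upstream response into these models would assemble something like:

import time

response = OpenAIChatCompletion(
    id="chatcmpl-123",
    created=int(time.time()),
    model="meta-llama/Llama-3.1-8B-Instruct",  # placeholder model id
    choices=[
        OpenAIChoice(
            index=0,
            finish_reason="stop",
            message=OpenAIAssistantMessageParam(content="Hello! How can I help?"),
        )
    ],
)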


@json_schema_type
class OpenAICompletionLogprobs(BaseModel):
"""The log probabilities for the tokens in the message from an OpenAI-compatible completion response.

:param text_offset: (Optional) The offset of the token in the text
:param token_logprobs: (Optional) The log probabilities for the tokens
:param tokens: (Optional) The tokens
:param top_logprobs: (Optional) The top log probabilities for the tokens
"""

text_offset: Optional[List[int]] = None
token_logprobs: Optional[List[float]] = None
tokens: Optional[List[str]] = None
top_logprobs: Optional[List[Dict[str, float]]] = None


@json_schema_type
class OpenAICompletionChoice(BaseModel):
"""A choice from an OpenAI-compatible completion response.

:param finish_reason: The reason the model stopped generating
:param text: The text of the choice
:param index: The index of the choice
:param logprobs: (Optional) The log probabilities for the tokens in the choice
"""

finish_reason: str
text: str
index: int
logprobs: Optional[OpenAIChoiceLogprobs] = None


@json_schema_type
class OpenAICompletion(BaseModel):
Contributor

Should we just import from openai.types.chat as we did in openai_compat.py?

Collaborator Author

I actually started with that. However, the API codegen wasn't able to successfully run with those types. I don't recall the exact errors now, but I can try an example out in a bit just to document what the actual issue was there. A secondary concern would be whether we want direct control over the public-facing API of Llama Stack or whether we want to let new versions of the OpenAI python client impact our API surface.

Collaborator Author

Here's an example of the kinds of errors the API spec codegen throws when using any of the OpenAI python client's types in our API:

Traceback (most recent call last):
  File "/Users/bbrowning/.pyenv/versions/3.10.16/lib/python3.10/runpy.py", line 196, in _run_module_as_main
    return _run_code(code, main_globals, None,
  File "/Users/bbrowning/.pyenv/versions/3.10.16/lib/python3.10/runpy.py", line 86, in _run_code
    exec(code, run_globals)
  File "/Volumes/SourceCode/llama-stack/docs/openapi_generator/generate.py", line 91, in <module>
    fire.Fire(main)
  File "/Users/bbrowning/.cache/uv/archive-v0/fsDRrIpMBoxSdg6tsSQLY/lib/python3.10/site-packages/fire/core.py", line 135, in Fire
    component_trace = _Fire(component, args, parsed_flag_args, context, name)
  File "/Users/bbrowning/.cache/uv/archive-v0/fsDRrIpMBoxSdg6tsSQLY/lib/python3.10/site-packages/fire/core.py", line 468, in _Fire
    component, remaining_args = _CallAndUpdateTrace(
  File "/Users/bbrowning/.cache/uv/archive-v0/fsDRrIpMBoxSdg6tsSQLY/lib/python3.10/site-packages/fire/core.py", line 684, in _CallAndUpdateTrace
    component = fn(*varargs, **kwargs)
  File "/Volumes/SourceCode/llama-stack/docs/openapi_generator/generate.py", line 55, in main
    spec = Specification(
  File "/Volumes/SourceCode/llama-stack/docs/openapi_generator/pyopenapi/utility.py", line 29, in __init__
    self.document = generator.generate()
  File "/Volumes/SourceCode/llama-stack/docs/openapi_generator/pyopenapi/generator.py", line 781, in generate
    operation = self._build_operation(op)
  File "/Volumes/SourceCode/llama-stack/docs/openapi_generator/pyopenapi/generator.py", line 691, in _build_operation
    responses = response_builder.build_response(response_options)
  File "/Volumes/SourceCode/llama-stack/docs/openapi_generator/pyopenapi/generator.py", line 374, in build_response
    responses[status_code] = self._build_response(
  File "/Volumes/SourceCode/llama-stack/docs/openapi_generator/pyopenapi/generator.py", line 393, in _build_response
    content=self.content_builder.build_content(response_type, examples),
  File "/Volumes/SourceCode/llama-stack/docs/openapi_generator/pyopenapi/generator.py", line 216, in build_content
    return {media_type: self.build_media_type(item_type, examples)}
  File "/Volumes/SourceCode/llama-stack/docs/openapi_generator/pyopenapi/generator.py", line 221, in build_media_type
    schema = self.schema_builder.classdef_to_ref(item_type)
  File "/Volumes/SourceCode/llama-stack/docs/openapi_generator/pyopenapi/generator.py", line 135, in classdef_to_ref
    type_schema = self.classdef_to_schema(typ)
  File "/Volumes/SourceCode/llama-stack/docs/openapi_generator/pyopenapi/generator.py", line 116, in classdef_to_schema
    type_schema, type_definitions = self.schema_generator.classdef_to_schema(typ)
  File "/Volumes/SourceCode/llama-stack/llama_stack/strong_typing/schema.py", line 612, in classdef_to_schema
    types_defined[sub_name] = self._type_to_schema_with_lookup(sub_type)
  File "/Volumes/SourceCode/llama-stack/llama_stack/strong_typing/schema.py", line 569, in _type_to_schema_with_lookup
    type_schema = self.type_to_schema(data_type, force_expand=True)
  File "/Volumes/SourceCode/llama-stack/llama_stack/strong_typing/schema.py", line 321, in type_to_schema
    return self._type_to_schema(data_type, force_expand, json_schema_extra) | common_info
  File "/Volumes/SourceCode/llama-stack/llama_stack/strong_typing/schema.py", line 518, in _type_to_schema
    property_def = self.type_to_schema(property_type, json_schema_extra=json_schema_extra)
  File "/Volumes/SourceCode/llama-stack/llama_stack/strong_typing/schema.py", line 321, in type_to_schema
    return self._type_to_schema(data_type, force_expand, json_schema_extra) | common_info
  File "/Volumes/SourceCode/llama-stack/llama_stack/strong_typing/schema.py", line 495, in _type_to_schema
    for property_name, property_type in get_class_properties(typ):
  File "/Volumes/SourceCode/llama-stack/llama_stack/strong_typing/inspection.py", line 571, in get_class_properties
    resolved_hints = get_resolved_hints(typ)
  File "/Volumes/SourceCode/llama-stack/llama_stack/strong_typing/inspection.py", line 557, in get_resolved_hints
    return typing.get_type_hints(typ, include_extras=True)
  File "/Users/bbrowning/.pyenv/versions/3.10.16/lib/python3.10/typing.py", line 1833, in get_type_hints
    value = _eval_type(value, base_globals, base_locals)
  File "/Users/bbrowning/.pyenv/versions/3.10.16/lib/python3.10/typing.py", line 327, in _eval_type
    return t._evaluate(globalns, localns, recursive_guard)
  File "/Users/bbrowning/.pyenv/versions/3.10.16/lib/python3.10/typing.py", line 694, in _evaluate
    eval(self.__forward_code__, globalns, localns),
  File "<string>", line 1, in <module>
NameError: name 'ClassVar' is not defined

It's probably solvable, but something about how the OpenAI types use ClassVar isn't liked by the strong_typing code in Llama Stack.

"""Response from an OpenAI-compatible completion request.

:param id: The ID of the completion
:param choices: List of choices
:param created: The Unix timestamp in seconds when the completion was created
:param model: The model that was used to generate the completion
:param object: The object type, which will be "text_completion"
"""

id: str
choices: List[OpenAICompletionChoice]
created: int
model: str
object: Literal["text_completion"] = "text_completion"


class ModelStore(Protocol):
async def get_model(self, identifier: str) -> Model: ...

@@ -564,3 +775,105 @@ async def embeddings(
:returns: An array of embeddings, one for each content. Each embedding is a list of floats. The dimensionality of the embedding is model-specific; you can check model metadata using /models/{model_id}
"""
...

@webmethod(route="/openai/v1/completions", method="POST")
async def openai_completion(
Contributor

I wonder if we should have this under apis/openai/ so that OpenAI related things are in one place.

Collaborator Author

That's reasonable, and I went back-and-forth a bit here myself. I put the OpenAI models API endpoint under our models.py file and the OpenAI inference endpoints under our inference.py file simply because they mapped nicely to existing constructs. But, I don't have a strong preference there.

self,
# Standard OpenAI completion parameters
model: str,
prompt: Union[str, List[str], List[int], List[List[int]]],
best_of: Optional[int] = None,
echo: Optional[bool] = None,
frequency_penalty: Optional[float] = None,
logit_bias: Optional[Dict[str, float]] = None,
logprobs: Optional[bool] = None,
max_tokens: Optional[int] = None,
n: Optional[int] = None,
presence_penalty: Optional[float] = None,
seed: Optional[int] = None,
stop: Optional[Union[str, List[str]]] = None,
stream: Optional[bool] = None,
stream_options: Optional[Dict[str, Any]] = None,
temperature: Optional[float] = None,
top_p: Optional[float] = None,
user: Optional[str] = None,
# vLLM-specific parameters
guided_choice: Optional[List[str]] = None,
prompt_logprobs: Optional[int] = None,
) -> OpenAICompletion:
"""Generate an OpenAI-compatible completion for the given prompt using the specified model.

:param model: The identifier of the model to use. The model must be registered with Llama Stack and available via the /models endpoint.
:param prompt: The prompt to generate a completion for
:param best_of: (Optional) The number of completions to generate server-side, returning only the one with the highest log probability per token
:param echo: (Optional) Whether to echo the prompt in addition to the completion
:param frequency_penalty: (Optional) Penalty applied to new tokens based on their frequency in the text so far
:param logit_bias: (Optional) The logit bias to use
:param logprobs: (Optional) Whether to return log probabilities of the output tokens
:param max_tokens: (Optional) The maximum number of tokens to generate
:param n: (Optional) The number of completions to generate
:param presence_penalty: (Optional) Penalty applied to new tokens based on whether they already appear in the text so far
:param seed: (Optional) The seed to use
:param stop: (Optional) The stop tokens to use
:param stream: (Optional) Whether to stream the response
:param stream_options: (Optional) The stream options to use
:param temperature: (Optional) The temperature to use
:param top_p: (Optional) The top p to use
:param user: (Optional) A unique identifier representing the end user
:param guided_choice: (Optional) vLLM-specific parameter that constrains the generated text to one of the provided choices
:param prompt_logprobs: (Optional) vLLM-specific parameter for the number of log probabilities to return for prompt tokens; may be 0
:returns: An OpenAICompletion
"""
...
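
Rough usage sketch (not part of this diff): a stock OpenAI client can call this route directly, assuming a Llama Stack server on the default local port and an already-registered model id (both placeholders). The non-standard vLLM parameters go through extra_body:

from openai import OpenAI

client = OpenAI(base_url="http://localhost:8321/openai/v1", api_key="none")

completion = client.completions.create(
    model="meta-llama/Llama-3.1-8B-Instruct",  # placeholder model id
    prompt="Say hello in one word:",
    max_tokens=5,
    # guided_choice / prompt_logprobs are not standard OpenAI fields, so they are
    # forwarded in the raw request body.
    extra_body={"guided_choice": ["hello", "hi"], "prompt_logprobs": 0},
)
print(completion.choices[0].text)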

@webmethod(route="/openai/v1/chat/completions", method="POST")
async def openai_chat_completion(
self,
model: str,
messages: List[OpenAIMessageParam],
frequency_penalty: Optional[float] = None,
function_call: Optional[Union[str, Dict[str, Any]]] = None,
functions: Optional[List[Dict[str, Any]]] = None,
logit_bias: Optional[Dict[str, float]] = None,
logprobs: Optional[bool] = None,
max_completion_tokens: Optional[int] = None,
max_tokens: Optional[int] = None,
n: Optional[int] = None,
parallel_tool_calls: Optional[bool] = None,
presence_penalty: Optional[float] = None,
response_format: Optional[Dict[str, str]] = None,
seed: Optional[int] = None,
stop: Optional[Union[str, List[str]]] = None,
stream: Optional[bool] = None,
stream_options: Optional[Dict[str, Any]] = None,
temperature: Optional[float] = None,
tool_choice: Optional[Union[str, Dict[str, Any]]] = None,
tools: Optional[List[Dict[str, Any]]] = None,
Contributor

Should we define a tool type for this?

top_logprobs: Optional[int] = None,
top_p: Optional[float] = None,
user: Optional[str] = None,
) -> OpenAIChatCompletion:
Contributor

Is this correctly typed for streaming?

Collaborator Author

No, it's not. The type doesn't cover the streaming case at all, so even though streaming works in practice with the API as-is when used from OpenAI clients, the typing for streaming isn't handled yet.

"""Generate an OpenAI-compatible chat completion for the given messages using the specified model.

:param model: The identifier of the model to use. The model must be registered with Llama Stack and available via the /models endpoint.
:param messages: List of messages in the conversation
:param frequency_penalty: (Optional) Penalty applied to new tokens based on their frequency in the text so far
:param function_call: (Optional) Deprecated in favor of tool_choice; controls which function is called by the model
:param functions: (Optional) Deprecated in favor of tools; list of function definitions the model may call
:param logit_bias: (Optional) The logit bias to use
:param logprobs: (Optional) Whether to return log probabilities of the output tokens
:param max_completion_tokens: (Optional) The maximum number of tokens that can be generated in the completion
:param max_tokens: (Optional) Deprecated in favor of max_completion_tokens; the maximum number of tokens to generate
:param n: (Optional) The number of completions to generate
:param parallel_tool_calls: (Optional) Whether the model may call multiple tools in parallel
:param presence_penalty: (Optional) Penalty applied to new tokens based on whether they already appear in the text so far
:param response_format: (Optional) The response format to use
:param seed: (Optional) The seed to use
:param stop: (Optional) The stop tokens to use
:param stream: (Optional) Whether to stream the response
:param stream_options: (Optional) The stream options to use
:param temperature: (Optional) The temperature to use
:param tool_choice: (Optional) Controls which (if any) tool is called by the model
:param tools: (Optional) List of tool definitions the model may call
:param top_logprobs: (Optional) The number of top log probabilities to return for each output token
:param top_p: (Optional) The top p to use
:param user: (Optional) A unique identifier representing the end user
:returns: An OpenAIChatCompletion
"""
...
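
A corresponding sketch for the chat route (same placeholder base_url and model id as above); as noted in the review thread, streaming works from OpenAI clients even though the declared return type doesn't model it yet:

from openai import OpenAI

client = OpenAI(base_url="http://localhost:8321/openai/v1", api_key="none")

chat = client.chat.completions.create(
    model="meta-llama/Llama-3.1-8B-Instruct",  # placeholder model id
    messages=[{"role": "user", "content": "Tell me a one-line joke."}],
)
print(chat.choices[0].message.content)

# Streaming: the server emits chunks that the client yields incrementally.
stream = client.chat.completions.create(
    model="meta-llama/Llama-3.1-8B-Instruct",  # placeholder model id
    messages=[{"role": "user", "content": "Count to three."}],
    stream=True,
)
for chunk in stream:
    if chunk.choices:
        print(chunk.choices[0].delta.content or "", end="")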
23 changes: 23 additions & 0 deletions llama_stack/apis/models/models.py
@@ -56,12 +56,35 @@ class ListModelsResponse(BaseModel):
data: List[Model]


@json_schema_type
class OpenAIModel(BaseModel):
"""A model from OpenAI.

:param id: The ID of the model
:param object: The object type, which will be "model"
:param created: The Unix timestamp in seconds when the model was created
:param owned_by: The owner of the model
"""

id: str
object: Literal["model"] = "model"
created: int
owned_by: str


class OpenAIListModelsResponse(BaseModel):
data: List[OpenAIModel]


@runtime_checkable
@trace_protocol
class Models(Protocol):
@webmethod(route="/models", method="GET")
async def list_models(self) -> ListModelsResponse: ...

@webmethod(route="/openai/v1/models", method="GET")
async def openai_list_models(self) -> OpenAIListModelsResponse: ...
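
Illustrative only (same placeholder client as in the inference examples): listing models through the OpenAI-compatible route with the stock OpenAI client.

from openai import OpenAI

client = OpenAI(base_url="http://localhost:8321/openai/v1", api_key="none")

for model in client.models.list():
    print(model.id, model.owned_by)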

@webmethod(route="/models/{model_id:path}", method="GET")
async def get_model(
self,