Skip to content
This repository was archived by the owner on Jun 3, 2025. It is now read-only.
Merged
7 changes: 3 additions & 4 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -144,11 +144,10 @@ def _parse_requirements_file(file_path):
]
# Requirement lists named for the yolov8, transformers, and
# sentence-transformers integrations. `_computer_vision_deps` and
# `_torch_deps` are defined outside this view.
_yolov8_integration_deps = _computer_vision_deps + ["ultralytics==8.0.124"]
# NOTE(review): this list carries two specifiers each for `transformers`,
# `datasets`, and `accelerate`. This looks like the old and new sides of a
# diff shown together -- confirm which set of pins is intended before relying
# on it.
_transformers_integration_deps = [
"transformers<4.35",
"datasets<=2.14.6",
"transformers<4.37",
"datasets<2.16",
"accelerate<0.26",
"scikit-learn",
"fschat==0.2.33",
"accelerate==0.24.1",
"seqeval",
]
_sentence_transformers_integration_deps = ["optimum-deepsparse"] + _torch_deps
Expand Down
55 changes: 17 additions & 38 deletions src/deepsparse/server/openai_server.py
Original file line number Diff line number Diff line change
Expand Up @@ -93,49 +93,27 @@ async def create_chat_completion(raw_request: Request):
request = ChatCompletionRequest(**await raw_request.json())
_LOGGER.debug("Received chat completion request %s" % request)

if isinstance(request.messages, str):
prompt = request.messages
else:
# else case assumes a FastChat-compliant dictionary
# Fetch a model-specific template from FastChat
_LOGGER.warning(
"A dictionary message was found. This dictionary must "
"be fastchat compliant."
)
try:
from fastchat.model.model_adapter import get_conversation_template
except ImportError as e:
return create_error_response(HTTPStatus.FAILED_DEPENDENCY, str(e))

conv = get_conversation_template(request.model)
message = request.messages
# add the model to the Conversation template, based on the given role
msg_role = message["role"]
if msg_role == "system":
conv.system_message = message["content"]
elif msg_role == "user":
conv.append_message(conv.roles[0], message["content"])
elif msg_role == "assistant":
conv.append_message(conv.roles[1], message["content"])
else:
return create_error_response(
HTTPStatus.BAD_REQUEST, "Message role not recognized"
)

# blank message to start generation
conv.append_message(conv.roles[1], None)
prompt = conv.get_prompt()

request_id = f"cmpl-{random_uuid()}"
created_time = int(time.time())
model = request.model

pipeline = app.model_to_pipeline.get(model)
if not pipeline:
return create_error_response(
HTTPStatus.BAD_REQUEST, f"{model} is not available"
HTTPStatus.BAD_REQUEST,
f"The model `{model}` does not exist.",
)

try:
prompt = pipeline.tokenizer.apply_chat_template(
conversation=request.messages,
Comment thread
mgoin marked this conversation as resolved.
Outdated
add_generation_prompt=request.add_generation_prompt,
tokenize=False,
Comment thread
mgoin marked this conversation as resolved.
Outdated
)
except Exception as e:
_LOGGER.error(f"Error in applying chat template from request: {str(e)}")
return create_error_response(HTTPStatus.BAD_REQUEST, str(e))

request_id = f"cmpl-{random_uuid()}"
created_time = int(time.time())

try:
sampling_params = dict(
presence_penalty=request.presence_penalty,
Expand Down Expand Up @@ -221,7 +199,8 @@ async def create_completion(raw_request: Request):
pipeline = app.model_to_pipeline.get(model)
if not pipeline:
return create_error_response(
HTTPStatus.BAD_REQUEST, f"{model} is not available"
HTTPStatus.BAD_REQUEST,
f"The model `{model}` does not exist.",
)

request_id = f"cmpl-{random_uuid()}"
Expand Down
3 changes: 2 additions & 1 deletion src/deepsparse/server/protocol.py
Original file line number Diff line number Diff line change
Expand Up @@ -107,7 +107,7 @@ class ChatCompletionRequest(BaseModel):
"""

model: Optional[str] = None
messages: Union[str, Dict[str, str]]
messages: Union[str, List[Dict[str, str]], Dict[str, str]]
temperature: Optional[float] = 0.7
top_p: Optional[float] = 1.0
n: Optional[int] = 1
Expand All @@ -123,6 +123,7 @@ class ChatCompletionRequest(BaseModel):
top_k: Optional[int] = -1
ignore_eos: Optional[bool] = False
use_beam_search: Optional[bool] = False
add_generation_prompt: Optional[bool] = True


class CompletionRequest(BaseModel):
Expand Down
13 changes: 13 additions & 0 deletions tests/server/test_openai.py
Original file line number Diff line number Diff line change
Expand Up @@ -107,6 +107,19 @@ def test_chat_completions_fastchat(client, model_card):
assert response.status_code == 200


def test_chat_completions_fastchat_list(client, model_card):
    """Post a system + user message list to the chat endpoint; expect HTTP 200."""
    # A list of role/content dicts rather than a single string or dict.
    conversation = [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": "Hello!"},
    ]
    chat_request = ChatCompletionRequest(
        messages=conversation,
        max_tokens=50,
        model=model_card.id,
    )
    payload = chat_request.dict()
    reply = client.post("/v1/chat/completions", json=payload)
    assert reply.status_code == 200


def test_completions(client, model_card):
request = CompletionRequest(
prompt="The Boston Bruins are ...", max_tokens=50, model=model_card.id
Expand Down