Skip to content
This repository was archived by the owner on Jun 3, 2025. It is now read-only.
Merged
2 changes: 1 addition & 1 deletion .github/workflows/test-check.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ jobs:
- name: "Clean sparsezoo directory"
run: rm -r sparsezoo/
- name: ⚙️ Install dependencies
run: pip install .[dev,server,image_classification,transformers,clip] opencv-python
run: pip install .[dev,server,image_classification,transformers,clip]
- name: Run base tests
run: make test
cli-smoke-tests:
Expand Down
7 changes: 3 additions & 4 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -144,11 +144,10 @@ def _parse_requirements_file(file_path):
]
_yolov8_integration_deps = _computer_vision_deps + ["ultralytics==8.0.124"]
_transformers_integration_deps = [
"transformers<4.35",
"datasets<=2.14.6",
"transformers<4.37",
"datasets<2.16",
"accelerate<0.26",
"scikit-learn",
"fschat==0.2.33",
"accelerate==0.24.1",
"seqeval",
]
_sentence_transformers_integration_deps = ["optimum-deepsparse"] + _torch_deps
Expand Down
83 changes: 44 additions & 39 deletions src/deepsparse/server/openai_server.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,15 @@
}


def apply_chatml_chat_template(messages: List[Dict[str, str]]) -> str:
    """Render a list of chat messages as a single ChatML prompt string.

    Each message is emitted as ``<|im_start|>``, the message's role, a
    newline, the message's content, ``<|im_end|>`` and a trailing newline.
    Messages are concatenated in the order given. No trailing assistant
    header is appended after the last message.

    :param messages: iterable of mappings, each providing a ``"role"`` and a
        ``"content"`` entry
    :return: the concatenated ChatML prompt; an empty string when
        ``messages`` is empty
    :raises KeyError: if a message dictionary lacks ``"role"`` or
        ``"content"``
    """
    # When there is no chat template available, use ChatML as the default
    # https://github.com/openai/openai-python/blob/release-v0.28.1/chatml.md
    # Build with a single join rather than repeated `+=` so the cost stays
    # linear in the total prompt length.
    return "".join(
        f"<|im_start|>{message['role']}\n{message['content']}<|im_end|>\n"
        for message in messages
    )


class OpenAIServer(Server):
def __init__(self, **kwargs):
self.model_list = ModelList()
Expand Down Expand Up @@ -93,50 +102,45 @@ async def create_chat_completion(raw_request: Request):
request = ChatCompletionRequest(**await raw_request.json())
_LOGGER.debug("Received chat completion request %s" % request)

if isinstance(request.messages, str):
prompt = request.messages
else:
# else case assumes a FastChat-compliant dictionary
# Fetch a model-specific template from FastChat
try:
from fastchat.model.model_adapter import get_conversation_template
except ImportError as e:
return create_error_response(
HTTPStatus.FAILED_DEPENDENCY,
f"{str(e)} - Please ensure `fastchat` is installed.",
)

conv = get_conversation_template(request.model)
messages = request.messages
messages = messages if isinstance(messages, list) else [messages]
# add the model to the Conversation template, based on the given role
for message in messages:
msg_role = message["role"]
if msg_role == "system":
conv.system_message = message["content"]
elif msg_role == "user":
conv.append_message(conv.roles[0], message["content"])
elif msg_role == "assistant":
conv.append_message(conv.roles[1], message["content"])
else:
return create_error_response(
HTTPStatus.BAD_REQUEST, "Message role not recognized"
)

# blank message to start generation
conv.append_message(conv.roles[1], None)
prompt = conv.get_prompt()

request_id = f"cmpl-{random_uuid()}"
created_time = int(time.time())
model = request.model

pipeline = app.model_to_pipeline.get(model)
if not pipeline:
return create_error_response(
HTTPStatus.BAD_REQUEST, f"{model} is not available"
HTTPStatus.BAD_REQUEST,
f"The model `{model}` does not exist.",
)

messages = request.messages
# For chat templating, the message needs to be formatted
# as a list of dictionaries of `{"role": "", "content": ""}`
# https://huggingface.co/docs/transformers/chat_templating
if isinstance(messages, str):
messages = [{"role": "user", "content": messages}]
elif isinstance(messages, dict):
messages = [messages]

try:
if hasattr(pipeline.tokenizer, "apply_chat_template"):
prompt = pipeline.tokenizer.apply_chat_template(
conversation=messages,
add_generation_prompt=request.add_generation_prompt,
tokenize=False,
)
else:
# tokenizer.apply_chat_template requires Transformers>=4.34, so
# if it is not available, default to standard chatml
_LOGGER.warning(
"Cannot use tokenizer.apply_chat_template, please update to "
"transformers>=4.34 for best chat results. Defaulting to ChatML"
)
prompt = apply_chatml_chat_template(messages=messages)
except Exception as e:
_LOGGER.error(f"Error in applying chat template from request: {str(e)}")
return create_error_response(HTTPStatus.BAD_REQUEST, str(e))

request_id = f"cmpl-{random_uuid()}"
created_time = int(time.time())

try:
sampling_params = dict(
presence_penalty=request.presence_penalty,
Expand Down Expand Up @@ -222,7 +226,8 @@ async def create_completion(raw_request: Request):
pipeline = app.model_to_pipeline.get(model)
if not pipeline:
return create_error_response(
HTTPStatus.BAD_REQUEST, f"{model} is not available"
HTTPStatus.BAD_REQUEST,
f"The model `{model}` does not exist.",
)

request_id = f"cmpl-{random_uuid()}"
Expand Down
1 change: 1 addition & 0 deletions src/deepsparse/server/protocol.py
Original file line number Diff line number Diff line change
Expand Up @@ -123,6 +123,7 @@ class ChatCompletionRequest(BaseModel):
top_k: Optional[int] = -1
ignore_eos: Optional[bool] = False
use_beam_search: Optional[bool] = False
add_generation_prompt: Optional[bool] = True


class CompletionRequest(BaseModel):
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ def __init__(self, sequence_length: int, prompt_sequence_length: int):
self.sequence_length = sequence_length
self.prompt_sequence_length = prompt_sequence_length

_LOGGER.warn(
_LOGGER.info(
"This operator requires the PipelineState to be set-up with the "
"onnx_input_names_no_cache attribute set from the NLEngineOperator."
)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ def __init__(self, kv_cache_creator: Operator):
# instead of at the pipeline level.
self.kv_cache_creator = kv_cache_creator

_LOGGER.warn(
_LOGGER.info(
"This operator requires the PipelineState to be set-up with the "
"cache_shape, output_names, kv_cache_data_type attributes to be set "
"from the NLEngineOperator"
Expand Down
62 changes: 54 additions & 8 deletions tests/server/test_openai.py
Original file line number Diff line number Diff line change
Expand Up @@ -89,40 +89,86 @@ def test_get_models(client, model_card):
assert response.json().get("data")[0][-1] == model_card.id


def test_chat_completions(client, model_card):
def test_chat_completions_string(client, model_card):
max_tokens = 15
request = ChatCompletionRequest(
messages="How is the weather in Boston?", max_tokens=50, model=model_card.id
messages="How is the weather in Boston?",
max_tokens=max_tokens,
model=model_card.id,
)
response = client.post("/v1/chat/completions", json=request.dict())
assert response.status_code == 200

usage = response.json()["usage"]
assert usage["completion_tokens"] == max_tokens
assert usage["total_tokens"] == usage["prompt_tokens"] + usage["completion_tokens"]


def test_chat_completions_fastchat(client, model_card):
def test_chat_completions_dict(client, model_card):
max_tokens = 15
request = ChatCompletionRequest(
messages={"role": "user", "content": "Give me banana bread recipe."},
max_tokens=50,
messages={"role": "user", "content": "How is the weather in Boston?"},
max_tokens=max_tokens,
model=model_card.id,
)
response = client.post("/v1/chat/completions", json=request.dict())
assert response.status_code == 200

usage = response.json()["usage"]
assert usage["completion_tokens"] == max_tokens
assert usage["total_tokens"] == usage["prompt_tokens"] + usage["completion_tokens"]

def test_chat_completions_fastchat_list(client, model_card):

def test_chat_completions_list(client, model_card):
max_tokens = 15
request = ChatCompletionRequest(
messages=[{"role": "user", "content": "How is the weather in Boston?"}],
max_tokens=max_tokens,
model=model_card.id,
)
response = client.post("/v1/chat/completions", json=request.dict())
assert response.status_code == 200

usage = response.json()["usage"]
assert usage["completion_tokens"] == max_tokens
assert usage["total_tokens"] == usage["prompt_tokens"] + usage["completion_tokens"]


def test_chat_completions_multiturn(client, model_card):
max_tokens = 20
request = ChatCompletionRequest(
messages=[
{"role": "system", "content": "You are a helpful assistant."},
{"role": "user", "content": "Hello!"},
{"role": "assistant", "content": "Hi back!"},
{"role": "user", "content": "I like talking with you."},
],
max_tokens=50,
max_tokens=max_tokens,
model=model_card.id,
)
response = client.post("/v1/chat/completions", json=request.dict())
assert response.status_code == 200

usage = response.json()["usage"]
assert usage["completion_tokens"] == max_tokens
assert usage["total_tokens"] == usage["prompt_tokens"] + usage["completion_tokens"]


def test_completions(client, model_card):
max_tokens = 30
request = CompletionRequest(
prompt="The Boston Bruins are ...", max_tokens=50, model=model_card.id
prompt="The Boston Bruins are ", max_tokens=max_tokens, model=model_card.id
)
response = client.post("/v1/completions", json=request.dict())
assert response.status_code == 200

usage = response.json()["usage"]
assert usage["prompt_tokens"] == 5
assert usage["completion_tokens"] == max_tokens
assert usage["total_tokens"] == usage["prompt_tokens"] + usage["completion_tokens"]

assert (
response.json()["choices"][0]["text"]
== 'a was very happy and thanked the man. He said, "Thank you, Sara. You are a '
+ 'good friend."\n\nSara smiled and'
)