diff --git a/.github/workflows/test-check.yaml b/.github/workflows/test-check.yaml
index 18d71fd0ac..91068101db 100644
--- a/.github/workflows/test-check.yaml
+++ b/.github/workflows/test-check.yaml
@@ -45,7 +45,7 @@ jobs:
       - name: "Clean sparsezoo directory"
         run: rm -r sparsezoo/
       - name: ⚙️ Install dependencies
-        run: pip install .[dev,server,image_classification,transformers,clip] opencv-python
+        run: pip install .[dev,server,image_classification,transformers,clip]
       - name: Run base tests
         run: make test
   cli-smoke-tests:
diff --git a/setup.py b/setup.py
index abaae06a33..aece6c0ab0 100644
--- a/setup.py
+++ b/setup.py
@@ -144,11 +144,10 @@ def _parse_requirements_file(file_path):
 ]
 _yolov8_integration_deps = _computer_vision_deps + ["ultralytics==8.0.124"]
 _transformers_integration_deps = [
-    "transformers<4.35",
-    "datasets<=2.14.6",
+    "transformers<4.37",
+    "datasets<2.16",
+    "accelerate<0.26",
     "scikit-learn",
-    "fschat==0.2.33",
-    "accelerate==0.24.1",
     "seqeval",
 ]
 _sentence_transformers_integration_deps = ["optimum-deepsparse"] + _torch_deps
diff --git a/src/deepsparse/server/openai_server.py b/src/deepsparse/server/openai_server.py
index e4003ed3d9..dab4650a74 100644
--- a/src/deepsparse/server/openai_server.py
+++ b/src/deepsparse/server/openai_server.py
@@ -57,6 +57,28 @@
 }
 
 
+def apply_chatml_chat_template(
+    messages: List[Dict[str, str]], add_generation_prompt: bool = False
+) -> str:
+    """
+    Format chat messages as a ChatML prompt. Used as the default when the
+    tokenizer does not provide `apply_chat_template`.
+    https://github.com/openai/openai-python/blob/release-v0.28.1/chatml.md
+
+    :param messages: list of `{"role": "", "content": ""}` dictionaries
+    :param add_generation_prompt: if True, end the prompt with the opening of
+        an assistant turn so that the model generates the assistant reply
+    :return: the formatted prompt string
+    """
+    prompt = "".join(
+        f"<|im_start|>{message['role']}\n{message['content']}<|im_end|>\n"
+        for message in messages
+    )
+    if add_generation_prompt:
+        prompt += "<|im_start|>assistant\n"
+    return prompt
+
+
 class OpenAIServer(Server):
     def __init__(self, **kwargs):
         self.model_list = ModelList()
@@ -93,50 +115,48 @@ async def create_chat_completion(raw_request: Request):
             request = ChatCompletionRequest(**await raw_request.json())
             _LOGGER.debug("Received chat completion request %s" % request)
 
-            if isinstance(request.messages, str):
-                prompt = request.messages
-            else:
-                # else case assumes a FastChat-compliant dictionary
-                # Fetch a model-specific template from FastChat
-                try:
-                    from fastchat.model.model_adapter import get_conversation_template
-                except ImportError as e:
-                    return create_error_response(
-                        HTTPStatus.FAILED_DEPENDENCY,
-                        f"{str(e)} - Please ensure `fastchat` is installed.",
-                    )
-
-                conv = get_conversation_template(request.model)
-                messages = request.messages
-                messages = messages if isinstance(messages, list) else [messages]
-                # add the model to the Conversation template, based on the given role
-                for message in messages:
-                    msg_role = message["role"]
-                    if msg_role == "system":
-                        conv.system_message = message["content"]
-                    elif msg_role == "user":
-                        conv.append_message(conv.roles[0], message["content"])
-                    elif msg_role == "assistant":
-                        conv.append_message(conv.roles[1], message["content"])
-                    else:
-                        return create_error_response(
-                            HTTPStatus.BAD_REQUEST, "Message role not recognized"
-                        )
-
-                # blank message to start generation
-                conv.append_message(conv.roles[1], None)
-                prompt = conv.get_prompt()
-
-            request_id = f"cmpl-{random_uuid()}"
-            created_time = int(time.time())
             model = request.model
-
             pipeline = app.model_to_pipeline.get(model)
             if not pipeline:
                 return create_error_response(
-                    HTTPStatus.BAD_REQUEST, f"{model} is not available"
+                    HTTPStatus.BAD_REQUEST,
+                    f"The model `{model}` does not exist.",
                 )
 
+            messages = request.messages
+            # For chat templating, the message needs to be formatted
+            # as a list of dictionaries of `{"role": "", "content": ""}`
+            # https://huggingface.co/docs/transformers/chat_templating
+            if isinstance(messages, str):
+                messages = [{"role": "user", "content": messages}]
+            elif isinstance(messages, dict):
+                messages = [messages]
+
+            try:
+                if hasattr(pipeline.tokenizer, "apply_chat_template"):
+                    prompt = pipeline.tokenizer.apply_chat_template(
+                        conversation=messages,
+                        add_generation_prompt=request.add_generation_prompt,
+                        tokenize=False,
+                    )
+                else:
+                    # tokenizer.apply_chat_template requires Transformers>=4.34, so
+                    # if it is not available, default to standard chatml
+                    _LOGGER.warning(
+                        "Cannot use tokenizer.apply_chat_template, please update to "
+                        "transformers>=4.34 for best chat results. Defaulting to ChatML"
+                    )
+                    prompt = apply_chatml_chat_template(
+                        messages=messages,
+                        add_generation_prompt=request.add_generation_prompt,
+                    )
+            except Exception as e:
+                _LOGGER.error(f"Error in applying chat template from request: {str(e)}")
+                return create_error_response(HTTPStatus.BAD_REQUEST, str(e))
+
+            request_id = f"cmpl-{random_uuid()}"
+            created_time = int(time.time())
+
             try:
                 sampling_params = dict(
                     presence_penalty=request.presence_penalty,
@@ -222,7 +242,8 @@ async def create_completion(raw_request: Request):
             pipeline = app.model_to_pipeline.get(model)
             if not pipeline:
                 return create_error_response(
-                    HTTPStatus.BAD_REQUEST, f"{model} is not available"
+                    HTTPStatus.BAD_REQUEST,
+                    f"The model `{model}` does not exist.",
                 )
 
             request_id = f"cmpl-{random_uuid()}"
diff --git a/src/deepsparse/server/protocol.py b/src/deepsparse/server/protocol.py
index b18141be0f..8d827ebd91 100644
--- a/src/deepsparse/server/protocol.py
+++ b/src/deepsparse/server/protocol.py
@@ -123,6 +123,7 @@ class ChatCompletionRequest(BaseModel):
     top_k: Optional[int] = -1
     ignore_eos: Optional[bool] = False
     use_beam_search: Optional[bool] = False
+    add_generation_prompt: Optional[bool] = True
 
 
 class CompletionRequest(BaseModel):
diff --git a/src/deepsparse/transformers/pipelines/text_generation/autoregressive_preprocess_operator.py b/src/deepsparse/transformers/pipelines/text_generation/autoregressive_preprocess_operator.py
index 9fb17f3946..a2ee2dcea3 100644
--- a/src/deepsparse/transformers/pipelines/text_generation/autoregressive_preprocess_operator.py
+++ b/src/deepsparse/transformers/pipelines/text_generation/autoregressive_preprocess_operator.py
@@ -35,7 +35,7 @@ def __init__(self, sequence_length: int, prompt_sequence_length: int):
         self.sequence_length = sequence_length
         self.prompt_sequence_length = prompt_sequence_length
 
-        _LOGGER.warn(
+        _LOGGER.info(
             "This operator requires the PipelineState to be set-up with the "
             "onnx_input_names_no_cache attribute set from the NLEngineOperator."
         )
diff --git a/src/deepsparse/transformers/pipelines/text_generation/prep_for_prefill.py b/src/deepsparse/transformers/pipelines/text_generation/prep_for_prefill.py
index 47b4965daf..b15903ac14 100644
--- a/src/deepsparse/transformers/pipelines/text_generation/prep_for_prefill.py
+++ b/src/deepsparse/transformers/pipelines/text_generation/prep_for_prefill.py
@@ -36,7 +36,7 @@ def __init__(self, kv_cache_creator: Operator):
         # instead of at the pipeline level.
         self.kv_cache_creator = kv_cache_creator
 
-        _LOGGER.warn(
+        _LOGGER.info(
             "This operator requires the PipelineState to be set-up with the "
             "cache_shape, output_names, kv_cache_data_type attributes to be set "
             "from the NLEngineOperator"
diff --git a/tests/server/test_openai.py b/tests/server/test_openai.py
index 09803d9672..9d023c2331 100644
--- a/tests/server/test_openai.py
+++ b/tests/server/test_openai.py
@@ -89,40 +89,86 @@ def test_get_models(client, model_card):
     assert response.json().get("data")[0][-1] == model_card.id
 
 
-def test_chat_completions(client, model_card):
+def test_chat_completions_string(client, model_card):
+    max_tokens = 15
     request = ChatCompletionRequest(
-        messages="How is the weather in Boston?", max_tokens=50, model=model_card.id
+        messages="How is the weather in Boston?",
+        max_tokens=max_tokens,
+        model=model_card.id,
     )
     response = client.post("/v1/chat/completions", json=request.dict())
     assert response.status_code == 200
 
+    usage = response.json()["usage"]
+    assert usage["completion_tokens"] == max_tokens
+    assert usage["total_tokens"] == usage["prompt_tokens"] + usage["completion_tokens"]
+
 
-def test_chat_completions_fastchat(client, model_card):
+def test_chat_completions_dict(client, model_card):
+    max_tokens = 15
     request = ChatCompletionRequest(
-        messages={"role": "user", "content": "Give me banana bread recipe."},
-        max_tokens=50,
+        messages={"role": "user", "content": "How is the weather in Boston?"},
+        max_tokens=max_tokens,
         model=model_card.id,
     )
     response = client.post("/v1/chat/completions", json=request.dict())
     assert response.status_code == 200
 
+    usage = response.json()["usage"]
+    assert usage["completion_tokens"] == max_tokens
+    assert usage["total_tokens"] == usage["prompt_tokens"] + usage["completion_tokens"]
 
-def test_chat_completions_fastchat_list(client, model_card):
+
+def test_chat_completions_list(client, model_card):
+    max_tokens = 15
+    request = ChatCompletionRequest(
+        messages=[{"role": "user", "content": "How is the weather in Boston?"}],
+        max_tokens=max_tokens,
+        model=model_card.id,
+    )
+    response = client.post("/v1/chat/completions", json=request.dict())
+    assert response.status_code == 200
+
+    usage = response.json()["usage"]
+    assert usage["completion_tokens"] == max_tokens
+    assert usage["total_tokens"] == usage["prompt_tokens"] + usage["completion_tokens"]
+
+
+def test_chat_completions_multiturn(client, model_card):
+    max_tokens = 20
     request = ChatCompletionRequest(
         messages=[
             {"role": "system", "content": "You are a helpful assistant."},
             {"role": "user", "content": "Hello!"},
+            {"role": "assistant", "content": "Hi back!"},
+            {"role": "user", "content": "I like talking with you."},
         ],
-        max_tokens=50,
+        max_tokens=max_tokens,
         model=model_card.id,
     )
     response = client.post("/v1/chat/completions", json=request.dict())
     assert response.status_code == 200
 
+    usage = response.json()["usage"]
+    assert usage["completion_tokens"] == max_tokens
+    assert usage["total_tokens"] == usage["prompt_tokens"] + usage["completion_tokens"]
+
 
 def test_completions(client, model_card):
+    max_tokens = 30
     request = CompletionRequest(
-        prompt="The Boston Bruins are ...", max_tokens=50, model=model_card.id
+        prompt="The Boston Bruins are ", max_tokens=max_tokens, model=model_card.id
     )
     response = client.post("/v1/completions", json=request.dict())
     assert response.status_code == 200
+
+    usage = response.json()["usage"]
+    assert usage["prompt_tokens"] == 5
+    assert usage["completion_tokens"] == max_tokens
+    assert usage["total_tokens"] == usage["prompt_tokens"] + usage["completion_tokens"]
+
+    assert (
+        response.json()["choices"][0]["text"]
+        == 'a was very happy and thanked the man. He said, "Thank you, Sara. You are a '
+        + 'good friend."\n\nSara smiled and'
+    )