From e547610f8a148d35183a12b4af965c8e69594509 Mon Sep 17 00:00:00 2001 From: mgoin Date: Thu, 21 Dec 2023 17:42:34 +0000 Subject: [PATCH 01/12] Let OpenAI ChatCompletionRequest accept List[Dict] messages --- src/deepsparse/server/protocol.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/deepsparse/server/protocol.py b/src/deepsparse/server/protocol.py index fcddb3e940..b18141be0f 100644 --- a/src/deepsparse/server/protocol.py +++ b/src/deepsparse/server/protocol.py @@ -107,7 +107,7 @@ class ChatCompletionRequest(BaseModel): """ model: Optional[str] = None - messages: Union[str, Dict[str, str]] + messages: Union[str, List[Dict[str, str]], Dict[str, str]] temperature: Optional[float] = 0.7 top_p: Optional[float] = 1.0 n: Optional[int] = 1 From 83a772c0622e4faa3c59eb8750cc4a2b0a1e30a0 Mon Sep 17 00:00:00 2001 From: mgoin Date: Thu, 21 Dec 2023 17:45:19 +0000 Subject: [PATCH 02/12] Test --- tests/server/test_openai.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/tests/server/test_openai.py b/tests/server/test_openai.py index d479ef59e0..09803d9672 100644 --- a/tests/server/test_openai.py +++ b/tests/server/test_openai.py @@ -107,6 +107,19 @@ def test_chat_completions_fastchat(client, model_card): assert response.status_code == 200 +def test_chat_completions_fastchat_list(client, model_card): + request = ChatCompletionRequest( + messages=[ + {"role": "system", "content": "You are a helpful assistant."}, + {"role": "user", "content": "Hello!"}, + ], + max_tokens=50, + model=model_card.id, + ) + response = client.post("/v1/chat/completions", json=request.dict()) + assert response.status_code == 200 + + def test_completions(client, model_card): request = CompletionRequest( prompt="The Boston Bruins are ...", max_tokens=50, model=model_card.id From 0414a5a5eeacc86f62fcf43bdafb64a01b6abbd8 Mon Sep 17 00:00:00 2001 From: mgoin Date: Thu, 21 Dec 2023 17:58:35 +0000 Subject: [PATCH 03/12] Fix list parsing when building template --- src/deepsparse/server/openai_server.py | 28 ++++++++++++++------------ 1 file changed, 15 insertions(+), 13 deletions(-) diff --git a/src/deepsparse/server/openai_server.py b/src/deepsparse/server/openai_server.py index 5928c58f81..a2e671ef9b 100644 --- a/src/deepsparse/server/openai_server.py +++ b/src/deepsparse/server/openai_server.py @@ -108,19 +108,21 @@ async def create_chat_completion(raw_request: Request): return create_error_response(HTTPStatus.FAILED_DEPENDENCY, str(e)) conv = get_conversation_template(request.model) - message = request.messages - # add the model to the Conversation template, based on the given role - msg_role = message["role"] - if msg_role == "system": - conv.system_message = message["content"] - elif msg_role == "user": - conv.append_message(conv.roles[0], message["content"]) - elif msg_role == "assistant": - conv.append_message(conv.roles[1], message["content"]) - else: - return create_error_response( - HTTPStatus.BAD_REQUEST, "Message role not recognized" - ) + messages = request.messages + messages = messages if isinstance(messages, list) else [messages] + for message in messages: + # add the model to the Conversation template, based on the given role + msg_role = message["role"] + if msg_role == "system": + conv.system_message = message["content"] + elif msg_role == "user": + conv.append_message(conv.roles[0], message["content"]) + elif msg_role == "assistant": + conv.append_message(conv.roles[1], message["content"]) + else: + return create_error_response( + HTTPStatus.BAD_REQUEST, "Message role not recognized" + ) # blank message to start generation conv.append_message(conv.roles[1], None) From 646a640ce60b41d728e9df559e0d5f6ace6bd361 Mon Sep 17 00:00:00 2001 From: mgoin Date: Thu, 21 Dec 2023 18:15:54 +0000 Subject: [PATCH 04/12] More error messages --- src/deepsparse/server/openai_server.py | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/src/deepsparse/server/openai_server.py b/src/deepsparse/server/openai_server.py index a2e671ef9b..e4003ed3d9 100644 --- a/src/deepsparse/server/openai_server.py +++ b/src/deepsparse/server/openai_server.py @@ -96,22 +96,21 @@ async def create_chat_completion(raw_request: Request): if isinstance(request.messages, str): prompt = request.messages else: - # else case assums a FastChat-compliant dictionary + # else case assumes a FastChat-compliant dictionary # Fetch a model-specific template from FastChat - _LOGGER.warning( - "A dictionary message was found. This dictionary must " - "be fastchat compliant." - ) try: from fastchat.model.model_adapter import get_conversation_template except ImportError as e: - return create_error_response(HTTPStatus.FAILED_DEPENDENCY, str(e)) + return create_error_response( + HTTPStatus.FAILED_DEPENDENCY, + f"{str(e)} - Please ensure `fastchat` is installed.", + ) conv = get_conversation_template(request.model) messages = request.messages messages = messages if isinstance(messages, list) else [messages] + # add the model to the Conversation template, based on the given role for message in messages: - # add the model to the Conversation template, based on the given role msg_role = message["role"] if msg_role == "system": conv.system_message = message["content"] From 7581e25278da7e4ff6e3a19cb5bf66e033ef3f39 Mon Sep 17 00:00:00 2001 From: mgoin Date: Thu, 21 Dec 2023 19:01:54 +0000 Subject: [PATCH 05/12] Replace Fastchat Chat Templates with HF --- setup.py | 5 +-- src/deepsparse/server/openai_server.py | 56 ++++++++------------------ src/deepsparse/server/protocol.py | 1 + 3 files changed, 20 insertions(+), 42 deletions(-) diff --git a/setup.py b/setup.py index abaae06a33..2a302f1aa9 100644 --- a/setup.py +++ b/setup.py @@ -145,10 +145,9 @@ def _parse_requirements_file(file_path): _yolov8_integration_deps = _computer_vision_deps + ["ultralytics==8.0.124"] _transformers_integration_deps = [ "transformers<4.35", - "datasets<=2.14.6", + "datasets<2.15", "scikit-learn", - "fschat==0.2.33", - "accelerate==0.24.1", + "accelerate<0.25", "seqeval", ] _sentence_transformers_integration_deps = ["optimum-deepsparse"] + _torch_deps diff --git a/src/deepsparse/server/openai_server.py b/src/deepsparse/server/openai_server.py index e4003ed3d9..b9e1cfa760 100644 --- a/src/deepsparse/server/openai_server.py +++ b/src/deepsparse/server/openai_server.py @@ -93,49 +93,26 @@ async def create_chat_completion(raw_request: Request): request = ChatCompletionRequest(**await raw_request.json()) _LOGGER.debug("Received chat completion request %s" % request) - if isinstance(request.messages, str): - prompt = request.messages - else: - # else case assumes a FastChat-compliant dictionary - # Fetch a model-specific template from FastChat - try: - from fastchat.model.model_adapter import get_conversation_template - except ImportError as e: - return create_error_response( - HTTPStatus.FAILED_DEPENDENCY, - f"{str(e)} - Please ensure `fastchat` is installed.", - ) - - conv = get_conversation_template(request.model) - messages = request.messages - messages = messages if isinstance(messages, list) else [messages] - # add the model to the Conversation template, based on the given role - for message in messages: - msg_role = message["role"] - if msg_role == "system": - conv.system_message = message["content"] - elif msg_role == "user": - conv.append_message(conv.roles[0], message["content"]) - elif msg_role == "assistant": - conv.append_message(conv.roles[1], message["content"]) - else: - return create_error_response( - HTTPStatus.BAD_REQUEST, "Message role not recognized" - ) - - # blank message to start generation - conv.append_message(conv.roles[1], None) - prompt = conv.get_prompt() - - request_id = f"cmpl-{random_uuid()}" - created_time = int(time.time()) model = request.model - pipeline = app.model_to_pipeline.get(model) if not pipeline: return create_error_response( - HTTPStatus.BAD_REQUEST, f"{model} is not available" + HTTPStatus.BAD_REQUEST, + f"The model `{model}` does not exist.", + ) + + try: + prompt = pipeline.tokenizer.apply_chat_template( + conversation=request.messages, + add_generation_prompt=request.add_generation_prompt, + tokenize=False, ) + except Exception as e: + _LOGGER.error(f"Error in applying chat template from request: {str(e)}") + return create_error_response(HTTPStatus.BAD_REQUEST, str(e)) + + request_id = f"cmpl-{random_uuid()}" + created_time = int(time.time()) try: sampling_params = dict( @@ -222,7 +199,8 @@ async def create_completion(raw_request: Request): pipeline = app.model_to_pipeline.get(model) if not pipeline: return create_error_response( - HTTPStatus.BAD_REQUEST, f"{model} is not available" + HTTPStatus.BAD_REQUEST, + f"The model `{model}` does not exist.", ) request_id = f"cmpl-{random_uuid()}" diff --git a/src/deepsparse/server/protocol.py b/src/deepsparse/server/protocol.py index b18141be0f..8d827ebd91 100644 --- a/src/deepsparse/server/protocol.py +++ b/src/deepsparse/server/protocol.py @@ -123,6 +123,7 @@ class ChatCompletionRequest(BaseModel): top_k: Optional[int] = -1 ignore_eos: Optional[bool] = False use_beam_search: Optional[bool] = False + add_generation_prompt: Optional[bool] = True class CompletionRequest(BaseModel): From 56dc6db1abe644441c33cd1cbc315746be5dbf64 Mon Sep 17 00:00:00 2001 From: Michael Goin Date: Thu, 21 Dec 2023 14:24:46 -0500 Subject: [PATCH 06/12] Update setup.py --- setup.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/setup.py b/setup.py index 2a302f1aa9..aece6c0ab0 100644 --- a/setup.py +++ b/setup.py @@ -144,10 +144,10 @@ def _parse_requirements_file(file_path): ] _yolov8_integration_deps = _computer_vision_deps + ["ultralytics==8.0.124"] _transformers_integration_deps = [ - "transformers<4.35", - "datasets<2.15", + "transformers<4.37", + "datasets<2.16", + "accelerate<0.26", "scikit-learn", - "accelerate<0.25", "seqeval", ] _sentence_transformers_integration_deps = ["optimum-deepsparse"] + _torch_deps From 34ad42bcc80af4c4fa66912f614a18d377ba230d Mon Sep 17 00:00:00 2001 From: mgoin Date: Thu, 21 Dec 2023 20:26:56 +0000 Subject: [PATCH 07/12] Fix non-lists and add testing --- src/deepsparse/server/openai_server.py | 11 +++- tests/server/test_openai.py | 69 +++++++++++++++++++++++--- 2 files changed, 71 insertions(+), 9 deletions(-) diff --git a/src/deepsparse/server/openai_server.py b/src/deepsparse/server/openai_server.py index b9e1cfa760..e3a455fbdb 100644 --- a/src/deepsparse/server/openai_server.py +++ b/src/deepsparse/server/openai_server.py @@ -102,8 +102,17 @@ async def create_chat_completion(raw_request: Request): ) try: + messages = request.messages + # For chat templating, the message needs to be formatted + # as a list of dictionaries of `{"role": "", "content": ""}` + # https://huggingface.co/docs/transformers/chat_templating + if isinstance(messages, str): + messages = [{"role": "user", "content": messages}] + elif isinstance(messages, dict): + messages = [messages] + prompt = pipeline.tokenizer.apply_chat_template( - conversation=request.messages, + conversation=messages, add_generation_prompt=request.add_generation_prompt, tokenize=False, ) diff --git a/tests/server/test_openai.py b/tests/server/test_openai.py index 09803d9672..5f3ac2220b 100644 --- a/tests/server/test_openai.py +++ b/tests/server/test_openai.py @@ -89,40 +89,93 @@ def test_get_models(client, model_card): assert response.json().get("data")[0][-1] == model_card.id -def test_chat_completions(client, model_card): +def test_chat_completions_string(client, model_card): + max_tokens = 15 request = ChatCompletionRequest( - messages="How is the weather in Boston?", max_tokens=50, model=model_card.id + messages="How is the weather in Boston?", + max_tokens=max_tokens, + model=model_card.id, + ) + response = client.post("/v1/chat/completions", json=request.dict()) + assert response.status_code == 200 + + usage = response.json()["usage"] + assert usage["prompt_tokens"] == 8 + assert usage["completion_tokens"] == max_tokens + assert usage["total_tokens"] == usage["prompt_tokens"] + usage["completion_tokens"] + + message = response.json()["choices"][0]["message"] + assert message["content"] == "\n\n\nPossible story:\n\nLily and Ben were playing in" + + +def test_chat_completions_dict(client, model_card): + max_tokens = 15 + request = ChatCompletionRequest( + messages={"role": "user", "content": "How is the weather in Boston?"}, + max_tokens=max_tokens, + model=model_card.id, ) response = client.post("/v1/chat/completions", json=request.dict()) assert response.status_code == 200 + usage = response.json()["usage"] + assert usage["prompt_tokens"] == 8 + assert usage["completion_tokens"] == max_tokens + assert usage["total_tokens"] == usage["prompt_tokens"] + usage["completion_tokens"] + + message = response.json()["choices"][0]["message"] + assert message["content"] == "\n\n\nPossible story:\n\nLily and Ben were playing in" + -def test_chat_completions_fastchat(client, model_card): +def test_chat_completions_list(client, model_card): + max_tokens = 15 request = ChatCompletionRequest( - messages={"role": "user", "content": "Give me banana bread recipe."}, - max_tokens=50, + messages=[{"role": "user", "content": "How is the weather in Boston?"}], + max_tokens=max_tokens, model=model_card.id, ) response = client.post("/v1/chat/completions", json=request.dict()) assert response.status_code == 200 + usage = response.json()["usage"] + assert usage["prompt_tokens"] == 8 + assert usage["completion_tokens"] == max_tokens + assert usage["total_tokens"] == usage["prompt_tokens"] + usage["completion_tokens"] -def test_chat_completions_fastchat_list(client, model_card): + message = response.json()["choices"][0]["message"] + assert message["content"] == "\n\n\nPossible story:\n\nLily and Ben were playing in" + + +def test_chat_completions_multiturn(client, model_card): + max_tokens = 20 request = ChatCompletionRequest( messages=[ {"role": "system", "content": "You are a helpful assistant."}, {"role": "user", "content": "Hello!"}, + {"role": "assistant", "content": "Hi back!"}, + {"role": "user", "content": "I like talking with you."}, ], - max_tokens=50, + max_tokens=max_tokens, model=model_card.id, ) response = client.post("/v1/chat/completions", json=request.dict()) assert response.status_code == 200 + usage = response.json()["usage"] + assert usage["prompt_tokens"] == 21 + assert usage["completion_tokens"] == max_tokens + assert usage["total_tokens"] == usage["prompt_tokens"] + usage["completion_tokens"] + def test_completions(client, model_card): + max_tokens = 30 request = CompletionRequest( - prompt="The Boston Bruins are ...", max_tokens=50, model=model_card.id + prompt="The Boston Bruins are ", max_tokens=max_tokens, model=model_card.id ) response = client.post("/v1/completions", json=request.dict()) assert response.status_code == 200 + + usage = response.json()["usage"] + assert usage["prompt_tokens"] == 5 + assert usage["completion_tokens"] == max_tokens + assert usage["total_tokens"] == usage["prompt_tokens"] + usage["completion_tokens"] From 1e72392fa327eb71bf7db7d6ecef0f63e7b79efb Mon Sep 17 00:00:00 2001 From: mgoin Date: Tue, 26 Dec 2023 16:37:10 +0000 Subject: [PATCH 08/12] Cleanup warn->info --- .../text_generation/autoregressive_preprocess_operator.py | 2 +- .../transformers/pipelines/text_generation/prep_for_prefill.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/deepsparse/transformers/pipelines/text_generation/autoregressive_preprocess_operator.py b/src/deepsparse/transformers/pipelines/text_generation/autoregressive_preprocess_operator.py index 9fb17f3946..a2ee2dcea3 100644 --- a/src/deepsparse/transformers/pipelines/text_generation/autoregressive_preprocess_operator.py +++ b/src/deepsparse/transformers/pipelines/text_generation/autoregressive_preprocess_operator.py @@ -35,7 +35,7 @@ def __init__(self, sequence_length: int, prompt_sequence_length: int): self.sequence_length = sequence_length self.prompt_sequence_length = prompt_sequence_length - _LOGGER.warn( + _LOGGER.info( "This operator requires the PipelineState to be set-up with the " "onnx_input_names_no_cache attribute set from the NLEngineOperator." ) diff --git a/src/deepsparse/transformers/pipelines/text_generation/prep_for_prefill.py b/src/deepsparse/transformers/pipelines/text_generation/prep_for_prefill.py index 47b4965daf..b15903ac14 100644 --- a/src/deepsparse/transformers/pipelines/text_generation/prep_for_prefill.py +++ b/src/deepsparse/transformers/pipelines/text_generation/prep_for_prefill.py @@ -36,7 +36,7 @@ def __init__(self, kv_cache_creator: Operator): # instead of at the pipeline level. self.kv_cache_creator = kv_cache_creator - _LOGGER.warn( + _LOGGER.info( "This operator requires the PipelineState to be set-up with the " "cache_shape, output_names, kv_cache_data_type attributes to be set " "from the NLEngineOperator" From bfa3e5d09beb5b1b6c96982351c78157560ace50 Mon Sep 17 00:00:00 2001 From: Michael Goin Date: Tue, 26 Dec 2023 14:41:42 -0500 Subject: [PATCH 09/12] Update test-check.yaml --- .github/workflows/test-check.yaml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/workflows/test-check.yaml b/.github/workflows/test-check.yaml index 18d71fd0ac..6fcd31d5a1 100644 --- a/.github/workflows/test-check.yaml +++ b/.github/workflows/test-check.yaml @@ -40,6 +40,8 @@ jobs: repository: "neuralmagic/sparsezoo" path: "sparsezoo" ref: ${{needs.test-setup.outputs.branch}} + - name: "Update pip" + run: pip install -U pip - name: "⚙️ Install sparsezoo dependencies" run: pip install sparsezoo/ - name: "Clean sparsezoo directory" From d303e77d3153f7e077ab9e2638cfe39878b01494 Mon Sep 17 00:00:00 2001 From: Michael Goin Date: Wed, 27 Dec 2023 07:10:49 -0800 Subject: [PATCH 10/12] Update test deps --- .github/workflows/test-check.yaml | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/.github/workflows/test-check.yaml b/.github/workflows/test-check.yaml index 6fcd31d5a1..1564ea7990 100644 --- a/.github/workflows/test-check.yaml +++ b/.github/workflows/test-check.yaml @@ -40,14 +40,12 @@ jobs: repository: "neuralmagic/sparsezoo" path: "sparsezoo" ref: ${{needs.test-setup.outputs.branch}} - - name: "Update pip" - run: pip install -U pip - name: "⚙️ Install sparsezoo dependencies" run: pip install sparsezoo/ - name: "Clean sparsezoo directory" run: rm -r sparsezoo/ - name: ⚙️ Install dependencies - run: pip install .[dev,server,image_classification,transformers,clip] opencv-python + run: pip install .[dev,server,image_classification,transformers] - name: Run base tests run: make test cli-smoke-tests: From 8e081f9d7fc31c6101dbbad7d32e32ce154bf4b6 Mon Sep 17 00:00:00 2001 From: Michael Goin Date: Wed, 27 Dec 2023 07:18:38 -0800 Subject: [PATCH 11/12] Update test deps --- .github/workflows/test-check.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/test-check.yaml b/.github/workflows/test-check.yaml index 1564ea7990..91068101db 100644 --- a/.github/workflows/test-check.yaml +++ b/.github/workflows/test-check.yaml @@ -45,7 +45,7 @@ jobs: - name: "Clean sparsezoo directory" run: rm -r sparsezoo/ - name: ⚙️ Install dependencies - run: pip install .[dev,server,image_classification,transformers] + run: pip install .[dev,server,image_classification,transformers,clip] - name: Run base tests run: make test cli-smoke-tests: From 3ef9959df3bef32f88e15a40dad8a13f61ba3fa9 Mon Sep 17 00:00:00 2001 From: mgoin Date: Wed, 27 Dec 2023 17:28:23 +0000 Subject: [PATCH 12/12] Add fallback for chat template as ChatML --- src/deepsparse/server/openai_server.py | 46 ++++++++++++++++++-------- tests/server/test_openai.py | 32 ++++-------------- 2 files changed, 38 insertions(+), 40 deletions(-) diff --git a/src/deepsparse/server/openai_server.py b/src/deepsparse/server/openai_server.py index e3a455fbdb..dab4650a74 100644 --- a/src/deepsparse/server/openai_server.py +++ b/src/deepsparse/server/openai_server.py @@ -57,6 +57,15 @@ } +def apply_chatml_chat_template(messages: List[Dict[str, str]]) -> str: + # When there is no chat template available, use ChatML as the default + # https://github.com/openai/openai-python/blob/release-v0.28.1/chatml.md + prompt = "" + for message in messages: + prompt += f"<|im_start|>{message['role']}\n{message['content']}<|im_end|>\n" + return prompt + + class OpenAIServer(Server): def __init__(self, **kwargs): self.model_list = ModelList() @@ -101,21 +110,30 @@ async def create_chat_completion(raw_request: Request): f"The model `{model}` does not exist.", ) + messages = request.messages + # For chat templating, the message needs to be formatted + # as a list of dictionaries of `{"role": "", "content": ""}` + # https://huggingface.co/docs/transformers/chat_templating + if isinstance(messages, str): + messages = [{"role": "user", "content": messages}] + elif isinstance(messages, dict): + messages = [messages] + try: - messages = request.messages - # For chat templating, the message needs to be formatted - # as a list of dictionaries of `{"role": "", "content": ""}` - # https://huggingface.co/docs/transformers/chat_templating - if isinstance(messages, str): - messages = [{"role": "user", "content": messages}] - elif isinstance(messages, dict): - messages = [messages] - - prompt = pipeline.tokenizer.apply_chat_template( - conversation=messages, - add_generation_prompt=request.add_generation_prompt, - tokenize=False, - ) + if hasattr(pipeline.tokenizer, "apply_chat_template"): + prompt = pipeline.tokenizer.apply_chat_template( + conversation=messages, + add_generation_prompt=request.add_generation_prompt, + tokenize=False, + ) + else: + # tokenizer.apply_chat_template requires Transformers>=4.34, so + # if it is not available, default to standard chatml + _LOGGER.warning( + "Cannot use tokenizer.apply_chat_template, please update to " + "transformers>=4.34 for best chat results. Defaulting to ChatML" + ) + prompt = apply_chatml_chat_template(messages=messages) except Exception as e: _LOGGER.error(f"Error in applying chat template from request: {str(e)}") return create_error_response(HTTPStatus.BAD_REQUEST, str(e)) diff --git a/tests/server/test_openai.py b/tests/server/test_openai.py index 9a40ce203a..9d023c2331 100644 --- a/tests/server/test_openai.py +++ b/tests/server/test_openai.py @@ -100,13 +100,9 @@ def test_chat_completions_string(client, model_card): assert response.status_code == 200 usage = response.json()["usage"] - assert usage["prompt_tokens"] == 8 assert usage["completion_tokens"] == max_tokens assert usage["total_tokens"] == usage["prompt_tokens"] + usage["completion_tokens"] - message = response.json()["choices"][0]["message"] - assert message["content"] == "\n\n\nPossible story:\n\nLily and Ben were playing in" - def test_chat_completions_dict(client, model_card): max_tokens = 15 @@ -119,13 +115,9 @@ def test_chat_completions_dict(client, model_card): assert response.status_code == 200 usage = response.json()["usage"] - assert usage["prompt_tokens"] == 8 assert usage["completion_tokens"] == max_tokens assert usage["total_tokens"] == usage["prompt_tokens"] + usage["completion_tokens"] - message = response.json()["choices"][0]["message"] - assert message["content"] == "\n\n\nPossible story:\n\nLily and Ben were playing in" - def test_chat_completions_list(client, model_card): max_tokens = 15 @@ -138,13 +130,9 @@ def test_chat_completions_list(client, model_card): assert response.status_code == 200 usage = response.json()["usage"] - assert usage["prompt_tokens"] == 8 assert usage["completion_tokens"] == max_tokens assert usage["total_tokens"] == usage["prompt_tokens"] + usage["completion_tokens"] - message = response.json()["choices"][0]["message"] - assert message["content"] == "\n\n\nPossible story:\n\nLily and Ben were playing in" - def test_chat_completions_multiturn(client, model_card): max_tokens = 20 @@ -162,24 +150,10 @@ def test_chat_completions_multiturn(client, model_card): assert response.status_code == 200 usage = response.json()["usage"] - assert usage["prompt_tokens"] == 21 assert usage["completion_tokens"] == max_tokens assert usage["total_tokens"] == usage["prompt_tokens"] + usage["completion_tokens"] -def test_chat_completions_fastchat_list(client, model_card): - request = ChatCompletionRequest( - messages=[ - {"role": "system", "content": "You are a helpful assistant."}, - {"role": "user", "content": "Hello!"}, - ], - max_tokens=50, - model=model_card.id, - ) - response = client.post("/v1/chat/completions", json=request.dict()) - assert response.status_code == 200 - - def test_completions(client, model_card): max_tokens = 30 request = CompletionRequest( @@ -192,3 +166,9 @@ def test_completions(client, model_card): assert usage["prompt_tokens"] == 5 assert usage["completion_tokens"] == max_tokens assert usage["total_tokens"] == usage["prompt_tokens"] + usage["completion_tokens"] + + assert ( + response.json()["choices"][0]["text"] + == 'a was very happy and thanked the man. He said, "Thank you, Sara. You are a ' + + 'good friend."\n\nSara smiled and' + )