Fix broken AsyncInferenceClient on [DONE] signal #2458

Merged
merged 3 commits on Aug 19, 2024
6 changes: 3 additions & 3 deletions src/huggingface_hub/inference/_common.py
@@ -296,7 +296,7 @@ def _format_text_generation_stream_output(
     if not byte_payload.startswith(b"data:"):
         return None  # empty line

-    if byte_payload == b"data: [DONE]":
+    if byte_payload.strip() == b"data: [DONE]":
         raise StopIteration("[DONE] signal received.")

     # Decode payload
@@ -344,7 +344,7 @@ def _format_chat_completion_stream_output(
     if not byte_payload.startswith(b"data:"):
         return None  # empty line

-    if byte_payload == b"data: [DONE]":
+    if byte_payload.strip() == b"data: [DONE]":
         raise StopIteration("[DONE] signal received.")

     # Decode payload
@@ -355,7 +355,7 @@ def _format_chat_completion_stream_output(

 async def _async_yield_from(client: "ClientSession", response: "ClientResponse") -> AsyncIterable[bytes]:
     async for byte_payload in response.content:
-        yield byte_payload
+        yield byte_payload.strip()
@cordawyn commented on Aug 19, 2024:
@Wauplin Btw, the two other byte_payload.strip() calls (above) may not be necessary: _format_text_generation_stream_output and _format_chat_completion_stream_output receive "[DONE]" without a trailing "\n" from the regular client (which uses .iter_lines and strips newlines naturally), while the async client's byte_payload passes through _async_yield_from here first.

Maybe just a micro-optimization, in case performance ever becomes an issue here.
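To make this asymmetry concrete, here is a minimal sketch; the endpoint URL is hypothetical and the printed payloads are illustrative, but the line-splitting behavior of requests and aiohttp is the documented one:

```python
import asyncio

import aiohttp
import requests

URL = "http://localhost:8080/stream"  # hypothetical SSE endpoint, for illustration only


def sync_lines() -> None:
    # requests' iter_lines() removes the trailing newline from each line,
    # so the sync client's formatters already receive b"data: [DONE]" exactly.
    with requests.get(URL, stream=True) as response:
        for line in response.iter_lines():
            print(repr(line))  # e.g. b'data: [DONE]'


async def async_lines() -> None:
    # aiohttp's response.content yields raw lines that keep the trailing b"\n",
    # so before this PR the async client compared b"data: [DONE]\n" against
    # b"data: [DONE]" and never detected the stop signal.
    async with aiohttp.ClientSession() as session:
        async with session.get(URL) as response:
            async for line in response.content:
                print(repr(line))  # e.g. b'data: [DONE]\n'


if __name__ == "__main__":
    sync_lines()
    asyncio.run(async_lines())
```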

@Wauplin (Contributor, Author) replied:

Yes, I saw that, but it's better to be safe. Each method, taken individually, should be able to accept unstripped data and return stripped data. In any case, inference time is usually much higher than the parsing overhead.

     await client.close()


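To illustrate the defensive parsing discussed above, here is a minimal, self-contained sketch; it condenses the two _format_*_stream_output helpers into one hypothetical function and is not the library's exact code:

```python
import json
from typing import Optional


def format_stream_output(byte_payload: bytes) -> Optional[dict]:
    """Hypothetical condensation of the _format_*_stream_output helpers."""
    # Ignore anything that is not an SSE data line (empty lines, keep-alives).
    if not byte_payload.startswith(b"data:"):
        return None
    # strip() makes the comparison robust whether or not the upstream iterator
    # (sync .iter_lines() or the patched _async_yield_from) already removed
    # the trailing newline.
    if byte_payload.strip() == b"data: [DONE]":
        raise StopIteration("[DONE] signal received.")
    # Decode payload: drop the "data:" prefix and parse the JSON chunk.
    return json.loads(byte_payload.decode("utf-8").removeprefix("data:"))


# Both the stripped stop signal (sync path) and the unstripped variants
# (async path before this PR) now reach the StopIteration branch:
for payload in (b"data: [DONE]", b"data: [DONE]\n", b"data: [DONE] "):
    try:
        format_stream_output(payload)
    except StopIteration:
        print(f"stop signal detected in {payload!r}")
```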
53 changes: 22 additions & 31 deletions tests/cassettes/test_async_chat_completion_with_stream.yaml
@@ -3,80 +3,71 @@ interactions:
body: null
headers:
user-agent:
- - unknown/None; hf_hub/0.22.0.dev0; python/3.10.12; torch/2.2.0; tensorflow/2.15.0.post1;
+ - unknown/None; hf_hub/0.25.0.dev0; python/3.10.12; torch/2.3.1; tensorflow/2.17.0;
fastcore/1.5.23
method: POST
uri: https://api-inference.huggingface.co/models/HuggingFaceH4/zephyr-7b-beta/v1/chat/completions
response:
body:
- string: 'data:{"id":"","object":"text_completion","created":1710439508,"model":"HuggingFaceH4/zephyr-7b-beta","system_fingerprint":"1.4.3-sha-e6bb3ff","choices":[{"index":0,"delta":{"role":"assistant","content":"Deep"},"logprobs":null,"finish_reason":null}]}
+ string: 'data:{"id":"","object":"text_completion","created":1724071633,"model":"HuggingFaceH4/zephyr-7b-beta","system_fingerprint":"2.0.4-sha-f426a33","choices":[{"index":0,"delta":{"role":"assistant","content":"Deep"},"logprobs":null,"finish_reason":null}]}


- data:{"id":"","object":"text_completion","created":1710439508,"model":"HuggingFaceH4/zephyr-7b-beta","system_fingerprint":"1.4.3-sha-e6bb3ff","choices":[{"index":0,"delta":{"role":"assistant","content":"
- Learning"},"logprobs":null,"finish_reason":null}]}
+ data:{"id":"","object":"text_completion","created":1724071633,"model":"HuggingFaceH4/zephyr-7b-beta","system_fingerprint":"2.0.4-sha-f426a33","choices":[{"index":0,"delta":{"role":"assistant","content":"
+ learning"},"logprobs":null,"finish_reason":null}]}


- data:{"id":"","object":"text_completion","created":1710439508,"model":"HuggingFaceH4/zephyr-7b-beta","system_fingerprint":"1.4.3-sha-e6bb3ff","choices":[{"index":0,"delta":{"role":"assistant","content":"
+ data:{"id":"","object":"text_completion","created":1724071633,"model":"HuggingFaceH4/zephyr-7b-beta","system_fingerprint":"2.0.4-sha-f426a33","choices":[{"index":0,"delta":{"role":"assistant","content":"
is"},"logprobs":null,"finish_reason":null}]}


- data:{"id":"","object":"text_completion","created":1710439508,"model":"HuggingFaceH4/zephyr-7b-beta","system_fingerprint":"1.4.3-sha-e6bb3ff","choices":[{"index":0,"delta":{"role":"assistant","content":"
+ data:{"id":"","object":"text_completion","created":1724071633,"model":"HuggingFaceH4/zephyr-7b-beta","system_fingerprint":"2.0.4-sha-f426a33","choices":[{"index":0,"delta":{"role":"assistant","content":"
a"},"logprobs":null,"finish_reason":null}]}


- data:{"id":"","object":"text_completion","created":1710439508,"model":"HuggingFaceH4/zephyr-7b-beta","system_fingerprint":"1.4.3-sha-e6bb3ff","choices":[{"index":0,"delta":{"role":"assistant","content":"
+ data:{"id":"","object":"text_completion","created":1724071633,"model":"HuggingFaceH4/zephyr-7b-beta","system_fingerprint":"2.0.4-sha-f426a33","choices":[{"index":0,"delta":{"role":"assistant","content":"
sub"},"logprobs":null,"finish_reason":null}]}


- data:{"id":"","object":"text_completion","created":1710439508,"model":"HuggingFaceH4/zephyr-7b-beta","system_fingerprint":"1.4.3-sha-e6bb3ff","choices":[{"index":0,"delta":{"role":"assistant","content":"field"},"logprobs":null,"finish_reason":null}]}
+ data:{"id":"","object":"text_completion","created":1724071633,"model":"HuggingFaceH4/zephyr-7b-beta","system_fingerprint":"2.0.4-sha-f426a33","choices":[{"index":0,"delta":{"role":"assistant","content":"field"},"logprobs":null,"finish_reason":null}]}


- data:{"id":"","object":"text_completion","created":1710439508,"model":"HuggingFaceH4/zephyr-7b-beta","system_fingerprint":"1.4.3-sha-e6bb3ff","choices":[{"index":0,"delta":{"role":"assistant","content":"
+ data:{"id":"","object":"text_completion","created":1724071633,"model":"HuggingFaceH4/zephyr-7b-beta","system_fingerprint":"2.0.4-sha-f426a33","choices":[{"index":0,"delta":{"role":"assistant","content":"
of"},"logprobs":null,"finish_reason":null}]}


- data:{"id":"","object":"text_completion","created":1710439508,"model":"HuggingFaceH4/zephyr-7b-beta","system_fingerprint":"1.4.3-sha-e6bb3ff","choices":[{"index":0,"delta":{"role":"assistant","content":"
- Machine"},"logprobs":null,"finish_reason":null}]}
+ data:{"id":"","object":"text_completion","created":1724071633,"model":"HuggingFaceH4/zephyr-7b-beta","system_fingerprint":"2.0.4-sha-f426a33","choices":[{"index":0,"delta":{"role":"assistant","content":"
+ machine"},"logprobs":null,"finish_reason":null}]}


- data:{"id":"","object":"text_completion","created":1710439508,"model":"HuggingFaceH4/zephyr-7b-beta","system_fingerprint":"1.4.3-sha-e6bb3ff","choices":[{"index":0,"delta":{"role":"assistant","content":"
- Learning"},"logprobs":null,"finish_reason":null}]}
+ data:{"id":"","object":"text_completion","created":1724071633,"model":"HuggingFaceH4/zephyr-7b-beta","system_fingerprint":"2.0.4-sha-f426a33","choices":[{"index":0,"delta":{"role":"assistant","content":"
+ learning"},"logprobs":null,"finish_reason":null}]}


- data:{"id":"","object":"text_completion","created":1710439508,"model":"HuggingFaceH4/zephyr-7b-beta","system_fingerprint":"1.4.3-sha-e6bb3ff","choices":[{"index":0,"delta":{"role":"assistant","content":"
- that"},"logprobs":null,"finish_reason":null}]}
+ data:{"id":"","object":"text_completion","created":1724071633,"model":"HuggingFaceH4/zephyr-7b-beta","system_fingerprint":"2.0.4-sha-f426a33","choices":[{"index":0,"delta":{"role":"assistant","content":"
+ that"},"logprobs":null,"finish_reason":"length"}]}


- data:{"id":"","object":"text_completion","created":1710439508,"model":"HuggingFaceH4/zephyr-7b-beta","system_fingerprint":"1.4.3-sha-e6bb3ff","choices":[{"index":0,"delta":{},"logprobs":null,"finish_reason":"length"}]}
'
headers:
Access-Control-Allow-Credentials:
- 'true'
Access-Control-Allow-Origin:
- '*'
Cache-Control:
- no-cache
Connection:
- keep-alive
Content-Length:
- '2526'
Content-Type:
- text/event-stream
Date:
- Thu, 14 Mar 2024 18:05:08 GMT
Transfer-Encoding:
- chunked
- Mon, 19 Aug 2024 13:01:29 GMT
Vary:
- origin, Origin, Access-Control-Request-Method, Access-Control-Request-Headers
x-accel-buffering:
- 'no'
x-compute-characters:
- '103'
- Origin, Access-Control-Request-Method, Access-Control-Request-Headers
x-compute-type:
- 1-a10-g
- cache
x-request-id:
- idvh81inTm9FBUT-za5t7
- w9oS4KSPCoEAOi6QV7-cX
x-sha:
- b70e0c9a2d9e14bd1e812d3c398e5f313e93b473
status:
code: 200
message: OK
url: https://api-inference.huggingface.co/models/HuggingFaceH4/zephyr-7b-beta/v1/chat/completions
version: 1
8 changes: 3 additions & 5 deletions tests/test_inference_async_client.py
@@ -212,18 +212,16 @@ async def test_async_chat_completion_with_stream() -> None:

     all_items = [item async for item in output]
     generated_text = ""
-    for item in all_items[:-1]:  # all but last item
+    for item in all_items:
         assert isinstance(item, ChatCompletionStreamOutput)
         assert len(item.choices) == 1
         generated_text += item.choices[0].delta.content
     last_item = all_items[-1]

-    assert generated_text == "Deep Learning is a subfield of Machine Learning that"
+    assert generated_text == "Deep learning is a subfield of machine learning that"

-    # Last item has a finish reason but no role/content delta
+    # Last item has a finish reason
     assert last_item.choices[0].finish_reason == "length"
-    assert last_item.choices[0].delta.role is None
-    assert last_item.choices[0].delta.content is None

@pytest.mark.vcr
10 changes: 6 additions & 4 deletions tests/test_inference_client.py
@@ -937,13 +937,14 @@ def test_chat_completion_base_url_works_with_v1(base_url: str):
     assert post_mock.call_args_list[0].kwargs["model"] == "http://0.0.0.0:8080/v1/chat/completions"


-def test_stream_text_generation_response():
+@pytest.mark.parametrize("stop_signal", [b"data: [DONE]", b"data: [DONE]\n", b"data: [DONE] "])
+def test_stream_text_generation_response(stop_signal: bytes):
     data = [
         b'data: {"index":1,"token":{"id":4560,"text":" trying","logprob":-2.078125,"special":false},"generated_text":null,"details":null}',
         b"",  # Empty line is skipped
         b"\n",  # Newline is skipped
         b'data: {"index":2,"token":{"id":311,"text":" to","logprob":-0.026245117,"special":false},"generated_text":" trying to","details":null}',
-        b"data: [DONE]",  # Stop signal
+        stop_signal,  # Stop signal
         # Won't parse after
         b'data: {"index":2,"token":{"id":311,"text":" to","logprob":-0.026245117,"special":false},"generated_text":" trying to","details":null}',
     ]
@@ -952,13 +953,14 @@ def test_stream_text_generation_response():
     assert output == [" trying", " to"]


-def test_stream_chat_completion_response():
+@pytest.mark.parametrize("stop_signal", [b"data: [DONE]", b"data: [DONE]\n", b"data: [DONE] "])
+def test_stream_chat_completion_response(stop_signal: bytes):
     data = [
         b'data: {"object":"chat.completion.chunk","id":"","created":1721737661,"model":"","system_fingerprint":"2.1.2-dev0-sha-5fca30e","choices":[{"index":0,"delta":{"role":"assistant","content":"Both"},"logprobs":null,"finish_reason":null}]}',
         b"",  # Empty line is skipped
         b"\n",  # Newline is skipped
         b'data: {"object":"chat.completion.chunk","id":"","created":1721737661,"model":"","system_fingerprint":"2.1.2-dev0-sha-5fca30e","choices":[{"index":0,"delta":{"role":"assistant","content":" Rust"},"logprobs":null,"finish_reason":null}]}',
-        b"data: [DONE]",  # Stop signal
+        stop_signal,  # Stop signal
         # Won't parse after
         b'data: {"index":2,"token":{"id":311,"text":" to","logprob":-0.026245117,"special":false},"generated_text":" trying to","details":null}',
     ]