# Summary of this change set (commentary only; placed before the first
# "diff --git" header):
#
# 1. .gitignore
#    Removes the `ratls/` entry and the blank line that followed it.
#
# 2. cvm/docker-compose.yml
#    Switches the `vllm` service image from `vllm/vllm-openai:v0.13.0` to
#    `ghcr.io/concrete-security/vllm-openai:v0.13.0-harmony-fix`.
#
# 3. cvm/vllm-patch/Dockerfile (new file)
#    Builds FROM `vllm/vllm-openai:v0.13.0`, copies the patch below to /tmp,
#    takes the first entry of `site.getsitepackages()` as the target
#    directory, applies the patch there with `patch -p1 -d`, then deletes
#    the copied patch file.
#
# 4. cvm/vllm-patch/harmony-streaming-tool-call-fallback.patch (new file)
#    A patch against vllm/entrypoints/openai/serving_chat.py that:
#      - creates `harmony_token_ids`, one empty list per choice;
#      - appends every token id fed to the harmony parser to the list of
#        the current choice `i`;
#      - just before the existing "Send the finish response" comment, when
#        `self.use_harmony` is set and `harmony_tools_streamed[i]` is still
#        False, calls `_extract_harmony_tool_calls_streaming_fallback` on
#        the accumulated token ids. If that returns tool calls,
#        `delta_message` is replaced by a new `DeltaMessage` carrying the
#        previous `content` and `reasoning` plus the recovered
#        `tool_calls`, and `harmony_tools_streamed[i]` is set to True;
#      - adds the helper `_extract_harmony_tool_calls_streaming_fallback`,
#        which returns [] when the request has no tools, when
#        `self.tool_parser` is None, when `extract_tool_calls` raises (the
#        exception is logged), or when `tools_called` is falsy; otherwise
#        it returns one `DeltaToolCall` per extracted tool call, indexed
#        by enumeration order.
#
# NOTE(review): this copy of the diff has its line breaks collapsed; the
#   original hunk line structure must be restored before it can be applied.
# NOTE(review): the rebuilt DeltaMessage copies only `content` and
#   `reasoning` from the previous delta -- confirm no other field set on
#   the previous delta needs to be carried over.
# NOTE(review): the helper imports `logging` inside the `except` handler;
#   if serving_chat.py already has a module-level logger, prefer that --
#   TODO confirm against the target file.
# NOTE(review): `harmony_token_ids` receives every generated token for
#   every choice, whether or not the request declares tools -- confirm the
#   extra memory is acceptable for long generations.
# NOTE(review): the compose file references the patched image by tag, not
#   by digest -- confirm this is acceptable for this deployment.
# NOTE(review): the Dockerfile assumes a `patch` executable exists in the
#   base image -- TODO confirm.
diff --git a/.gitignore b/.gitignore index 844779e..ad533e0 100644 --- a/.gitignore +++ b/.gitignore @@ -10,8 +10,6 @@ build/ __pypackages__/ .pytest_cache frontend/tsconfig.tsbuildinfo -ratls/ - # Claude .claude/settings.local.json CLAUDE.local.md diff --git a/cvm/docker-compose.yml b/cvm/docker-compose.yml index c9d369b..20901db 100644 --- a/cvm/docker-compose.yml +++ b/cvm/docker-compose.yml @@ -1,6 +1,6 @@ services: vllm: - image: vllm/vllm-openai:v0.13.0 + image: ghcr.io/concrete-security/vllm-openai:v0.13.0-harmony-fix container_name: vllm environment: - NVIDIA_VISIBLE_DEVICES=all diff --git a/cvm/vllm-patch/Dockerfile b/cvm/vllm-patch/Dockerfile new file mode 100644 index 0000000..81cda9b --- /dev/null +++ b/cvm/vllm-patch/Dockerfile @@ -0,0 +1,6 @@ +FROM vllm/vllm-openai:v0.13.0 + +COPY harmony-streaming-tool-call-fallback.patch /tmp/ +RUN SITE=$(python3 -c "import site; print(site.getsitepackages()[0])") \ + && patch -p1 -d "$SITE" < /tmp/harmony-streaming-tool-call-fallback.patch \ + && rm /tmp/harmony-streaming-tool-call-fallback.patch diff --git a/cvm/vllm-patch/harmony-streaming-tool-call-fallback.patch b/cvm/vllm-patch/harmony-streaming-tool-call-fallback.patch new file mode 100644 index 0000000..17396ff --- /dev/null +++ b/cvm/vllm-patch/harmony-streaming-tool-call-fallback.patch @@ -0,0 +1,99 @@ +--- a/vllm/entrypoints/openai/serving_chat.py ++++ b/vllm/entrypoints/openai/serving_chat.py +@@ -547,6 +547,7 @@ + get_streamable_parser_for_assistant() for _ in range(num_choices) + ] + harmony_tools_streamed = [False] * num_choices ++ harmony_token_ids = [[] for _ in range(num_choices)] + tools_streamed = [False] * num_choices + + if isinstance(request.tool_choice, ChatCompletionNamedToolChoiceParam): +@@ -739,6 +740,7 @@ + delta_text = "" + for token_id in output.token_ids: + harmony_parser.process(token_id) ++ harmony_token_ids[i].append(token_id) + delta_text += harmony_parser.last_content_delta or "" + cur_channel = harmony_parser.current_channel + 
cur_recipient = harmony_parser.current_recipient +@@ -1194,6 +1196,33 @@ + ] + ) + ++ # Harmony can occasionally reach finish with no ++ # streamed tool delta even though the full token ++ # stream contains a valid function call. Recover ++ # by re-parsing the accumulated harmony tokens once ++ # at finish. ++ if self.use_harmony and not harmony_tools_streamed[i]: ++ recovered_tool_calls = ( ++ self._extract_harmony_tool_calls_streaming_fallback( ++ request=request, ++ tokenizer=tokenizer, ++ token_ids=harmony_token_ids[i], ++ ) ++ ) ++ if recovered_tool_calls: ++ delta_message = DeltaMessage( ++ content=( ++ delta_message.content ++ if delta_message else None ++ ), ++ reasoning=( ++ delta_message.reasoning ++ if delta_message else None ++ ), ++ tool_calls=recovered_tool_calls, ++ ) ++ harmony_tools_streamed[i] = True ++ + # Send the finish response for each request.n only once + # In OpenAI's API, when a tool is called, the + # finish_reason is: +@@ -1756,6 +1785,46 @@ + and request.tool_choice in ["auto", None] + ) + ++ def _extract_harmony_tool_calls_streaming_fallback( ++ self, ++ request, ++ tokenizer, ++ token_ids, ++ ) -> list: ++ """Recover tool calls from accumulated harmony tokens at finish.""" ++ if not request.tools or self.tool_parser is None: ++ return [] ++ ++ try: ++ tool_parser = self.tool_parser(tokenizer) ++ tool_call_info = tool_parser.extract_tool_calls( ++ "", ++ request=request, ++ token_ids=token_ids, ++ ) ++ except Exception: ++ import logging ++ logging.getLogger(__name__).exception( ++ "Error extracting harmony tool calls at finish." 
++ ) ++ return [] ++ ++ if not tool_call_info.tools_called: ++ return [] ++ ++ return [ ++ DeltaToolCall( ++ id=tool_call.id, ++ type=tool_call.type, ++ index=index, ++ function=DeltaFunctionCall( ++ name=tool_call.function.name, ++ arguments=tool_call.function.arguments, ++ ), ++ ) ++ for index, tool_call in enumerate(tool_call_info.tool_calls) ++ ] ++ + def _should_check_for_unstreamed_tool_arg_tokens( + self, + delta_message: DeltaMessage | None,