
Commit c285452

[Bugfix] Fix harmony library format mismatch for streaming tool calls
This commit addresses an issue where the harmony library passes data to the model in a different format than what the model outputs, causing the model to become confused and stop responding when /v1/chat/completions is used with stream and tools.

The fix updates the message format to match the model's expected output:

- Move the recipient info from the assistant start tag to the channel tag
- Change the content type from 'json' to '<|constrain|>json'
- Replace the <|end|> token with the <|call|> token for tool calls

This is a temporary fix until the underlying format mismatch is properly resolved.

Signed-off-by: kyt <[email protected]>
1 parent: 7920de0
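
To make the mismatch concrete, here is roughly what the rendered harmony text for a single assistant tool call looks like before and after the fix. This is an illustrative sketch assembled from the bullet points above; the get_weather function and its arguments are hypothetical, and the exact rendering is up to the harmony encoding:

```
# What the library rendered before this fix:
<|start|>assistant to=functions.get_weather<|channel|>commentary json<|message|>{"city": "Tokyo"}<|end|>

# What the model actually emits for a tool call:
<|start|>assistant<|channel|>commentary to=functions.get_weather <|constrain|>json<|message|>{"city": "Tokyo"}<|call|>
```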

File tree (2 files changed: +43 −11 lines)

- vllm/entrypoints/harmony_utils.py
- vllm/entrypoints/openai/serving_chat.py

vllm/entrypoints/harmony_utils.py
Lines changed: 40 additions & 10 deletions

@@ -16,9 +16,9 @@
 from openai.types.responses.response_reasoning_item import (
     Content as ResponseReasoningTextContent)
 from openai.types.responses.tool import Tool
-from openai_harmony import (Author, ChannelConfig, Conversation,
-                            DeveloperContent, HarmonyEncodingName, Message,
-                            ReasoningEffort, Role, StreamableParser,
+from openai_harmony import (Author, ChannelConfig, DeveloperContent,
+                            HarmonyEncodingName, Message, ReasoningEffort,
+                            RenderOptions, Role, StreamableParser,
                             SystemContent, TextContent, ToolDescription,
                             load_harmony_encoding)

@@ -213,14 +213,18 @@ def parse_chat_input(chat_msg) -> list[Message]:
     tool_calls = chat_msg.get("tool_calls")
     if role == "assistant" and tool_calls:
         msgs: list[Message] = []
+        content = chat_msg.get("content") or ""
+        analysis_msg = Message.from_role_and_content(Role.ASSISTANT, content)
+        analysis_msg = analysis_msg.with_channel("analysis")
+        msgs.append(analysis_msg)
+
         for call in tool_calls:
             func = call.get("function", {})
             name = func.get("name", "")
             arguments = func.get("arguments", "") or ""
             msg = Message.from_role_and_content(Role.ASSISTANT, arguments)
-            msg = msg.with_channel("commentary")
-            msg = msg.with_recipient(f"functions.{name}")
-            msg = msg.with_content_type("json")
+            msg = msg.with_channel(f"commentary to=functions.{name}")
+            msg.with_content_type("json")
             msgs.append(msg)
         return msgs

@@ -230,7 +234,7 @@ def parse_chat_input(chat_msg) -> list[Message]:
         content = chat_msg.get("content", "") or ""
         msg = Message.from_author_and_content(
             Author.new(Role.TOOL, f"functions.{name}"),
-            content).with_channel("commentary")
+            content).with_channel("commentary").with_recipient("assistant")
         return [msg]

     # Default: user/assistant/system messages with content
@@ -245,9 +249,35 @@ def parse_chat_input(chat_msg) -> list[Message]:


 def render_for_completion(messages: list[Message]) -> list[int]:
-    conversation = Conversation.from_messages(messages)
-    token_ids = get_encoding().render_conversation_for_completion(
-        conversation, Role.ASSISTANT)
+    if not messages:
+        return []
+
+    token_ids = []
+    encoding = get_encoding()
+    end_token_ids = encoding.encode("<|end|>", allowed_special={"<|end|>"})
+    call_token_ids = encoding.encode("<|call|>", allowed_special={"<|call|>"})
+
+    has_function_tools = any(
+        msg.author.role == Role.DEVELOPER and msg.content[0] and hasattr(
+            msg.content[0], 'tools') and msg.content[0].tools is not None
+        and msg.content[0].tools["functions"] is not None for msg in messages)
+
+    for i, msg in enumerate(messages):
+        msg_tokens = encoding.render(
+            msg,
+            RenderOptions(conversation_has_function_tools=has_function_tools))
+        is_tool_call = (msg.author.role == Role.ASSISTANT and msg.channel
+                        and "functions." in msg.channel)
+        if (i < len(messages) - 1 and is_tool_call and end_token_ids
+                and call_token_ids and msg_tokens
+                and msg_tokens[-1] == end_token_ids[0]):
+            msg_tokens[-1] = call_token_ids[0]
+        token_ids.extend(msg_tokens)
+
+    start_assistant_tokens = encoding.encode("<|start|>assistant",
+                                             allowed_special={"<|start|>"})
+    token_ids.extend(start_assistant_tokens)
+
     return token_ids

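For context, a minimal sketch of how the two patched helpers fit together. The chat history and the get_weather tool are hypothetical, and this assumes the module path shown in the diff header; it is not part of the commit:

```python
# Sketch: feed an OpenAI-style chat history through the patched helpers.
# Because the tool-call turn is not the last message, render_for_completion
# swaps its trailing <|end|> token for <|call|>, then ends the prompt with
# <|start|>assistant so the model continues from there.
from vllm.entrypoints.harmony_utils import (parse_chat_input,
                                            render_for_completion)

chat_history = [
    {"role": "user", "content": "What's the weather in Tokyo?"},
    {
        "role": "assistant",
        "content": "",
        "tool_calls": [{
            "function": {
                "name": "get_weather",  # hypothetical tool
                "arguments": '{"city": "Tokyo"}',
            },
        }],
    },
    {"role": "tool", "name": "get_weather", "content": '{"temp_c": 21}'},
]

# parse_chat_input turns each chat-completions message into harmony Messages.
messages = []
for chat_msg in chat_history:
    messages.extend(parse_chat_input(chat_msg))

token_ids = render_for_completion(messages)  # prompt token ids for the model
```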
vllm/entrypoints/openai/serving_chat.py
Lines changed: 3 additions & 1 deletion

@@ -1569,7 +1569,9 @@ def _make_request_with_harmony(
         sys_msg = get_system_message(
             reasoning_effort=request.reasoning_effort,
             browser_description=None,
-            python_description=None)
+            python_description=None,
+            with_custom_tools=request.tools is not None
+        )
         messages.append(sys_msg)

         # Add developer message.
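
A short sketch of the call-site change above: the system message is now told whether the request actually carries custom tools. The with_custom_tools parameter mirrors `request.tools is not None` from the diff; the reasoning-effort value is illustrative, and this assumes get_system_message is imported from vllm/entrypoints/harmony_utils.py:

```python
from vllm.entrypoints.harmony_utils import get_system_message

# Build the harmony system message; pass with_custom_tools=True only when
# the chat request declares tools, so function calling is advertised
# on demand rather than unconditionally.
sys_msg = get_system_message(
    reasoning_effort="medium",  # illustrative value
    browser_description=None,
    python_description=None,
    with_custom_tools=True,
)
```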
