
Commit b775a39

Support Responses Streaming (vllm-project#21)
1 parent 076cfce commit b775a39

File tree: 6 files changed, +635 -36 lines changed

responses_api.py

Lines changed: 48 additions & 11 deletions
@@ -1,7 +1,12 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """
+<<<<<<< HEAD
 vllm serve /data/woosuk/os-mini-weights/pytorch-rc-20b --enforce-eager
+=======
+vllm serve /data/woosuk/os-mini-weights/pytorch-rc-20b \
+    --tokenizer /data/xmo/os-mini/models/hf-converted --enforce-eager
+>>>>>>> 4a52d33d8 (streaming support)
 """
 import argparse
 import json
@@ -230,16 +235,48 @@ def test_stateful_multi_turn():
 
 
 def test_streaming():
-    response = client.responses.create(
-        model=MODEL,
-        input="What is 13 * 24? Explain your answer.",
-        stream=True,
-    )
+    prompts = [
+        "tell me a story about a cat in 20 words",
+        "What is 13 * 24? Use python to calculate the result.",
+        "When did Jensen found NVIDIA? Search it and answer the year only."
+    ]
+    for prompt in prompts:
+        print(f"\n{prompt}\n")
+        response = client.responses.create(
+            model=MODEL,
+            input=prompt,
+            reasoning={"effort": "low"},
+            tools=[{
+                "type": "web_search_preview"
+            }, {
+                "type": "code_interpreter",
+                "container": {
+                    "type": "auto"
+                }
+            }],
+            stream=True,
+        )
+
+        events = []
+        current_event_mode = None
+
+        for event in response:
+            if current_event_mode != event.type:
+                current_event_mode = event.type
+                print(f"\n[{event.type}] ", end="", flush=True)
+
+            if "text.delta" in event.type:
+                print(event.delta, end="", flush=True)
+            elif "reasoning_text.delta" in event.type:
+                print(f"{event.delta}", end="", flush=True)
+            elif "response.code_interpreter_call_code.done" in event.type:
+                print(f"Code: {event.code}", end="", flush=True)
+            elif ("response.output_item.added" in event.type
+                  and event.item.type == "web_search_call"):
+                print(f"Web search: {event.item.action}", end="", flush=True)
+            events.append(event)
 
-    for event in response:
-        if "text.delta" in event.type:
-            print(event.delta, end="", flush=True)
-    print()
+        print("\n--------------------------------\n")
 
 
 def test_web_search():
@@ -600,8 +637,8 @@ def test_function_calling_full_history():
     test_stateful_multi_turn()
 
     # 3. Streaming tests:
-    # print("===test_streaming:")
-    # test_streaming()
+    print("===test_streaming:")
+    test_streaming()
 
     # 4. Tool tests:
     print("===test_web_search:")

vllm/entrypoints/context.py

Lines changed: 68 additions & 5 deletions
@@ -3,8 +3,11 @@
 from abc import ABC, abstractmethod
 from typing import TYPE_CHECKING, Optional
 
-from vllm.entrypoints.harmony_utils import (parse_output_into_messages,
-                                            render_for_completion)
+from openai_harmony import Role, StreamState
+
+from vllm.entrypoints.harmony_utils import (
+    get_encoding, get_streamable_parser_for_assistant,
+    parse_output_into_messages, render_for_completion)
 from vllm.outputs import RequestOutput
 
 if TYPE_CHECKING:
@@ -51,7 +54,7 @@ def __init__(
         browser_tool,
         python_tool,
     ):
-        self.messages = messages
+        self._messages = messages
         self.browser_tool = browser_tool
         self.python_tool = python_tool
 
@@ -60,16 +63,20 @@ def __init__(
         self.num_prompt_tokens = 0
         self.num_cached_tokens = 0
         self.num_output_tokens = 0
+        self.num_reasoning_tokens = 0
 
     def append_output(self, output) -> None:
-        # TODO: Support streaming.
         if isinstance(output, RequestOutput):
             output_token_ids = output.outputs[0].token_ids
             output_msgs = parse_output_into_messages(output_token_ids)
         else:
             # Tool output.
             output_msgs = output
-        self.messages.extend(output_msgs)
+        self._messages.extend(output_msgs)
+
+    @property
+    def messages(self) -> list:
+        return self._messages
 
     def get_tool_call(self) -> Optional["Tool"]:
         last_msg = self.messages[-1]
@@ -83,3 +90,59 @@ def get_tool_call(self) -> Optional["Tool"]:
 
     def render_for_completion(self) -> list[int]:
         return render_for_completion(self.messages)
+
+
+class StreamingHarmonyContext(HarmonyContext):
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self.last_output = None
+
+        self.parser = get_streamable_parser_for_assistant()
+        self.encoding = get_encoding()
+        self.last_tok = None
+
+    @property
+    def messages(self) -> list:
+        return self.parser.messages
+
+    def append_output(self, output) -> None:
+        if isinstance(output, RequestOutput):
+            tok = output.outputs[0].token_ids[0]
+            self.parser.process(tok)
+            self.last_tok = tok
+        else:
+            # Handle the case of tool output in direct message format.
+            assert len(output) == 1, "Tool output should be a single message"
+            msg = output[0]
+            # Sometimes the recipient is not set for tool messages,
+            # so we set it to "assistant".
+            if msg.author.role == Role.TOOL and msg.recipient is None:
+                msg.recipient = "assistant"
+            toks = self.encoding.render(msg)
+            for tok in toks:
+                self.parser.process(tok)
+            self.last_tok = toks[-1]
+
+    def is_expecting_start(self) -> bool:
+        return self.parser.state == StreamState.EXPECT_START
+
+    def is_assistant_action_turn(self) -> bool:
+        return self.last_tok in self.encoding.stop_tokens_for_assistant_actions(
+        )
+
+    def render_for_completion(self) -> list[int]:
+        # Rendering appends the next turn's starting tokens
+        # (e.g. `<|start|>assistant`), so feed them through the
+        # parser as well to keep its state in sync.
+        rendered_tokens = super().render_for_completion()
+
+        last_n = -1
+        to_process = []
+        while rendered_tokens[last_n] != self.last_tok:
+            to_process.append(rendered_tokens[last_n])
+            last_n -= 1
+        for tok in reversed(to_process):
+            self.parser.process(tok)
+
+        return rendered_tokens
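
The subtle part of `StreamingHarmonyContext` is `render_for_completion`: rendering appends the next turn's header tokens (e.g. `<|start|>assistant`) after the last token the parser has already consumed, and that suffix must be replayed through the parser so its state stays in sync. The toy sketch below illustrates that replay logic with simplified stand-in types; it is not the `openai_harmony` API.

```python
# Self-contained toy: replay the rendered suffix that the parser has not seen.


class ToyParser:
    def __init__(self):
        self.seen: list[str] = []

    def process(self, tok: str) -> None:
        self.seen.append(tok)


def replay_tail(rendered: list[str], last_tok: str, parser: ToyParser) -> None:
    # Walk backwards until we hit the last token the parser already consumed,
    # then feed the remaining suffix forward, in order.
    idx = -1
    tail = []
    while rendered[idx] != last_tok:
        tail.append(rendered[idx])
        idx -= 1
    for tok in reversed(tail):
        parser.process(tok)


parser = ToyParser()
for tok in ["<|start|>", "user", "hello", "<|end|>"]:
    parser.process(tok)

# Renderer output for the next turn: everything so far plus the new header.
rendered = ["<|start|>", "user", "hello", "<|end|>", "<|start|>", "assistant"]
replay_tail(rendered, last_tok="<|end|>", parser=parser)
assert parser.seen == rendered
```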

vllm/entrypoints/harmony_utils.py

Lines changed: 6 additions & 5 deletions
@@ -209,15 +209,16 @@ def parse_output_message(message: Message):
             raise ValueError("Invalid number of contents in browser message")
         content = message.content[0]
         browser_call = json.loads(content.text)
+        # TODO: translate to url properly!
         if recipient == "browser.search":
-            action = ActionSearch(query=browser_call["query"], type="search")
+            action = ActionSearch(
+                query=f"cursor:{browser_call.get('query', '')}", type="search")
         elif recipient == "browser.open":
-            url = ""  # FIXME: browser_call["url"]
-            action = ActionOpenPage(url=url, type="open_page")
+            action = ActionOpenPage(
+                url=f"cursor:{browser_call.get('url', '')}", type="open_page")
         elif recipient == "browser.find":
-            url = ""  # FIXME: browser_call["url"]
             action = ActionFind(pattern=browser_call["pattern"],
-                                url=url,
+                                url=f"cursor:{browser_call.get('url', '')}",
                                 type="find")
        else:
             raise ValueError(f"Unknown browser action: {recipient}")

vllm/entrypoints/openai/protocol.py

Lines changed: 56 additions & 1 deletion
@@ -18,6 +18,7 @@
                                     Annotation as OpenAIAnnotation)
 # yapf: enable
 from openai.types.responses import (ResponseFunctionToolCall,
+                                    ResponseFunctionToolCallOutputItem,
                                     ResponseInputItemParam, ResponseOutputItem,
                                     ResponsePrompt, ResponseStatus,
                                     ResponseTextConfig)
@@ -1731,6 +1732,60 @@ class ResponseReasoningItem(OpenAIBaseModel):
     status: Optional[Literal["in_progress", "completed", "incomplete"]]
 
 
+class InputTokensDetails(OpenAIBaseModel):
+    cached_tokens: int
+
+
+class OutputTokensDetails(OpenAIBaseModel):
+    reasoning_tokens: int
+
+
+class ResponseUsage(OpenAIBaseModel):
+    input_tokens: int
+    input_tokens_details: InputTokensDetails
+    output_tokens: int
+    output_tokens_details: OutputTokensDetails
+    total_tokens: int
+
+
+class ResponseReasoningTextDeltaEvent(OpenAIBaseModel):
+    type: Literal[
+        "response.reasoning_text.delta"] = "response.reasoning_text.delta"
+    item_id: str = "item_1234"
+    output_index: int
+    content_index: int
+    delta: str
+    sequence_number: int = -1
+
+
+class ResponseReasoningTextDoneEvent(OpenAIBaseModel):
+    type: Literal[
+        "response.reasoning_text.done"] = "response.reasoning_text.done"
+    item_id: str = "item_1234"
+    output_index: int
+    content_index: int
+    text: str
+    sequence_number: int = -1
+
+
+class ResponseContentPartDoneEvent(OpenAIBaseModel):
+    type: Literal["response.content_part.done"] = "response.content_part.done"
+    item_id: str = "item_1234"
+    output_index: int
+    content_index: int
+    part: Union[ResponseOutputItem, ResponseReasoningItem]
+    sequence_number: int = -1
+
+
+class ResponseOutputItemDoneEvent(OpenAIBaseModel):
+    type: Literal["response.output_item.done"] = "response.output_item.done"
+    item_id: str = "item_1234"
+    output_index: int
+    item: Union[ResponseOutputItem, ResponseReasoningItem,
+                ResponseFunctionToolCallOutputItem]
+    sequence_number: int = -1
+
+
 class ResponsesResponse(OpenAIBaseModel):
     id: str = Field(default_factory=lambda: f"resp_{random_uuid()}")
     created_at: int = Field(default_factory=lambda: int(time.time()))
@@ -1757,7 +1812,7 @@ class ResponsesResponse(OpenAIBaseModel):
     text: Optional[ResponseTextConfig] = None
     top_logprobs: int
     truncation: Literal["auto", "disabled"]
-    usage: Optional[UsageInfo] = None
+    usage: Optional[ResponseUsage] = None
     user: Optional[str] = None
 
     @classmethod
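
As a rough picture of what these new payloads look like on the wire, the sketch below uses simplified pydantic stand-ins mirroring the fields added above; it does not import vLLM's actual protocol module, and the SSE framing shown is an assumption rather than something this diff specifies.

```python
from typing import Literal

from pydantic import BaseModel


# Simplified stand-in for the model added in protocol.py above.
class ResponseReasoningTextDeltaEvent(BaseModel):
    type: Literal[
        "response.reasoning_text.delta"] = "response.reasoning_text.delta"
    item_id: str = "item_1234"
    output_index: int
    content_index: int
    delta: str
    sequence_number: int = -1


event = ResponseReasoningTextDeltaEvent(output_index=0,
                                        content_index=0,
                                        delta="13 * 24 = ",
                                        sequence_number=7)
# One plausible server-sent-event framing of the payload:
print(f"event: {event.type}\ndata: {event.model_dump_json()}\n")
```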

vllm/entrypoints/openai/serving_engine.py

Lines changed: 3 additions & 0 deletions
@@ -990,6 +990,9 @@ async def _generate_with_builtin_tools(
         tool_output = await tool.get_result(context)
         context.append_output(tool_output)
 
+        # TODO: uncomment this and enable tool output streaming
+        # yield context
+
         # Create inputs for the next turn.
         # Render the next prompt token ids.
         prompt_token_ids = context.render_for_completion()
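
The commented-out `yield context` marks where tool turns would also be streamed. The surrounding loop is an async generator that yields the shared context after each model turn so the serving layer can translate it into events; the toy below sketches that turn-level pattern with made-up names and data, and is not vLLM's actual `_generate_with_builtin_tools`.

```python
import asyncio


async def generate_with_tools(turns):
    # Hypothetical stand-in for the engine loop: yield a snapshot of the
    # context after each model turn; the TODO above would extend the same
    # pattern to tool-output turns.
    context = []
    for kind, output in turns:
        context.append(output)
        if kind == "model":
            yield list(context)
        # elif kind == "tool":
        #     yield list(context)  # tool output streaming, once enabled


async def main():
    turns = [("model", "call python"), ("tool", "312"), ("model", "13*24=312")]
    async for snapshot in generate_with_tools(turns):
        print("streamed snapshot:", snapshot)


asyncio.run(main())
```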
