
Commit 65f6782 (parent: a27287a)

Support string reasoning_effort

Signed-off-by: Pengyun Lin <[email protected]>

4 files changed: +28 −15 lines


tensorrt_llm/serve/harmony_adapter.py

Lines changed: 16 additions & 5 deletions

@@ -6,7 +6,7 @@
 import time
 import traceback
 import uuid
-from typing import Any, AsyncGenerator
+from typing import Any, AsyncGenerator, Literal

 from openai_harmony import (Author, Conversation, DeveloperContent,
                             HarmonyEncodingName, HarmonyError, Message,
@@ -189,7 +189,6 @@ def _create_delta_from_parser_state(self) -> dict[str, Any] | None:
         # Send opening token if this is the first content in this channel
         if not self.channel_started:
             self.channel_started = True
-            # print(f"Request {self.request_id}: starting preamble channel")
             return {
                 "content":
                 f"<|channel|>commentary<|message|>{self.parser.last_content_delta}",
@@ -284,7 +283,7 @@ class HarmonyAdapter:
     - Commentary preamble: tool_calls: [] (empty array)
     - Commentary tool call: tool_calls: [...] (populated array)
     - Final content: no tool_calls field
-    - Analysis: reasoning_content field
+    - Analysis: reasoning field

     Parameters:
     - harmony_input: If True, expect harmony format input (no conversion)
@@ -978,8 +977,6 @@ def harmony_output_to_openai(
         msg_recipient = getattr(msg, 'recipient', None)
         msg_content = getattr(msg, 'content', [])

-        # print(f"DEBUG: Processing message - channel={msg_channel}, recipient={msg_recipient}, content={msg_content}")
-
         if msg_channel == "analysis":
             for content in msg_content:
                 if isinstance(content, TextContent):
@@ -1585,3 +1582,17 @@ def _create_usage_info(final_res: RequestOutput) -> UsageInfo:
         completion_tokens=num_generated_tokens,
         total_tokens=num_prompt_tokens + num_generated_tokens)
     return usage
+
+
+def maybe_transform_reasoning_effort(
+    reasoning_effort: ReasoningEffort | Literal["low", "medium", "high"] | None
+) -> ReasoningEffort | None:
+    str_to_effort = {
+        "low": ReasoningEffort.LOW,
+        "medium": ReasoningEffort.MEDIUM,
+        "high": ReasoningEffort.HIGH
+    }
+    if reasoning_effort and not isinstance(reasoning_effort, ReasoningEffort):
+        return str_to_effort[reasoning_effort]
+    else:
+        return reasoning_effort
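For reference, the new helper is intentionally permissive: only plain strings are looked up in the mapping, while enum values and None pass through untouched. A minimal usage sketch (assuming ReasoningEffort is the openai_harmony enum used elsewhere in this module):

from openai_harmony import ReasoningEffort

from tensorrt_llm.serve.harmony_adapter import maybe_transform_reasoning_effort

# Plain strings are mapped onto the enum...
assert maybe_transform_reasoning_effort("medium") is ReasoningEffort.MEDIUM
# ...while enum values and None pass through unchanged.
assert maybe_transform_reasoning_effort(ReasoningEffort.HIGH) is ReasoningEffort.HIGH
assert maybe_transform_reasoning_effort(None) is None

Note the truthiness check in the helper: any falsy input (None, but also an empty string) is returned as-is, whereas an unrecognized non-empty string such as "max" would raise a KeyError.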

tensorrt_llm/serve/openai_protocol.py

Lines changed: 8 additions & 6 deletions

@@ -507,12 +507,14 @@ class ChatCompletionRequest(OpenAIBaseModel):
     tool_choice: Optional[Union[Literal["none", "auto"],
                                 ChatCompletionNamedToolChoiceParam]] = "none"
     user: Optional[str] = None
-    reasoning_effort: Optional[ReasoningEffort] = Field(
-        default=ReasoningEffort.LOW,
-        description=("The level of reasoning effort to use. Controls how much "
-                     "reasoning is shown in the model's response. Options: "
-                     "'low', 'medium', 'high'."),
-    )
+    reasoning_effort: Optional[ReasoningEffort | Literal[
+        "low", "medium", "high"]] = Field(
+            default=ReasoningEffort.LOW,
+            description=(
+                "The level of reasoning effort to use. Controls how much "
+                "reasoning is shown in the model's response. Options: "
+                "'low', 'medium', 'high'."),
+        )

     # doc: begin-chat-completion-sampling-params
     best_of: Optional[int] = None
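With the widened annotation, the request schema validates both the enum and its lowercase string spelling; the string form is normalized later by maybe_transform_reasoning_effort. A hypothetical construction, assuming the usual OpenAI-style required fields (the model name here is a placeholder):

from openai_harmony import ReasoningEffort

from tensorrt_llm.serve.openai_protocol import ChatCompletionRequest

# Both spellings validate against the widened field.
req_str = ChatCompletionRequest(
    model="gpt-oss-120b",  # placeholder model name
    messages=[{"role": "user", "content": "Hi"}],
    reasoning_effort="medium",
)
req_enum = ChatCompletionRequest(
    model="gpt-oss-120b",
    messages=[{"role": "user", "content": "Hi"}],
    reasoning_effort=ReasoningEffort.MEDIUM,
)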

tensorrt_llm/serve/openai_server.py

Lines changed: 3 additions & 3 deletions

@@ -50,7 +50,8 @@

 from .._utils import nvtx_mark, set_prometheus_multiproc_dir
 from .harmony_adapter import (HarmonyAdapter, handle_non_streaming_response,
-                              handle_streaming_response)
+                              handle_streaming_response,
+                              maybe_transform_reasoning_effort)

 # yapf: enable
 TIMEOUT_KEEP_ALIVE = 5  # seconds.
@@ -678,8 +679,7 @@ async def chat_harmony(self, request: ChatCompletionRequest, raw_request: Request):
         tools_dict = [tool.model_dump() for tool in request.tools]

         # Reasoning effort precedence: request.reasoning_effort > system message parsing > serving default
-        reasoning_effort = request.reasoning_effort
-
+        reasoning_effort = maybe_transform_reasoning_effort(request.reasoning_effort)
         # Get tool_choice from request
         tool_choice = getattr(request, 'tool_choice', None)

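End to end, a deployed server now accepts the string form straight from the JSON body. A hypothetical request against a local trtllm-serve endpoint (URL and model name are placeholders):

import requests

payload = {
    "model": "gpt-oss-120b",  # placeholder
    "messages": [{"role": "user", "content": "Hello"}],
    "reasoning_effort": "high",  # plain string, normalized server-side
}
resp = requests.post("http://localhost:8000/v1/chat/completions",
                     json=payload, timeout=60)
print(resp.json()["choices"][0]["message"])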

tests/integration/test_lists/test-db/l0_h100.yml

Lines changed: 1 addition & 1 deletion

@@ -102,7 +102,7 @@ l0_h100:
   - test_e2e.py::test_trtllm_bench_request_rate_and_concurrency[enable_concurrency-]
   - test_e2e.py::test_trtllm_bench_request_rate_and_concurrency[enable_concurrency-enable_request_rate] # negative test
   - test_e2e.py::test_trtllm_bench_help_sanity[meta-llama/Llama-3.1-8B]
-  - test_ese.py::test_openai_chat_harmony
+  - test_e2e.py::test_openai_chat_harmony
   - test_e2e.py::test_ptp_quickstart_multimodal[gemma-3-27b-it-gemma/gemma-3-27b-it-image-True]
   # ------------- AutoDeploy tests ---------------
   - accuracy/test_llm_api_autodeploy.py::TestLlama3_1_8B::test_auto_dtype
