
Commit 65f6782 (parent: a27287a)

Support string reasoning_effort

Signed-off-by: Pengyun Lin <[email protected]>

4 files changed: +28 −15 lines


tensorrt_llm/serve/harmony_adapter.py

Lines changed: 16 additions & 5 deletions

@@ -6,7 +6,7 @@
 import time
 import traceback
 import uuid
-from typing import Any, AsyncGenerator
+from typing import Any, AsyncGenerator, Literal

 from openai_harmony import (Author, Conversation, DeveloperContent,
                             HarmonyEncodingName, HarmonyError, Message,
@@ -189,7 +189,6 @@ def _create_delta_from_parser_state(self) -> dict[str, Any] | None:
         # Send opening token if this is the first content in this channel
         if not self.channel_started:
             self.channel_started = True
-            # print(f"Request {self.request_id}: starting preamble channel")
             return {
                 "content":
                 f"<|channel|>commentary<|message|>{self.parser.last_content_delta}",
@@ -284,7 +283,7 @@ class HarmonyAdapter:
     - Commentary preamble: tool_calls: [] (empty array)
     - Commentary tool call: tool_calls: [...] (populated array)
     - Final content: no tool_calls field
-    - Analysis: reasoning_content field
+    - Analysis: reasoning field

     Parameters:
     - harmony_input: If True, expect harmony format input (no conversion)
@@ -978,8 +977,6 @@ def harmony_output_to_openai(
         msg_recipient = getattr(msg, 'recipient', None)
         msg_content = getattr(msg, 'content', [])

-        # print(f"DEBUG: Processing message - channel={msg_channel}, recipient={msg_recipient}, content={msg_content}")
-
         if msg_channel == "analysis":
             for content in msg_content:
                 if isinstance(content, TextContent):
@@ -1585,3 +1582,17 @@ def _create_usage_info(final_res: RequestOutput) -> UsageInfo:
         completion_tokens=num_generated_tokens,
         total_tokens=num_prompt_tokens + num_generated_tokens)
     return usage
+
+
+def maybe_transform_reasoning_effort(
+    reasoning_effort: ReasoningEffort | Literal["low", "medium", "high"] | None
+) -> ReasoningEffort | None:
+    str_to_effort = {
+        "low": ReasoningEffort.LOW,
+        "medium": ReasoningEffort.MEDIUM,
+        "high": ReasoningEffort.HIGH
+    }
+    if reasoning_effort and not isinstance(reasoning_effort, ReasoningEffort):
+        return str_to_effort[reasoning_effort]
+    else:
+        return reasoning_effort
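For reference, the new helper is intentionally permissive: only plain strings are looked up in the mapping, while enum values and None pass through untouched. A minimal usage sketch (assuming ReasoningEffort is the openai_harmony enum used elsewhere in this module):

from openai_harmony import ReasoningEffort

from tensorrt_llm.serve.harmony_adapter import maybe_transform_reasoning_effort

# Plain strings are mapped onto the enum...
assert maybe_transform_reasoning_effort("medium") is ReasoningEffort.MEDIUM
# ...while enum values and None pass through unchanged.
assert maybe_transform_reasoning_effort(ReasoningEffort.HIGH) is ReasoningEffort.HIGH
assert maybe_transform_reasoning_effort(None) is None

Note the truthiness check in the helper: any falsy input (None, but also an empty string) is returned as-is, whereas an unrecognized non-empty string such as "max" would raise a KeyError.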

tensorrt_llm/serve/openai_protocol.py

Lines changed: 8 additions & 6 deletions

@@ -507,12 +507,14 @@ class ChatCompletionRequest(OpenAIBaseModel):
     tool_choice: Optional[Union[Literal["none", "auto"],
                                 ChatCompletionNamedToolChoiceParam]] = "none"
     user: Optional[str] = None
-    reasoning_effort: Optional[ReasoningEffort] = Field(
-        default=ReasoningEffort.LOW,
-        description=("The level of reasoning effort to use. Controls how much "
-                     "reasoning is shown in the model's response. Options: "
-                     "'low', 'medium', 'high'."),
-    )
+    reasoning_effort: Optional[ReasoningEffort | Literal[
+        "low", "medium", "high"]] = Field(
+            default=ReasoningEffort.LOW,
+            description=(
+                "The level of reasoning effort to use. Controls how much "
+                "reasoning is shown in the model's response. Options: "
+                "'low', 'medium', 'high'."),
+        )

     # doc: begin-chat-completion-sampling-params
     best_of: Optional[int] = None
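With the widened annotation, the request schema validates both the enum and its lowercase string spelling; the string form is normalized later by maybe_transform_reasoning_effort. A hypothetical construction, assuming the usual OpenAI-style required fields (the model name here is a placeholder):

from openai_harmony import ReasoningEffort

from tensorrt_llm.serve.openai_protocol import ChatCompletionRequest

# Both spellings validate against the widened field.
req_str = ChatCompletionRequest(
    model="gpt-oss-120b",  # placeholder model name
    messages=[{"role": "user", "content": "Hi"}],
    reasoning_effort="medium",
)
req_enum = ChatCompletionRequest(
    model="gpt-oss-120b",
    messages=[{"role": "user", "content": "Hi"}],
    reasoning_effort=ReasoningEffort.MEDIUM,
)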

tensorrt_llm/serve/openai_server.py

Lines changed: 3 additions & 3 deletions

@@ -50,7 +50,8 @@

 from .._utils import nvtx_mark, set_prometheus_multiproc_dir
 from .harmony_adapter import (HarmonyAdapter, handle_non_streaming_response,
-                              handle_streaming_response)
+                              handle_streaming_response,
+                              maybe_transform_reasoning_effort)

 # yapf: enable
 TIMEOUT_KEEP_ALIVE = 5  # seconds.
@@ -678,8 +679,7 @@ async def chat_harmony(self, request: ChatCompletionRequest, raw_request: Request):
         tools_dict = [tool.model_dump() for tool in request.tools]

         # Reasoning effort precedence: request.reasoning_effort > system message parsing > serving default
-        reasoning_effort = request.reasoning_effort
-
+        reasoning_effort = maybe_transform_reasoning_effort(request.reasoning_effort)
         # Get tool_choice from request
         tool_choice = getattr(request, 'tool_choice', None)

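End to end, a deployed server now accepts the string form straight from the JSON body. A hypothetical request against a local trtllm-serve endpoint (URL and model name are placeholders):

import requests

payload = {
    "model": "gpt-oss-120b",  # placeholder
    "messages": [{"role": "user", "content": "Hello"}],
    "reasoning_effort": "high",  # plain string, normalized server-side
}
resp = requests.post("http://localhost:8000/v1/chat/completions",
                     json=payload, timeout=60)
print(resp.json()["choices"][0]["message"])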

tests/integration/test_lists/test-db/l0_h100.yml

Lines changed: 1 addition & 1 deletion

@@ -102,7 +102,7 @@ l0_h100:
   - test_e2e.py::test_trtllm_bench_request_rate_and_concurrency[enable_concurrency-]
   - test_e2e.py::test_trtllm_bench_request_rate_and_concurrency[enable_concurrency-enable_request_rate] # negative test
   - test_e2e.py::test_trtllm_bench_help_sanity[meta-llama/Llama-3.1-8B]
-  - test_ese.py::test_openai_chat_harmony
+  - test_e2e.py::test_openai_chat_harmony
   - test_e2e.py::test_ptp_quickstart_multimodal[gemma-3-27b-it-gemma/gemma-3-27b-it-image-True]
   # ------------- AutoDeploy tests ---------------
   - accuracy/test_llm_api_autodeploy.py::TestLlama3_1_8B::test_auto_dtype
