From 7fe0d40de4fc61810afbf7e16da988c33a8a2bd6 Mon Sep 17 00:00:00 2001
From: nv-guomingz <137257613+nv-guomingz@users.noreply.github.com>
Date: Tue, 12 Aug 2025 17:13:43 +0000
Subject: [PATCH 1/5] [https://nvbugs/5375594][fix] fix oom issue on
structural_tag test case
Signed-off-by: nv-guomingz <137257613+nv-guomingz@users.noreply.github.com>
---
tests/integration/test_lists/waives.txt | 1 -
.../unittest/llmapi/apps/_test_openai_chat_structural_tag.py | 5 ++++-
2 files changed, 4 insertions(+), 2 deletions(-)
diff --git a/tests/integration/test_lists/waives.txt b/tests/integration/test_lists/waives.txt
index 2e0037bcfc5..3597a6baa3e 100644
--- a/tests/integration/test_lists/waives.txt
+++ b/tests/integration/test_lists/waives.txt
@@ -43,7 +43,6 @@ examples/test_multimodal.py::test_llm_multimodal_general[video-neva-pp:1-tp:1-bf
examples/test_whisper.py::test_llm_whisper_general[large-v3-enable_gemm_plugin-enable_attention_plugin-disable_weight_only-float16-nb:1-use_python_runtime] SKIP (https://nvbugs/4866931)
examples/test_nemotron.py::test_llm_nemotron_3_8b_1gpu[bfloat16-fp8] SKIP (https://nvbugs/4961624)
examples/test_mistral.py::test_llm_mistral_v1_1gpu[mistral-7b-v0.1-float16-max_attention_window_size_4096-chunked_summarization_long] SKIP (https://nvbugs/5321371)
-test_e2e.py::test_openai_chat_structural_tag_example SKIP (https://nvbugspro.nvidia.com/bug/5375594)
cpp/test_e2e.py::test_model[fp8-chatglm-90] SKIP (https://nvbugs/5034830)
full:B200_PCIe/unittest/trt/functional SKIP (Disable for Blackwell)
full:B200_PCIe/unittest/trt/quantization SKIP (Disable for Blackwell)
diff --git a/tests/unittest/llmapi/apps/_test_openai_chat_structural_tag.py b/tests/unittest/llmapi/apps/_test_openai_chat_structural_tag.py
index e3411404947..e1bf48f31c6 100644
--- a/tests/unittest/llmapi/apps/_test_openai_chat_structural_tag.py
+++ b/tests/unittest/llmapi/apps/_test_openai_chat_structural_tag.py
@@ -23,7 +23,10 @@ def temp_extra_llm_api_options_file(request):
temp_dir = tempfile.gettempdir()
temp_file_path = os.path.join(temp_dir, "extra_llm_api_options.yaml")
try:
- extra_llm_api_options_dict = {"guided_decoding_backend": "xgrammar"}
+ extra_llm_api_options_dict = {
+ "guided_decoding_backend": "xgrammar",
+ "max_batch_size": 32
+ }
with open(temp_file_path, 'w') as f:
yaml.dump(extra_llm_api_options_dict, f)
From 0e2a5c13efe0966bcb7483765ff79e03cab93872 Mon Sep 17 00:00:00 2001
From: Enwei Zhu <21126786+syuoni@users.noreply.github.com>
Date: Wed, 13 Aug 2025 02:07:07 +0000
Subject: [PATCH 2/5] fix test
Signed-off-by: Enwei Zhu <21126786+syuoni@users.noreply.github.com>
---
.../apps/_test_openai_chat_structural_tag.py | 56 +++++++++++--------
1 file changed, 33 insertions(+), 23 deletions(-)
diff --git a/tests/unittest/llmapi/apps/_test_openai_chat_structural_tag.py b/tests/unittest/llmapi/apps/_test_openai_chat_structural_tag.py
index e1bf48f31c6..c35aa182f00 100644
--- a/tests/unittest/llmapi/apps/_test_openai_chat_structural_tag.py
+++ b/tests/unittest/llmapi/apps/_test_openai_chat_structural_tag.py
@@ -1,32 +1,32 @@
# Adapted from
# https://github.com/vllm-project/vllm/blob/aae6927be06dedbda39c6b0c30f6aa3242b84388/tests/entrypoints/openai/test_chat.py
+import json
import os
+import re
import tempfile
+import jsonschema
import openai
import pytest
import yaml
-from ..test_llm import get_model_path, similar
+from ..test_llm import get_model_path
from .openai_server import RemoteOpenAIServer
pytestmark = pytest.mark.threadleak(enabled=False)
-@pytest.fixture(scope="module", ids=["TinyLlama-1.1B-Chat"])
+@pytest.fixture(scope="module")
def model_name():
return "llama-3.1-model/Llama-3.1-8B-Instruct"
@pytest.fixture(scope="module")
-def temp_extra_llm_api_options_file(request):
+def temp_extra_llm_api_options_file():
temp_dir = tempfile.gettempdir()
temp_file_path = os.path.join(temp_dir, "extra_llm_api_options.yaml")
try:
- extra_llm_api_options_dict = {
- "guided_decoding_backend": "xgrammar",
- "max_batch_size": 32
- }
+ extra_llm_api_options_dict = {"guided_decoding_backend": "xgrammar"}
with open(temp_file_path, 'w') as f:
yaml.dump(extra_llm_api_options_dict, f)
@@ -40,7 +40,11 @@ def temp_extra_llm_api_options_file(request):
@pytest.fixture(scope="module")
def server(model_name: str, temp_extra_llm_api_options_file: str):
model_path = get_model_path(model_name)
- args = ["--extra_llm_api_options", temp_extra_llm_api_options_file]
+
+ args = [
+ "--max_batch_size=32",
+ f"--extra_llm_api_options={temp_extra_llm_api_options_file}"
+ ]
with RemoteOpenAIServer(model_path, args) as remote_server:
yield remote_server
@@ -115,12 +119,7 @@ def tool_get_current_date():
def test_chat_structural_tag(client: openai.OpenAI, model_name: str,
tool_get_current_weather, tool_get_current_date):
- messages = [
- {
- "role":
- "system",
- "content":
- f"""
+ system_prompt = f"""
# Tool Instructions
- Always execute python code in messages that you share.
- When looking for real time information use relevant functions if available else fallback to brave_search
@@ -143,13 +142,17 @@ def test_chat_structural_tag(client: openai.OpenAI, model_name: str,
- Only call one function at a time
- Put the entire function call reply on one line
- Always add your sources when using search results to answer the user query
-You are a helpful assistant.""",
+You are a helpful assistant."""
+ user_prompt = "You are in New York. Please get the current date and time, and the weather."
+
+ messages = [
+ {
+ "role": "system",
+ "content": system_prompt,
},
{
- "role":
- "user",
- "content":
- "You are in New York. Please get the current date and time, and the weather.",
+ "role": "user",
+ "content": user_prompt,
},
]
@@ -176,11 +179,18 @@ def test_chat_structural_tag(client: openai.OpenAI, model_name: str,
                    "triggers": ["<function="],
+
+    match = re.search(r'<function=get_current_weather>([\S\s]+?)</function>',
+                      message.content)
+ params = json.loads(match.group(1))
+ jsonschema.validate(params,
+ tool_get_current_weather["function"]["parameters"])
+
+    match = re.search(r'<function=get_current_date>([\S\s]+?)</function>',
+                      message.content)
+ params = json.loads(match.group(1))
+ jsonschema.validate(params, tool_get_current_date["function"]["parameters"])
From 671ac9bd3442ac82027742e0317228a8db10aba2 Mon Sep 17 00:00:00 2001
From: Enwei Zhu <21126786+syuoni@users.noreply.github.com>
Date: Wed, 13 Aug 2025 02:17:03 +0000
Subject: [PATCH 3/5] add req
Signed-off-by: Enwei Zhu <21126786+syuoni@users.noreply.github.com>
---
requirements-dev.txt | 1 +
1 file changed, 1 insertion(+)
diff --git a/requirements-dev.txt b/requirements-dev.txt
index c8293761eaa..986026e5303 100644
--- a/requirements-dev.txt
+++ b/requirements-dev.txt
@@ -24,6 +24,7 @@ cloudpickle
typing-extensions==4.12.2
bandit==1.7.7
jsonlines==4.0.0
+jsonschema
jieba==0.42.1
rouge==1.0.1
pytest-rerunfailures
From c597d603dc78e5e51a53cf6a06cb8db0b8dab9fb Mon Sep 17 00:00:00 2001
From: Enwei Zhu <21126786+syuoni@users.noreply.github.com>
Date: Wed, 13 Aug 2025 06:05:05 +0000
Subject: [PATCH 4/5] fix oom
Signed-off-by: Enwei Zhu <21126786+syuoni@users.noreply.github.com>
---
.../unittest/llmapi/apps/_test_openai_chat_structural_tag.py | 5 +++--
1 file changed, 3 insertions(+), 2 deletions(-)
diff --git a/tests/unittest/llmapi/apps/_test_openai_chat_structural_tag.py b/tests/unittest/llmapi/apps/_test_openai_chat_structural_tag.py
index c35aa182f00..022b5a89863 100644
--- a/tests/unittest/llmapi/apps/_test_openai_chat_structural_tag.py
+++ b/tests/unittest/llmapi/apps/_test_openai_chat_structural_tag.py
@@ -41,8 +41,9 @@ def temp_extra_llm_api_options_file():
def server(model_name: str, temp_extra_llm_api_options_file: str):
model_path = get_model_path(model_name)
+ # Use small max_batch_size/max_seq_len/max_num_tokens to avoid OOM on A10/A30 GPUs.
args = [
- "--max_batch_size=32",
+ "--max_batch_size=8", "--max_seq_len=1024", "--max_num_tokens=1024",
f"--extra_llm_api_options={temp_extra_llm_api_options_file}"
]
with RemoteOpenAIServer(model_path, args) as remote_server:
@@ -159,7 +160,7 @@ def test_chat_structural_tag(client: openai.OpenAI, model_name: str,
chat_completion = client.chat.completions.create(
model=model_name,
messages=messages,
- max_completion_tokens=100,
+ max_completion_tokens=256,
response_format={
"type":
"structural_tag",
From 64123876dace0aabe2ef681baa630cf0c39294db Mon Sep 17 00:00:00 2001
From: Enwei Zhu <21126786+syuoni@users.noreply.github.com>
Date: Wed, 13 Aug 2025 07:34:52 +0000
Subject: [PATCH 5/5] clean
Signed-off-by: Enwei Zhu <21126786+syuoni@users.noreply.github.com>
---
requirements-dev.txt | 1 -
tests/unittest/llmapi/apps/_test_openai_chat_json.py | 6 +-----
2 files changed, 1 insertion(+), 6 deletions(-)
diff --git a/requirements-dev.txt b/requirements-dev.txt
index 986026e5303..c8293761eaa 100644
--- a/requirements-dev.txt
+++ b/requirements-dev.txt
@@ -24,7 +24,6 @@ cloudpickle
typing-extensions==4.12.2
bandit==1.7.7
jsonlines==4.0.0
-jsonschema
jieba==0.42.1
rouge==1.0.1
pytest-rerunfailures
diff --git a/tests/unittest/llmapi/apps/_test_openai_chat_json.py b/tests/unittest/llmapi/apps/_test_openai_chat_json.py
index a444b5566b8..53651828507 100644
--- a/tests/unittest/llmapi/apps/_test_openai_chat_json.py
+++ b/tests/unittest/llmapi/apps/_test_openai_chat_json.py
@@ -26,11 +26,7 @@ def temp_extra_llm_api_options_file(request):
temp_dir = tempfile.gettempdir()
temp_file_path = os.path.join(temp_dir, "extra_llm_api_options.yaml")
try:
- extra_llm_api_options_dict = {
- "guided_decoding_backend": "xgrammar",
- "disable_overlap_scheduler":
- True, # Guided decoding is not supported with overlap scheduler
- }
+ extra_llm_api_options_dict = {"guided_decoding_backend": "xgrammar"}
with open(temp_file_path, "w") as f:
yaml.dump(extra_llm_api_options_dict, f)