1 change: 0 additions & 1 deletion tests/integration/test_lists/waives.txt
@@ -43,7 +43,6 @@ examples/test_multimodal.py::test_llm_multimodal_general[video-neva-pp:1-tp:1-bf
examples/test_whisper.py::test_llm_whisper_general[large-v3-enable_gemm_plugin-enable_attention_plugin-disable_weight_only-float16-nb:1-use_python_runtime] SKIP (https://nvbugs/4866931)
examples/test_nemotron.py::test_llm_nemotron_3_8b_1gpu[bfloat16-fp8] SKIP (https://nvbugs/4961624)
examples/test_mistral.py::test_llm_mistral_v1_1gpu[mistral-7b-v0.1-float16-max_attention_window_size_4096-chunked_summarization_long] SKIP (https://nvbugs/5321371)
-test_e2e.py::test_openai_chat_structural_tag_example SKIP (https://nvbugspro.nvidia.com/bug/5375594)
cpp/test_e2e.py::test_model[fp8-chatglm-90] SKIP (https://nvbugs/5034830)
full:B200_PCIe/unittest/trt/functional SKIP (Disable for Blackwell)
full:B200_PCIe/unittest/trt/quantization SKIP (Disable for Blackwell)
6 changes: 1 addition & 5 deletions tests/unittest/llmapi/apps/_test_openai_chat_json.py
@@ -26,11 +26,7 @@ def temp_extra_llm_api_options_file(request):
temp_dir = tempfile.gettempdir()
temp_file_path = os.path.join(temp_dir, "extra_llm_api_options.yaml")
try:
-        extra_llm_api_options_dict = {
-            "guided_decoding_backend": "xgrammar",
-            "disable_overlap_scheduler":
-            True,  # Guided decoding is not supported with overlap scheduler
-        }
+        extra_llm_api_options_dict = {"guided_decoding_backend": "xgrammar"}

with open(temp_file_path, "w") as f:
yaml.dump(extra_llm_api_options_dict, f)
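With `disable_overlap_scheduler` gone (its old inline comment said guided decoding was unsupported with the overlap scheduler), the options file shrinks to a single key. A minimal sketch of what the fixture now writes to disk, assuming nothing beyond the standard yaml.dump behavior already used above:

# Sketch: contents of extra_llm_api_options.yaml after this change.
# Assumption: only the guided-decoding backend still needs to be set.
import os
import tempfile

import yaml

options = {"guided_decoding_backend": "xgrammar"}
path = os.path.join(tempfile.gettempdir(), "extra_llm_api_options.yaml")
with open(path, "w") as f:
    yaml.dump(options, f)
# The resulting file holds one line:
#   guided_decoding_backend: xgrammar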
54 changes: 34 additions & 20 deletions tests/unittest/llmapi/apps/_test_openai_chat_structural_tag.py
@@ -1,25 +1,28 @@
# Adapted from
# https://github.com/vllm-project/vllm/blob/aae6927be06dedbda39c6b0c30f6aa3242b84388/tests/entrypoints/openai/test_chat.py
+import json
import os
+import re
import tempfile

+import jsonschema
import openai
import pytest
import yaml

-from ..test_llm import get_model_path, similar
+from ..test_llm import get_model_path
from .openai_server import RemoteOpenAIServer

pytestmark = pytest.mark.threadleak(enabled=False)


@pytest.fixture(scope="module", ids=["TinyLlama-1.1B-Chat"])
@pytest.fixture(scope="module")
def model_name():
return "llama-3.1-model/Llama-3.1-8B-Instruct"


@pytest.fixture(scope="module")
-def temp_extra_llm_api_options_file(request):
+def temp_extra_llm_api_options_file():
temp_dir = tempfile.gettempdir()
temp_file_path = os.path.join(temp_dir, "extra_llm_api_options.yaml")
try:
@@ -37,7 +40,12 @@ def temp_extra_llm_api_options_file(request):
@pytest.fixture(scope="module")
def server(model_name: str, temp_extra_llm_api_options_file: str):
model_path = get_model_path(model_name)
args = ["--extra_llm_api_options", temp_extra_llm_api_options_file]

# Use small max_batch_size/max_seq_len/max_num_tokens to avoid OOM on A10/A30 GPUs.
args = [
"--max_batch_size=8", "--max_seq_len=1024", "--max_num_tokens=1024",
f"--extra_llm_api_options={temp_extra_llm_api_options_file}"
]
with RemoteOpenAIServer(model_path, args) as remote_server:
yield remote_server

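The server fixture only assembles flags; the elided `client` fixture presumably points an OpenAI client at the spawned server. A rough sketch of that connection, where the host, port, and API key are illustrative assumptions rather than values from this diff:

# Sketch: connecting to the fixture's server (base_url and api_key are assumed;
# the real client fixture may obtain them from RemoteOpenAIServer helpers).
import openai

client = openai.OpenAI(
    base_url="http://localhost:8000/v1",  # assumed local endpoint
    api_key="tensorrt_llm",               # placeholder; typically unused by local test servers
)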
@@ -112,12 +120,7 @@ def tool_get_current_date():

def test_chat_structural_tag(client: openai.OpenAI, model_name: str,
tool_get_current_weather, tool_get_current_date):
-    messages = [
-        {
-            "role":
-            "system",
-            "content":
-            f"""
+    system_prompt = f"""
# Tool Instructions
- Always execute python code in messages that you share.
- When looking for real time information use relevant functions if available else fallback to brave_search
@@ -140,20 +143,24 @@ def test_chat_structural_tag(client: openai.OpenAI, model_name: str,
- Only call one function at a time
- Put the entire function call reply on one line
- Always add your sources when using search results to answer the user query
You are a helpful assistant.""",
You are a helpful assistant."""
user_prompt = "You are in New York. Please get the current date and time, and the weather."

messages = [
{
"role": "system",
"content": system_prompt,
},
{
"role":
"user",
"content":
"You are in New York. Please get the current date and time, and the weather.",
"role": "user",
"content": user_prompt,
},
]

chat_completion = client.chat.completions.create(
model=model_name,
messages=messages,
-        max_completion_tokens=100,
+        max_completion_tokens=256,
response_format={
"type":
"structural_tag",
@@ -173,11 +180,18 @@ def test_chat_structural_tag(client: openai.OpenAI, model_name: str,
"triggers": ["<function="],
},
)
assert chat_completion.id is not None
assert len(chat_completion.choices) == 1

message = chat_completion.choices[0].message
assert message.content is not None
assert message.role == "assistant"

-    reference = '<function=get_current_date>{"timezone": "America/New_York"}</function>\n<function=get_current_weather>{"city": "New York", "state": "NY", "unit": "fahrenheit"}</function>\n\nSources:\n- get_current_date function\n- get_current_weather function'
-    assert similar(chat_completion.choices[0].message.content, reference)
+    match = re.search(r'<function=get_current_weather>([\S\s]+?)</function>',
+                      message.content)
+    params = json.loads(match.group(1))
+    jsonschema.validate(params,
+                        tool_get_current_weather["function"]["parameters"])
+
+    match = re.search(r'<function=get_current_date>([\S\s]+?)</function>',
+                      message.content)
+    params = json.loads(match.group(1))
+    jsonschema.validate(params, tool_get_current_date["function"]["parameters"])
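The rewritten assertions check structure rather than exact wording: each `<function=...>...</function>` span must parse as JSON and satisfy the corresponding tool's parameter schema, which is far less brittle than the old `similar()` comparison against a fixed reference string. A standalone sketch of the same pattern on a hand-written sample (the sample text and the abbreviated schema are illustrative; the real schema lives in the elided tool_get_current_weather fixture):

# Sketch: extract a structural-tag function call and validate its arguments.
import json
import re

import jsonschema

sample = ('<function=get_current_weather>'
          '{"city": "New York", "state": "NY", "unit": "fahrenheit"}'
          '</function>')

weather_schema = {  # assumed shape of the tool's parameter schema
    "type": "object",
    "properties": {
        "city": {"type": "string"},
        "state": {"type": "string"},
        "unit": {"type": "string", "enum": ["celsius", "fahrenheit"]},
    },
    "required": ["city", "state", "unit"],
}

match = re.search(r'<function=get_current_weather>([\S\s]+?)</function>', sample)
params = json.loads(match.group(1))          # the tag body must be valid JSON
jsonschema.validate(params, weather_schema)  # and must satisfy the schema

jsonschema.validate raises on a mismatch and returns None on success, so reaching the end of the test means every extracted call was well-formed.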