Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions tests/integration/defs/accuracy/references/cnn_dailymail.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -251,6 +251,8 @@ Qwen/Qwen2-0.5B-Instruct:
accuracy: 30.930
- quant_algo: FP8
accuracy: 31.140
Qwen/Qwen2-1.5B:
- accuracy: 32.58
Qwen/Qwen2-7B-Instruct:
- accuracy: 36.148
- quant_algo: W8A16
Expand All @@ -275,6 +277,9 @@ Qwen/Qwen2.5-7B-Instruct:
- accuracy: 33.014
- quant_algo: FP8
accuracy: 33.248
- quant_algo: FP8
kv_cache_quant_algo: FP8
accuracy: 33.248
nvidia/Nemotron-Mini-4B-Instruct:
- quant_algo: FP8
accuracy: 25.247
Expand Down
3 changes: 3 additions & 0 deletions tests/integration/defs/accuracy/references/mmlu.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,9 @@ Qwen/Qwen2.5-7B-Instruct:
- accuracy: 75.32
- quant_algo: FP8
accuracy: 75.32
- quant_algo: FP8
kv_cache_quant_algo: FP8
accuracy: 75.32
deepseek-ai/DeepSeek-V3-Lite:
- accuracy: 71.40
- quant_algo: NVFP4
Expand Down
11 changes: 11 additions & 0 deletions tests/integration/defs/accuracy/test_cli_flow.py
Original file line number Diff line number Diff line change
Expand Up @@ -1148,6 +1148,17 @@ def test_fp8(self):
quant_algo=QuantAlgo.FP8)


class TestQwen2_1_5B(CliFlowAccuracyTestHarness):
    """CLI-flow accuracy tests for the Qwen/Qwen2-1.5B checkpoint."""

    # Key under which this model's reference accuracies are looked up.
    MODEL_NAME = "Qwen/Qwen2-1.5B"
    # Checkpoint location, resolved relative to the shared models root.
    MODEL_PATH = f"{llm_models_root()}/Qwen2-1.5B"
    # Example folder handed to the harness for this model family.
    EXAMPLE_FOLDER = "models/core/qwen"

    @pytest.mark.skip_less_device(4)
    def test_auto_dtype_cp4(self):
        """Run the harness with dtype 'auto' and cp_size=4.

        RCCA: https://nvbugs/5170106
        """
        run_options = {"dtype": "auto", "cp_size": 4}
        self.run(**run_options)


class TestQwen2_7BInstruct(CliFlowAccuracyTestHarness):
MODEL_NAME = "Qwen/Qwen2-7B-Instruct"
MODEL_PATH = f"{llm_models_root()}/Qwen2-7B-Instruct"
Expand Down
13 changes: 13 additions & 0 deletions tests/integration/defs/accuracy/test_llm_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -277,3 +277,16 @@ def test_fp8(self):
extra_evaluator_kwargs=self.EXTRA_EVALUATOR_KWARGS)
task = MMLU(self.MODEL_NAME)
task.evaluate(llm)

@pytest.mark.skip(reason="https://nvbugs/5280461")
@skip_pre_ada
def test_fp8_kvcache(self):
    """Evaluate accuracy with FP8 quantization and an FP8 KV cache.

    RCCA: https://nvbugs/5065080

    The test is currently skipped unconditionally; see the bug linked in
    the skip reason.
    """
    fp8_config = QuantConfig(QuantAlgo.FP8,
                             kv_cache_quant_algo=QuantAlgo.FP8)
    with LLM(self.MODEL_PATH, quant_config=fp8_config) as engine:
        # CNN/DailyMail is evaluated with the class-level evaluator kwargs.
        CnnDailymail(self.MODEL_NAME).evaluate(
            engine, extra_evaluator_kwargs=self.EXTRA_EVALUATOR_KWARGS)
        # MMLU is evaluated on the same LLM; no extra evaluator kwargs are
        # passed here.
        MMLU(self.MODEL_NAME).evaluate(engine)
61 changes: 61 additions & 0 deletions tests/integration/defs/test_e2e.py
Original file line number Diff line number Diff line change
Expand Up @@ -319,6 +319,67 @@ def test_mistral_e2e(llama_example_root, llama_tokenizer_model_root, llm_venv,
venv_check_call(llm_venv, run_cmd)


@pytest.mark.parametrize("model_name,model_path", [
    ("DeepSeek-R1-Distill-Qwen-1.5B", "DeepSeek-R1-Distill-Qwen-1.5B"),
])
def test_qwen_e2e_cpprunner_large_new_tokens(model_name, model_path, llm_venv,
                                             qwen_example_root, cmodel_dir,
                                             engine_dir):
    """Check that ModelRunnerCpp reports no zero-length output sequences.

    RCCA: https://nvbugs/5238105

    The test converts the checkpoint, builds an engine with
    ``trtllm-build``, then generates four return sequences for a single
    prompt with ``max_new_tokens=1024`` and asserts that none of the
    reported sequence lengths is zero.

    Args:
        model_name: Model name passed to ``convert_weights``.
        model_path: Directory name of the checkpoint under
            ``llm_models_root()``.
        llm_venv: Test virtual environment; its ``_new_env`` is used as the
            environment of the build command.
        qwen_example_root: Example root passed to ``convert_weights``.
        cmodel_dir: Directory passed to ``convert_weights`` for the
            converted checkpoint.
        engine_dir: Directory the engine is built into and loaded from.
    """
    model_dir = convert_weights(
        llm_venv=llm_venv,
        example_root=qwen_example_root,
        cmodel_dir=cmodel_dir,
        model=model_name,
        model_path=f"{llm_models_root()}/{model_path}",
    )

    build_cmd = [
        "trtllm-build",
        f"--checkpoint_dir={model_dir}",
        f"--output_dir={engine_dir}",
        "--gemm_plugin=float16",
        "--max_num_tokens=32768",
    ]

    check_call(" ".join(build_cmd), shell=True, env=llm_venv._new_env)

    from transformers import AutoTokenizer

    from tensorrt_llm.runtime import PYTHON_BINDINGS

    # ModelRunnerCpp can only be imported when the Python bindings are
    # present. Report a skip instead of failing on a missing name or
    # passing without having exercised the runner.
    if not PYTHON_BINDINGS:
        pytest.skip("ModelRunnerCpp requires the TensorRT-LLM Python bindings")

    from tensorrt_llm.runtime import ModelRunnerCpp

    tokenizer = AutoTokenizer.from_pretrained(
        f"{llm_models_root()}/{model_path}",
        trust_remote_code=True,
        use_fast=False)

    message = r"<|begin▁of▁sentence|><|User|>The operation $\otimes$ is defined for all nonzero numbers by $a \otimes b = \frac{a^{2}}{b}$. Determine $[(1 \otimes 2) \otimes 3] - [1 \otimes (2 \otimes 3)]$. Let's think step by step and output the final answer within \boxed{}.<|Assistant|>"

    # The prompt already spells out its special tokens, so the tokenizer is
    # told not to add any.
    inputs = tokenizer(message, return_tensors='pt',
                       add_special_tokens=False)['input_ids']

    runner = ModelRunnerCpp.from_dir(engine_dir=f"{engine_dir}",
                                     max_input_len=128,
                                     max_output_len=4096,
                                     max_batch_size=8)

    outputs = runner.generate(inputs,
                              end_id=tokenizer.eos_token_id,
                              pad_id=tokenizer.pad_token_id,
                              temperature=0.6,
                              top_p=1.0,
                              top_k=1024,
                              max_new_tokens=1024,
                              return_dict=True,
                              min_length=1,
                              num_return_sequences=4,
                              output_sequence_lengths=True)

    # Every returned sequence must report a non-zero length.
    seq_lengths = outputs['sequence_lengths']
    assert not (seq_lengths == 0).any(
    ), f"Found zero length in sequence_lengths tensor: {seq_lengths}"


def trtllm_bench_prolog(
llm_root,
llm_venv,
Expand Down
3 changes: 3 additions & 0 deletions tests/integration/test_lists/qa/examples_test_list.txt
Original file line number Diff line number Diff line change
Expand Up @@ -402,6 +402,7 @@ accuracy/test_cli_flow.py::TestQwen1_5MoeA2_7BChat::test_weight_only
accuracy/test_cli_flow.py::TestQwen2_0_5BInstruct::test_auto_dtype
accuracy/test_cli_flow.py::TestQwen2_0_5BInstruct::test_weight_only
accuracy/test_cli_flow.py::TestQwen2_0_5BInstruct::test_fp8
accuracy/test_cli_flow.py::TestQwen2_1_5B::test_auto_dtype_cp4
accuracy/test_llm_api.py::TestQwen2_7BInstruct::test_auto_dtype
accuracy/test_llm_api.py::TestQwen2_7BInstruct::test_weight_only
accuracy/test_cli_flow.py::TestQwen2_7BInstruct::test_int4_awq_prequantized
Expand All @@ -414,6 +415,7 @@ accuracy/test_llm_api.py::TestQwen2_7BInstruct::test_fp8
accuracy/test_llm_api.py::TestQwen2_5_0_5BInstruct::test_fp8
accuracy/test_llm_api.py::TestQwen2_5_1_5BInstruct::test_fp8
accuracy/test_llm_api.py::TestQwen2_5_7BInstruct::test_fp8
accuracy/test_llm_api.py::TestQwen2_5_7BInstruct::test_fp8_kvcache
accuracy/test_llm_api.py::TestMistral7B_0_3::test_quant_tp4[int4]
accuracy/test_llm_api.py::TestMistral7B_0_3::test_quant_tp4[int4_awq]
accuracy/test_llm_api.py::TestMistral7B_0_3::test_quant_tp4[int8_awq]
Expand Down Expand Up @@ -461,6 +463,7 @@ llmapi/test_llm_e2e.py::test_llmapi_load_engine_from_build_command[llama-llama-m
test_e2e.py::test_mistral_e2e[use_cpp_session-remove_input_padding--]
test_e2e.py::test_mistral_e2e[use_py_session-remove_input_padding--]
test_e2e.py::test_mistral_e2e[use_py_session---]
test_e2e.py::test_qwen_e2e_cpprunner_large_new_tokens[DeepSeek-R1-Distill-Qwen-1.5B-DeepSeek-R1-Distill-Qwen-1.5B]
test_e2e.py::test_openai_multi_chat_example
test_e2e.py::test_openai_consistent_chat
llmapi/test_llm_examples.py::test_llmapi_server_example
Expand Down