diff --git a/tests/integration/defs/accuracy/references/cnn_dailymail.yaml b/tests/integration/defs/accuracy/references/cnn_dailymail.yaml
index 5a69dfd884c..df5f1d7f3d2 100644
--- a/tests/integration/defs/accuracy/references/cnn_dailymail.yaml
+++ b/tests/integration/defs/accuracy/references/cnn_dailymail.yaml
@@ -251,6 +251,8 @@ Qwen/Qwen2-0.5B-Instruct:
     accuracy: 30.930
   - quant_algo: FP8
     accuracy: 31.140
+Qwen/Qwen2-1.5B:
+  - accuracy: 32.58
 Qwen/Qwen2-7B-Instruct:
   - accuracy: 36.148
   - quant_algo: W8A16
@@ -275,6 +277,9 @@ Qwen/Qwen2.5-7B-Instruct:
   - accuracy: 33.014
   - quant_algo: FP8
     accuracy: 33.248
+  - quant_algo: FP8
+    kv_cache_quant_algo: FP8
+    accuracy: 33.248
 nvidia/Nemotron-Mini-4B-Instruct:
   - quant_algo: FP8
     accuracy: 25.247
diff --git a/tests/integration/defs/accuracy/references/mmlu.yaml b/tests/integration/defs/accuracy/references/mmlu.yaml
index 999819c328d..6e2d759a3be 100644
--- a/tests/integration/defs/accuracy/references/mmlu.yaml
+++ b/tests/integration/defs/accuracy/references/mmlu.yaml
@@ -75,6 +75,9 @@ Qwen/Qwen2.5-7B-Instruct:
   - accuracy: 75.32
   - quant_algo: FP8
     accuracy: 75.32
+  - quant_algo: FP8
+    kv_cache_quant_algo: FP8
+    accuracy: 75.32
 deepseek-ai/DeepSeek-V3-Lite:
   - accuracy: 71.40
   - quant_algo: NVFP4
diff --git a/tests/integration/defs/accuracy/test_cli_flow.py b/tests/integration/defs/accuracy/test_cli_flow.py
index cae3e823269..881e83f3755 100644
--- a/tests/integration/defs/accuracy/test_cli_flow.py
+++ b/tests/integration/defs/accuracy/test_cli_flow.py
@@ -1148,6 +1148,17 @@ def test_fp8(self):
                  quant_algo=QuantAlgo.FP8)
 
 
+class TestQwen2_1_5B(CliFlowAccuracyTestHarness):
+    MODEL_NAME = "Qwen/Qwen2-1.5B"
+    MODEL_PATH = f"{llm_models_root()}/Qwen2-1.5B"
+    EXAMPLE_FOLDER = "models/core/qwen"
+
+    @pytest.mark.skip_less_device(4)
+    def test_auto_dtype_cp4(self):
+        """Auto dtype with cp_size=4. RCCA: https://nvbugs/5170106"""
+        self.run(dtype='auto', cp_size=4)
+
+
 class TestQwen2_7BInstruct(CliFlowAccuracyTestHarness):
     MODEL_NAME = "Qwen/Qwen2-7B-Instruct"
     MODEL_PATH = f"{llm_models_root()}/Qwen2-7B-Instruct"
diff --git a/tests/integration/defs/accuracy/test_llm_api.py b/tests/integration/defs/accuracy/test_llm_api.py
index f9387c3fc68..d97f518616e 100644
--- a/tests/integration/defs/accuracy/test_llm_api.py
+++ b/tests/integration/defs/accuracy/test_llm_api.py
@@ -277,3 +277,16 @@ def test_fp8(self):
                           extra_evaluator_kwargs=self.EXTRA_EVALUATOR_KWARGS)
             task = MMLU(self.MODEL_NAME)
             task.evaluate(llm)
+
+    @pytest.mark.skip(reason="https://nvbugs/5280461")
+    @skip_pre_ada
+    def test_fp8_kvcache(self):
+        """FP8 with an FP8 KV cache. RCCA: https://nvbugs/5065080"""
+        quant_config = QuantConfig(QuantAlgo.FP8,
+                                   kv_cache_quant_algo=QuantAlgo.FP8)
+        with LLM(self.MODEL_PATH, quant_config=quant_config) as llm:
+            task = CnnDailymail(self.MODEL_NAME)
+            task.evaluate(llm,
+                          extra_evaluator_kwargs=self.EXTRA_EVALUATOR_KWARGS)
+            task = MMLU(self.MODEL_NAME)
+            task.evaluate(llm)
diff --git a/tests/integration/defs/test_e2e.py b/tests/integration/defs/test_e2e.py
index e88ec422b8d..32c676283a0 100644
--- a/tests/integration/defs/test_e2e.py
+++ b/tests/integration/defs/test_e2e.py
@@ -319,6 +319,78 @@ def test_mistral_e2e(llama_example_root, llama_tokenizer_model_root, llm_venv,
     venv_check_call(llm_venv, run_cmd)
 
 
+@pytest.mark.parametrize("model_name,model_path", [
+    ("DeepSeek-R1-Distill-Qwen-1.5B", "DeepSeek-R1-Distill-Qwen-1.5B"),
+])
+def test_qwen_e2e_cpprunner_large_new_tokens(model_name, model_path, llm_venv,
+                                             qwen_example_root, cmodel_dir,
+                                             engine_dir):
+    """Check that ModelRunnerCpp never reports a zero-length sequence.
+
+    Converts the checkpoint, builds an engine with trtllm-build, then samples
+    four return sequences for one prompt through the C++ runner.
+
+    RCCA: https://nvbugs/5238105
+    """
+    from tensorrt_llm.runtime import PYTHON_BINDINGS
+
+    # Skip explicitly, before any conversion or build work, rather than
+    # reaching ModelRunnerCpp without the bindings its import is gated on.
+    if not PYTHON_BINDINGS:
+        pytest.skip("ModelRunnerCpp requires the python bindings")
+
+    from transformers import AutoTokenizer
+
+    from tensorrt_llm.runtime import ModelRunnerCpp
+
+    model_dir = convert_weights(
+        llm_venv=llm_venv,
+        example_root=qwen_example_root,
+        cmodel_dir=cmodel_dir,
+        model=model_name,
+        model_path=f"{llm_models_root()}/{model_path}",
+    )
+
+    build_cmd = [
+        "trtllm-build", f"--checkpoint_dir={model_dir}",
+        f"--output_dir={engine_dir}", "--gemm_plugin=float16",
+        "--max_num_tokens=32768"
+    ]
+
+    check_call(" ".join(build_cmd), shell=True, env=llm_venv._new_env)
+
+    tokenizer = AutoTokenizer.from_pretrained(
+        f"{llm_models_root()}/{model_path}",
+        trust_remote_code=True,
+        use_fast=False)
+
+    message = r"<|begin▁of▁sentence|><|User|>The operation $\otimes$ is defined for all nonzero numbers by $a \otimes b = \frac{a^{2}}{b}$. Determine $[(1 \otimes 2) \otimes 3] - [1 \otimes (2 \otimes 3)]$. Let's think step by step and output the final answer within \boxed{}.<|Assistant|>"
+
+    inputs = tokenizer(message, return_tensors='pt',
+                       add_special_tokens=False)['input_ids']
+
+    runner = ModelRunnerCpp.from_dir(engine_dir=str(engine_dir),
+                                     max_input_len=128,
+                                     max_output_len=4096,
+                                     max_batch_size=8)
+
+    outputs = runner.generate(inputs,
+                              end_id=tokenizer.eos_token_id,
+                              pad_id=tokenizer.pad_token_id,
+                              temperature=0.6,
+                              top_p=1.0,
+                              top_k=1024,
+                              max_new_tokens=1024,
+                              return_dict=True,
+                              min_length=1,
+                              num_return_sequences=4,
+                              output_sequence_lengths=True)
+
+    seq_lengths = outputs['sequence_lengths']
+    assert not (seq_lengths == 0).any(
+    ), f"Found zero length in sequence_lengths tensor: {seq_lengths}"
+
+
 def trtllm_bench_prolog(
         llm_root,
         llm_venv,
diff --git a/tests/integration/test_lists/qa/examples_test_list.txt b/tests/integration/test_lists/qa/examples_test_list.txt
index 4e6504d4cc1..e974a19d5eb 100644
--- a/tests/integration/test_lists/qa/examples_test_list.txt
+++ b/tests/integration/test_lists/qa/examples_test_list.txt
@@ -402,6 +402,7 @@ accuracy/test_cli_flow.py::TestQwen1_5MoeA2_7BChat::test_weight_only
 accuracy/test_cli_flow.py::TestQwen2_0_5BInstruct::test_auto_dtype
 accuracy/test_cli_flow.py::TestQwen2_0_5BInstruct::test_weight_only
 accuracy/test_cli_flow.py::TestQwen2_0_5BInstruct::test_fp8
+accuracy/test_cli_flow.py::TestQwen2_1_5B::test_auto_dtype_cp4
 accuracy/test_llm_api.py::TestQwen2_7BInstruct::test_auto_dtype
 accuracy/test_llm_api.py::TestQwen2_7BInstruct::test_weight_only
 accuracy/test_cli_flow.py::TestQwen2_7BInstruct::test_int4_awq_prequantized
@@ -414,6 +415,7 @@ accuracy/test_llm_api.py::TestQwen2_7BInstruct::test_fp8
 accuracy/test_llm_api.py::TestQwen2_5_0_5BInstruct::test_fp8
 accuracy/test_llm_api.py::TestQwen2_5_1_5BInstruct::test_fp8
 accuracy/test_llm_api.py::TestQwen2_5_7BInstruct::test_fp8
+accuracy/test_llm_api.py::TestQwen2_5_7BInstruct::test_fp8_kvcache
 accuracy/test_llm_api.py::TestMistral7B_0_3::test_quant_tp4[int4]
 accuracy/test_llm_api.py::TestMistral7B_0_3::test_quant_tp4[int4_awq]
 accuracy/test_llm_api.py::TestMistral7B_0_3::test_quant_tp4[int8_awq]
@@ -461,6 +463,7 @@ llmapi/test_llm_e2e.py::test_llmapi_load_engine_from_build_command[llama-llama-m
 test_e2e.py::test_mistral_e2e[use_cpp_session-remove_input_padding--]
 test_e2e.py::test_mistral_e2e[use_py_session-remove_input_padding--]
 test_e2e.py::test_mistral_e2e[use_py_session---]
+test_e2e.py::test_qwen_e2e_cpprunner_large_new_tokens[DeepSeek-R1-Distill-Qwen-1.5B-DeepSeek-R1-Distill-Qwen-1.5B]
 test_e2e.py::test_openai_multi_chat_example
 test_e2e.py::test_openai_consistent_chat
 llmapi/test_llm_examples.py::test_llmapi_server_example