5 changes: 5 additions & 0 deletions tests/integration/defs/accuracy/references/cnn_dailymail.yaml
@@ -148,6 +148,11 @@ meta-llama/Llama-3.2-1B:
accuracy: 27.259
- extra_acc_spec: max_attention_window_size=960;beam_width=4
accuracy: 0
meta-llama/Llama-3.2-3B:
- accuracy: 25.495
- quant_algo: FP8
kv_cache_quant_algo: FP8
accuracy: 33.629
meta-llama/Llama-3.3-70B-Instruct:
- quant_algo: NVFP4
kv_cache_quant_algo: FP8
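Each reference file above maps a model name to a list of accuracy records; optional quant_algo, kv_cache_quant_algo, and extra_acc_spec fields scope a record to a particular configuration. As an illustrative sketch only (not the harness's actual lookup code), a record such as the new Llama-3.2-3B FP8 entry could be selected with plain PyYAML:

import yaml

# Hypothetical helper, not part of the test harness: pick the reference
# accuracy matching a quantization setup from a file shaped like
# cnn_dailymail.yaml above.
def lookup_reference(path, model, quant_algo=None, kv_cache_quant_algo=None):
    with open(path) as f:
        refs = yaml.safe_load(f)
    for record in refs.get(model, []):
        if (record.get("quant_algo") == quant_algo
                and record.get("kv_cache_quant_algo") == kv_cache_quant_algo):
            return record["accuracy"]
    return None

# e.g. lookup_reference("cnn_dailymail.yaml", "meta-llama/Llama-3.2-3B",
#                       quant_algo="FP8", kv_cache_quant_algo="FP8") -> 33.629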
5 changes: 5 additions & 0 deletions tests/integration/defs/accuracy/references/gsm8k.yaml
@@ -77,6 +77,9 @@ nvidia/Llama-3_3-Nemotron-Super-49B-v1:
accuracy: 92.42
nvidia/Nemotron-H-8B-Base-8K:
- accuracy: 46.20
- quant_algo: FP8
kv_cache_quant_algo: FP8
accuracy: 85.78
nvidia/Llama-3.1-Nemotron-Nano-8B-v1:
- accuracy: 37.15
- quant_algo: FP8
@@ -87,3 +90,5 @@ nvidia/Llama-3_1-Nemotron-Ultra-253B-v1:
- quant_algo: FP8
kv_cache_quant_algo: FP8
accuracy: 94.16
kanana-1.5-2.1b-instruct-2505:
- accuracy: 75.81
10 changes: 10 additions & 0 deletions tests/integration/defs/accuracy/references/mmlu.yaml
@@ -52,6 +52,11 @@ meta-llama/Llama-3.2-1B:
accuracy: 33.87
- extra_acc_spec: max_attention_window_size=960
accuracy: 32.82
meta-llama/Llama-3.2-3B:
- accuracy: 57.92
- quant_algo: FP8
kv_cache_quant_algo: FP8
accuracy: 60.60
meta-llama/Llama-3.3-70B-Instruct:
- accuracy: 81.31
- quant_algo: NVFP4
@@ -162,10 +167,15 @@ nvidia/Llama-3.1-Nemotron-Nano-8B-v1:
accuracy: 57.12
nvidia/Nemotron-H-8B-Base-8K:
- accuracy: 69.590
- quant_algo: FP8
kv_cache_quant_algo: FP8
accuracy: 69.180
microsoft/Phi-4-mini-instruct:
- accuracy: 68.98
nvidia/Llama-3_1-Nemotron-Ultra-253B-v1:
- accuracy: 83.70
- quant_algo: FP8
kv_cache_quant_algo: FP8
accuracy: 83.36
kanana-1.5-2.1b-instruct-2505:
- accuracy: 56.89
75 changes: 74 additions & 1 deletion tests/integration/defs/accuracy/test_llm_api.py
@@ -14,7 +14,7 @@
# limitations under the License.
import pytest

from tensorrt_llm.llmapi import LLM, EagleDecodingConfig
from tensorrt_llm.llmapi import LLM, EagleDecodingConfig, KvCacheConfig
from tensorrt_llm.models.modeling_utils import QuantConfig
from tensorrt_llm.quantization import QuantAlgo

@@ -74,6 +74,79 @@ def test_guided_decoding_4gpus(self):
task.evaluate(llm)


class TestLlama3_2_1B(LlmapiAccuracyTestHarness):
MODEL_NAME = "meta-llama/Llama-3.2-1B"
MODEL_PATH = f"{llm_models_root()}/llama-3.2-models/Llama-3.2-1B"
EXAMPLE_FOLDER = "models/core/llama"

def test_auto_dtype(self):
with LLM(self.MODEL_PATH) as llm:
task = CnnDailymail(self.MODEL_NAME)
task.evaluate(llm)

@skip_post_blackwell
def test_smooth_quant(self):
quant_config = QuantConfig(
QuantAlgo.W8A8_SQ_PER_CHANNEL_PER_TOKEN_PLUGIN)
with LLM(self.MODEL_PATH, quant_config=quant_config) as llm:
task = CnnDailymail(self.MODEL_NAME)
task.evaluate(llm)

@skip_post_blackwell
def test_smooth_quant_ootb(self):
quant_config = QuantConfig(QuantAlgo.W8A8_SQ_PER_CHANNEL)
with LLM(self.MODEL_PATH, quant_config=quant_config) as llm:
task = CnnDailymail(self.MODEL_NAME)
task.evaluate(llm)

@skip_post_blackwell
def test_int4_awq(self):
quant_config = QuantConfig(QuantAlgo.W4A16_AWQ)
with LLM(self.MODEL_PATH, quant_config=quant_config) as llm:
task = CnnDailymail(self.MODEL_NAME)
task.evaluate(llm)

@skip_post_blackwell
def test_int4_awq_int8_kv_cache(self):
quant_config = QuantConfig(QuantAlgo.W4A16_AWQ)
kv_cache_config = KvCacheConfig(quant_algo=QuantAlgo.INT8)
with LLM(self.MODEL_PATH,
quant_config=quant_config,
kv_cache_config=kv_cache_config) as llm:
task = CnnDailymail(self.MODEL_NAME)
task.evaluate(llm)

@skip_pre_ada
def test_fp8(self):
quant_config = QuantConfig(QuantAlgo.FP8)
kv_cache_config = KvCacheConfig(quant_algo=QuantAlgo.FP8)
with LLM(self.MODEL_PATH,
quant_config=quant_config,
kv_cache_config=kv_cache_config) as llm:
task = CnnDailymail(self.MODEL_NAME)
task.evaluate(llm)

@skip_pre_ada
@pytest.mark.skip_less_device(2)
def test_fp8_pp2(self):
quant_config = QuantConfig(QuantAlgo.FP8)
kv_cache_config = KvCacheConfig(quant_algo=QuantAlgo.FP8)
with LLM(self.MODEL_PATH,
pipeline_parallel_size=2,
quant_config=quant_config,
kv_cache_config=kv_cache_config) as llm:
task = CnnDailymail(self.MODEL_NAME)
task.evaluate(llm)

@skip_pre_ada
@skip_post_blackwell
def test_fp8_rowwise(self):
quant_config = QuantConfig(QuantAlgo.FP8_PER_CHANNEL_PER_TOKEN)
with LLM(self.MODEL_PATH, quant_config=quant_config) as llm:
task = CnnDailymail(self.MODEL_NAME)
task.evaluate(llm)

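Outside the harness, the FP8 weight plus FP8 KV-cache combination exercised by test_fp8 above can be driven directly through the LLM API. A minimal sketch, assuming a local Llama-3.2-1B checkpoint; the path and prompt are placeholders:

from tensorrt_llm.llmapi import LLM, KvCacheConfig
from tensorrt_llm.models.modeling_utils import QuantConfig
from tensorrt_llm.quantization import QuantAlgo

# Quantize weights and the KV cache to FP8, then run a single generation.
quant_config = QuantConfig(QuantAlgo.FP8)
kv_cache_config = KvCacheConfig(quant_algo=QuantAlgo.FP8)
with LLM("/path/to/Llama-3.2-1B",  # placeholder checkpoint path
         quant_config=quant_config,
         kv_cache_config=kv_cache_config) as llm:
    output = llm.generate(["The capital of France is"])[0]
    print(output.outputs[0].text)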

class TestMistral7B_0_3(LlmapiAccuracyTestHarness):
MODEL_NAME = "mistralai/Mistral-7B-Instruct-v0.3"
MODEL_PATH = f"{llm_models_root()}/Mistral-7B-Instruct-v0.3"
99 changes: 50 additions & 49 deletions tests/integration/defs/accuracy/test_llm_api_pytorch.py
@@ -300,66 +300,36 @@ def test_auto_dtype(self):
task = CnnDailymail(self.MODEL_NAME)
task.evaluate(llm)

@skip_post_blackwell
def test_smooth_quant(self):
quant_config = QuantConfig(
QuantAlgo.W8A8_SQ_PER_CHANNEL_PER_TOKEN_PLUGIN)
with LLM(self.MODEL_PATH, quant_config=quant_config) as llm:
def test_fp8_prequantized(self):
model_path = f"{llm_models_root()}/llama-3.2-models/Llama-3.2-1B-FP8"
with LLM(model_path) as llm:
assert llm.args.quant_config.quant_algo == QuantAlgo.FP8
assert llm.args.quant_config.kv_cache_quant_algo == QuantAlgo.FP8
task = CnnDailymail(self.MODEL_NAME)
task.evaluate(llm)

@skip_post_blackwell
def test_smooth_quant_ootb(self):
quant_config = QuantConfig(QuantAlgo.W8A8_SQ_PER_CHANNEL)
with LLM(self.MODEL_PATH, quant_config=quant_config) as llm:
task = CnnDailymail(self.MODEL_NAME)
task.evaluate(llm)

@skip_post_blackwell
def test_int4_awq(self):
quant_config = QuantConfig(QuantAlgo.W4A16_AWQ)
with LLM(self.MODEL_PATH, quant_config=quant_config) as llm:
task = CnnDailymail(self.MODEL_NAME)
task.evaluate(llm)
class TestLlama3_2_3B(LlmapiAccuracyTestHarness):
MODEL_NAME = "meta-llama/Llama-3.2-3B"
MODEL_PATH = f"{llm_models_root()}/llama-3.2-models/Llama-3.2-3B"
EXAMPLE_FOLDER = "models/core/llama"

@skip_post_blackwell
def test_int4_awq_int8_kv_cache(self):
quant_config = QuantConfig(QuantAlgo.W4A16_AWQ)
kv_cache_config = KvCacheConfig(quant_algo=QuantAlgo.INT8)
with LLM(self.MODEL_PATH,
quant_config=quant_config,
kv_cache_config=kv_cache_config) as llm:
def test_auto_dtype(self):
with LLM(self.MODEL_PATH) as llm:
task = CnnDailymail(self.MODEL_NAME)
task.evaluate(llm)

@skip_pre_ada
def test_fp8(self):
quant_config = QuantConfig(QuantAlgo.FP8)
kv_cache_config = KvCacheConfig(quant_algo=QuantAlgo.FP8)
with LLM(self.MODEL_PATH,
quant_config=quant_config,
kv_cache_config=kv_cache_config) as llm:
task = CnnDailymail(self.MODEL_NAME)
task = MMLU(self.MODEL_NAME)
task.evaluate(llm)

@skip_pre_ada
@pytest.mark.skip_less_device(2)
def test_fp8_pp2(self):
quant_config = QuantConfig(QuantAlgo.FP8)
kv_cache_config = KvCacheConfig(quant_algo=QuantAlgo.FP8)
with LLM(self.MODEL_PATH,
pipeline_parallel_size=2,
quant_config=quant_config,
kv_cache_config=kv_cache_config) as llm:
@skip_pre_hopper
def test_fp8_prequantized(self):
model_path = f"{llm_models_root()}/llama-3.2-models/Llama-3.2-3B-Instruct-FP8"
with LLM(model_path) as llm:
assert llm.args.quant_config.quant_algo == QuantAlgo.FP8
assert llm.args.quant_config.kv_cache_quant_algo == QuantAlgo.FP8
task = CnnDailymail(self.MODEL_NAME)
task.evaluate(llm)

@skip_pre_ada
@skip_post_blackwell
def test_fp8_rowwise(self):
quant_config = QuantConfig(QuantAlgo.FP8_PER_CHANNEL_PER_TOKEN)
with LLM(self.MODEL_PATH, quant_config=quant_config) as llm:
task = CnnDailymail(self.MODEL_NAME)
task = MMLU(self.MODEL_NAME)
task.evaluate(llm)


@@ -1250,6 +1220,19 @@ def test_auto_dtype(self):
task = GSM8K(self.MODEL_NAME)
task.evaluate(llm)

@skip_pre_ada
def test_reasoning_fp8_prequantized(self):
kv_cache_config = KvCacheConfig(enable_block_reuse=False)
with LLM(f"{llm_models_root()}/Nemotron-H-8B-Reasoning-128K-FP8",
kv_cache_config=kv_cache_config,
max_batch_size=256) as llm:
assert llm.args.quant_config.quant_algo == QuantAlgo.FP8
assert llm.args.quant_config.kv_cache_quant_algo == QuantAlgo.FP8
task = MMLU(self.MODEL_NAME)
task.evaluate(llm)
task = GSM8K(self.MODEL_NAME)
task.evaluate(llm)


class TestQwen2_7BInstruct(LlmapiAccuracyTestHarness):
MODEL_NAME = "Qwen/Qwen2-7B-Instruct"
@@ -1476,3 +1459,21 @@ def test_auto_dtype(self):
task = GPQADiamond(self.MODEL_NAME)
task.evaluate(llm,
extra_evaluator_kwargs=dict(apply_chat_template=True))


class TestKanana_Instruct(LlmapiAccuracyTestHarness):
MODEL_NAME = "kanana-1.5-2.1b-instruct-2505"
MODEL_PATH = f"{llm_models_root()}/kanana-1.5-2.1b-instruct-2505"

@pytest.mark.skip_not_contain(["H20", "H100"])
def test_auto_dtype(self):
"RCCA: https://nvbugspro.nvidia.com/bug/5310520"
pytorch_config = dict(use_cuda_graph=True,
cuda_graph_padding_enabled=True,
cuda_graph_max_batch_size=384)
with LLM(self.MODEL_PATH, **pytorch_config,
enable_attention_dp=True) as llm:
task = MMLU(self.MODEL_NAME)
task.evaluate(llm)
task = GSM8K(self.MODEL_NAME)
task.evaluate(llm)
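
The CUDA-graph options above are forwarded to the LLM constructor as plain keyword arguments in the PyTorch backend. A minimal standalone sketch, reusing the keyword names from the test; the checkpoint path is a placeholder:

from tensorrt_llm.llmapi import LLM

# Enable CUDA graphs with padding and attention data parallelism, as in
# TestKanana_Instruct::test_auto_dtype above.
pytorch_config = dict(use_cuda_graph=True,
                      cuda_graph_padding_enabled=True,
                      cuda_graph_max_batch_size=384)
with LLM("/path/to/kanana-1.5-2.1b-instruct-2505",  # placeholder path
         enable_attention_dp=True,
         **pytorch_config) as llm:
    print(llm.generate(["Hello"])[0].outputs[0].text)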
22 changes: 15 additions & 7 deletions tests/integration/test_lists/qa/examples_test_list.txt
@@ -376,13 +376,16 @@ accuracy/test_cli_flow.py::TestLlama3_2_1B::test_weight_streaming[1.0]
accuracy/test_cli_flow.py::TestLlama3_2_1B::test_cyclic_kv_cache
accuracy/test_cli_flow.py::TestLlama3_2_1B::test_cyclic_kv_cache_beam_search
accuracy/test_llm_api_pytorch.py::TestLlama3_2_1B::test_auto_dtype
accuracy/test_llm_api_pytorch.py::TestLlama3_2_1B::test_smooth_quant
accuracy/test_llm_api_pytorch.py::TestLlama3_2_1B::test_smooth_quant_ootb
accuracy/test_llm_api_pytorch.py::TestLlama3_2_1B::test_int4_awq
accuracy/test_llm_api_pytorch.py::TestLlama3_2_1B::test_int4_awq_int8_kv_cache
accuracy/test_llm_api_pytorch.py::TestLlama3_2_1B::test_fp8
accuracy/test_llm_api_pytorch.py::TestLlama3_2_1B::test_fp8_pp2
accuracy/test_llm_api_pytorch.py::TestLlama3_2_1B::test_fp8_rowwise
accuracy/test_llm_api_pytorch.py::TestLlama3_2_1B::test_fp8_prequantized
accuracy/test_llm_api.py::TestLlama3_2_1B::test_auto_dtype
accuracy/test_llm_api.py::TestLlama3_2_1B::test_smooth_quant
accuracy/test_llm_api.py::TestLlama3_2_1B::test_smooth_quant_ootb
accuracy/test_llm_api.py::TestLlama3_2_1B::test_int4_awq
accuracy/test_llm_api.py::TestLlama3_2_1B::test_int4_awq_int8_kv_cache
accuracy/test_llm_api.py::TestLlama3_2_1B::test_fp8_pp2
accuracy/test_llm_api.py::TestLlama3_2_1B::test_fp8_rowwise
accuracy/test_llm_api_pytorch.py::TestLlama3_2_3B::test_auto_dtype
accuracy/test_llm_api_pytorch.py::TestLlama3_2_3B::test_fp8_prequantized
accuracy/test_cli_flow.py::TestMistral7B::test_beam_search
accuracy/test_cli_flow.py::TestMistral7B::test_fp8_tp4pp2
accuracy/test_cli_flow.py::TestMistral7B::test_smooth_quant_tp4pp1
@@ -462,6 +465,10 @@ accuracy/test_llm_api_pytorch.py::TestNemotronNas::test_auto_dtype_tp8
accuracy/test_llm_api_pytorch.py::TestLlama3_3NemotronSuper49Bv1::test_fp8_prequantized_tp2
accuracy/test_llm_api_pytorch.py::TestLlama3_1NemotronNano8Bv1::test_fp8_prequantized
accuracy/test_llm_api_pytorch.py::TestNemotronUltra::test_fp8_prequantized[tp8ep4-cuda_graph=True]
accuracy/test_cli_flow.py::TestNemotronUltra::test_auto_dtype[tp8-cuda_graph=True] TIMEOUT (240)
accuracy/test_cli_flow.py::TestNemotronUltra::test_fp8_prequantized[tp8-cuda_graph=True]
accuracy/test_llm_api_pytorch.py::TestNemotronH::test_auto_dtype
accuracy/test_llm_api_pytorch.py::TestNemotronH::test_reasoning_fp8_prequantized
accuracy/test_llm_api_pytorch.py::TestQwen2_7BInstruct::test_auto_dtype
accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_8gpus[latency]
accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_8gpus[latency_trtllmgen]
@@ -479,6 +486,7 @@ accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_fp8[throughput_latency]
accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_fp8[latency]
accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_nvfp4[throughput_latency]
accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_nvfp4[latency]
accuracy/test_llm_api_pytorch.py::TestKanana_Instruct::test_auto_dtype

test_e2e.py::test_llama_e2e[use_cpp_session-remove_input_padding-]
test_e2e.py::test_llama_e2e[use_py_session-remove_input_padding-]
2 changes: 2 additions & 0 deletions tests/integration/test_lists/qa/llm_sanity_test.txt
@@ -134,6 +134,8 @@ accuracy/test_llm_api_pytorch.py::TestNemotronUltra::test_auto_dtype[tp8-cuda_graph=True]
accuracy/test_llm_api_pytorch.py::TestNemotronUltra::test_fp8_prequantized[tp8-cuda_graph=True]
accuracy/test_cli_flow.py::TestNemotronUltra::test_auto_dtype[tp8-cuda_graph=True]
accuracy/test_cli_flow.py::TestNemotronUltra::test_fp8_prequantized[tp8-cuda_graph=True]
accuracy/test_llm_api_pytorch.py::TestNemotronH::test_auto_dtype
accuracy/test_llm_api_pytorch.py::TestNemotronH::test_reasoning_fp8_prequantized
accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=0-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False]
accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales[mtp_nextn=0-fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False]
accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False]
1 change: 0 additions & 1 deletion tests/integration/test_lists/waives.txt
@@ -407,7 +407,6 @@ accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_auto_dtype[mtp_
accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_auto_dtype[mtp_nextn=2-overlap_scheduler=False] SKIP (https://nvbugs/5322354)
accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_auto_dtype[mtp_nextn=2-overlap_scheduler=True] SKIP (https://nvbugs/5322354)
test_e2e.py::test_ptp_quickstart_advanced[Nemotron-H-8B-Nemotron-H-8B-Base-8K] SKIP (https://nvbugs/5325284)
accuracy/test_llm_api_pytorch.py::TestLlama3_2_1B::test_fp8_pp2 SKIP (https://nvbugspro.nvidia.com/bug/5312750)
test_e2e.py::test_ptp_quickstart_advanced[Llama3.1-70B-NVFP4-nvfp4-quantized/Meta-Llama-3.1-70B] SKIP (https://nvbugs/5323316)
disaggregated/test_disaggregated.py::test_disaggregated_single_gpu_with_mpirun[TinyLlama-1.1B-Chat-v1.0] SKIP (https://nvbugs/5328160)
test_e2e.py::test_trtllm_bench_llmapi_launch[trt_backend-llama-v3-llama3-8b] SKIP (https://nvbugs/5320234)