5 changes: 5 additions & 0 deletions tests/integration/defs/accuracy/references/cnn_dailymail.yaml
@@ -148,6 +148,11 @@ meta-llama/Llama-3.2-1B:
accuracy: 27.259
- extra_acc_spec: max_attention_window_size=960;beam_width=4
accuracy: 0
meta-llama/Llama-3.2-3B:
- accuracy: 25.495
- quant_algo: FP8
kv_cache_quant_algo: FP8
accuracy: 33.629
meta-llama/Llama-3.3-70B-Instruct:
- quant_algo: NVFP4
kv_cache_quant_algo: FP8
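Each reference file above maps a model name to a list of accuracy records; optional quant_algo, kv_cache_quant_algo, and extra_acc_spec fields scope a record to a particular configuration. As an illustrative sketch only (not the harness's actual lookup code), a record such as the new Llama-3.2-3B FP8 entry could be selected with plain PyYAML:

import yaml

# Hypothetical helper, not part of the test harness: pick the reference
# accuracy matching a quantization setup from a file shaped like
# cnn_dailymail.yaml above.
def lookup_reference(path, model, quant_algo=None, kv_cache_quant_algo=None):
    with open(path) as f:
        refs = yaml.safe_load(f)
    for record in refs.get(model, []):
        if (record.get("quant_algo") == quant_algo
                and record.get("kv_cache_quant_algo") == kv_cache_quant_algo):
            return record["accuracy"]
    return None

# e.g. lookup_reference("cnn_dailymail.yaml", "meta-llama/Llama-3.2-3B",
#                       quant_algo="FP8", kv_cache_quant_algo="FP8") -> 33.629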
5 changes: 5 additions & 0 deletions tests/integration/defs/accuracy/references/gsm8k.yaml
@@ -77,6 +77,9 @@ nvidia/Llama-3_3-Nemotron-Super-49B-v1:
accuracy: 92.42
nvidia/Nemotron-H-8B-Base-8K:
- accuracy: 46.20
- quant_algo: FP8
kv_cache_quant_algo: FP8
accuracy: 85.78
nvidia/Llama-3.1-Nemotron-Nano-8B-v1:
- accuracy: 37.15
- quant_algo: FP8
@@ -87,3 +90,5 @@ nvidia/Llama-3_1-Nemotron-Ultra-253B-v1:
- quant_algo: FP8
kv_cache_quant_algo: FP8
accuracy: 94.16
kanana-1.5-2.1b-instruct-2505:
- accuracy: 75.81
10 changes: 10 additions & 0 deletions tests/integration/defs/accuracy/references/mmlu.yaml
@@ -52,6 +52,11 @@ meta-llama/Llama-3.2-1B:
accuracy: 33.87
- extra_acc_spec: max_attention_window_size=960
accuracy: 32.82
meta-llama/Llama-3.2-3B:
- accuracy: 57.92
- quant_algo: FP8
kv_cache_quant_algo: FP8
accuracy: 60.60
meta-llama/Llama-3.3-70B-Instruct:
- accuracy: 81.31
- quant_algo: NVFP4
@@ -162,10 +167,15 @@ nvidia/Llama-3.1-Nemotron-Nano-8B-v1:
accuracy: 57.12
nvidia/Nemotron-H-8B-Base-8K:
- accuracy: 69.590
- quant_algo: FP8
kv_cache_quant_algo: FP8
accuracy: 69.180
microsoft/Phi-4-mini-instruct:
- accuracy: 68.98
nvidia/Llama-3_1-Nemotron-Ultra-253B-v1:
- accuracy: 83.70
- quant_algo: FP8
kv_cache_quant_algo: FP8
accuracy: 83.36
kanana-1.5-2.1b-instruct-2505:
- accuracy: 56.89
75 changes: 74 additions & 1 deletion tests/integration/defs/accuracy/test_llm_api.py
@@ -14,7 +14,7 @@
# limitations under the License.
import pytest

from tensorrt_llm.llmapi import LLM, EagleDecodingConfig
from tensorrt_llm.llmapi import LLM, EagleDecodingConfig, KvCacheConfig
from tensorrt_llm.models.modeling_utils import QuantConfig
from tensorrt_llm.quantization import QuantAlgo

@@ -74,6 +74,79 @@ def test_guided_decoding_4gpus(self):
task.evaluate(llm)


class TestLlama3_2_1B(LlmapiAccuracyTestHarness):
MODEL_NAME = "meta-llama/Llama-3.2-1B"
MODEL_PATH = f"{llm_models_root()}/llama-3.2-models/Llama-3.2-1B"
EXAMPLE_FOLDER = "models/core/llama"

def test_auto_dtype(self):
with LLM(self.MODEL_PATH) as llm:
task = CnnDailymail(self.MODEL_NAME)
task.evaluate(llm)

@skip_post_blackwell
def test_smooth_quant(self):
quant_config = QuantConfig(
QuantAlgo.W8A8_SQ_PER_CHANNEL_PER_TOKEN_PLUGIN)
with LLM(self.MODEL_PATH, quant_config=quant_config) as llm:
task = CnnDailymail(self.MODEL_NAME)
task.evaluate(llm)

@skip_post_blackwell
def test_smooth_quant_ootb(self):
quant_config = QuantConfig(QuantAlgo.W8A8_SQ_PER_CHANNEL)
with LLM(self.MODEL_PATH, quant_config=quant_config) as llm:
task = CnnDailymail(self.MODEL_NAME)
task.evaluate(llm)

@skip_post_blackwell
def test_int4_awq(self):
quant_config = QuantConfig(QuantAlgo.W4A16_AWQ)
with LLM(self.MODEL_PATH, quant_config=quant_config) as llm:
task = CnnDailymail(self.MODEL_NAME)
task.evaluate(llm)

@skip_post_blackwell
def test_int4_awq_int8_kv_cache(self):
quant_config = QuantConfig(QuantAlgo.W4A16_AWQ)
kv_cache_config = KvCacheConfig(quant_algo=QuantAlgo.INT8)
with LLM(self.MODEL_PATH,
quant_config=quant_config,
kv_cache_config=kv_cache_config) as llm:
task = CnnDailymail(self.MODEL_NAME)
task.evaluate(llm)

@skip_pre_ada
def test_fp8(self):
quant_config = QuantConfig(QuantAlgo.FP8)
kv_cache_config = KvCacheConfig(quant_algo=QuantAlgo.FP8)
with LLM(self.MODEL_PATH,
quant_config=quant_config,
kv_cache_config=kv_cache_config) as llm:
task = CnnDailymail(self.MODEL_NAME)
task.evaluate(llm)

@skip_pre_ada
@pytest.mark.skip_less_device(2)
def test_fp8_pp2(self):
quant_config = QuantConfig(QuantAlgo.FP8)
kv_cache_config = KvCacheConfig(quant_algo=QuantAlgo.FP8)
with LLM(self.MODEL_PATH,
pipeline_parallel_size=2,
quant_config=quant_config,
kv_cache_config=kv_cache_config) as llm:
task = CnnDailymail(self.MODEL_NAME)
task.evaluate(llm)

@skip_pre_ada
@skip_post_blackwell
def test_fp8_rowwise(self):
quant_config = QuantConfig(QuantAlgo.FP8_PER_CHANNEL_PER_TOKEN)
with LLM(self.MODEL_PATH, quant_config=quant_config) as llm:
task = CnnDailymail(self.MODEL_NAME)
task.evaluate(llm)

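Outside the harness, the FP8 weight plus FP8 KV-cache combination exercised by test_fp8 above can be driven directly through the LLM API. A minimal sketch, assuming a local Llama-3.2-1B checkpoint; the path and prompt are placeholders:

from tensorrt_llm.llmapi import LLM, KvCacheConfig
from tensorrt_llm.models.modeling_utils import QuantConfig
from tensorrt_llm.quantization import QuantAlgo

# Quantize weights and the KV cache to FP8, then run a single generation.
quant_config = QuantConfig(QuantAlgo.FP8)
kv_cache_config = KvCacheConfig(quant_algo=QuantAlgo.FP8)
with LLM("/path/to/Llama-3.2-1B",  # placeholder checkpoint path
         quant_config=quant_config,
         kv_cache_config=kv_cache_config) as llm:
    output = llm.generate(["The capital of France is"])[0]
    print(output.outputs[0].text)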

class TestMistral7B_0_3(LlmapiAccuracyTestHarness):
MODEL_NAME = "mistralai/Mistral-7B-Instruct-v0.3"
MODEL_PATH = f"{llm_models_root()}/Mistral-7B-Instruct-v0.3"
99 changes: 50 additions & 49 deletions tests/integration/defs/accuracy/test_llm_api_pytorch.py
@@ -300,66 +300,36 @@ def test_auto_dtype(self):
task = CnnDailymail(self.MODEL_NAME)
task.evaluate(llm)

@skip_post_blackwell
def test_smooth_quant(self):
quant_config = QuantConfig(
QuantAlgo.W8A8_SQ_PER_CHANNEL_PER_TOKEN_PLUGIN)
with LLM(self.MODEL_PATH, quant_config=quant_config) as llm:
def test_fp8_prequantized(self):
model_path = f"{llm_models_root()}/llama-3.2-models/Llama-3.2-1B-FP8"
with LLM(model_path) as llm:
assert llm.args.quant_config.quant_algo == QuantAlgo.FP8
assert llm.args.quant_config.kv_cache_quant_algo == QuantAlgo.FP8
task = CnnDailymail(self.MODEL_NAME)
task.evaluate(llm)

@skip_post_blackwell
def test_smooth_quant_ootb(self):
quant_config = QuantConfig(QuantAlgo.W8A8_SQ_PER_CHANNEL)
with LLM(self.MODEL_PATH, quant_config=quant_config) as llm:
task = CnnDailymail(self.MODEL_NAME)
task.evaluate(llm)

@skip_post_blackwell
def test_int4_awq(self):
quant_config = QuantConfig(QuantAlgo.W4A16_AWQ)
with LLM(self.MODEL_PATH, quant_config=quant_config) as llm:
task = CnnDailymail(self.MODEL_NAME)
task.evaluate(llm)
class TestLlama3_2_3B(LlmapiAccuracyTestHarness):
MODEL_NAME = "meta-llama/Llama-3.2-3B"
MODEL_PATH = f"{llm_models_root()}/llama-3.2-models/Llama-3.2-3B"
EXAMPLE_FOLDER = "models/core/llama"

@skip_post_blackwell
def test_int4_awq_int8_kv_cache(self):
quant_config = QuantConfig(QuantAlgo.W4A16_AWQ)
kv_cache_config = KvCacheConfig(quant_algo=QuantAlgo.INT8)
with LLM(self.MODEL_PATH,
quant_config=quant_config,
kv_cache_config=kv_cache_config) as llm:
def test_auto_dtype(self):
with LLM(self.MODEL_PATH) as llm:
task = CnnDailymail(self.MODEL_NAME)
task.evaluate(llm)

@skip_pre_ada
def test_fp8(self):
quant_config = QuantConfig(QuantAlgo.FP8)
kv_cache_config = KvCacheConfig(quant_algo=QuantAlgo.FP8)
with LLM(self.MODEL_PATH,
quant_config=quant_config,
kv_cache_config=kv_cache_config) as llm:
task = CnnDailymail(self.MODEL_NAME)
task = MMLU(self.MODEL_NAME)
task.evaluate(llm)

@skip_pre_ada
@pytest.mark.skip_less_device(2)
def test_fp8_pp2(self):
quant_config = QuantConfig(QuantAlgo.FP8)
kv_cache_config = KvCacheConfig(quant_algo=QuantAlgo.FP8)
with LLM(self.MODEL_PATH,
pipeline_parallel_size=2,
quant_config=quant_config,
kv_cache_config=kv_cache_config) as llm:
@skip_pre_hopper
def test_fp8_prequantized(self):
model_path = f"{llm_models_root()}/llama-3.2-models/Llama-3.2-3B-Instruct-FP8"
with LLM(model_path) as llm:
assert llm.args.quant_config.quant_algo == QuantAlgo.FP8
assert llm.args.quant_config.kv_cache_quant_algo == QuantAlgo.FP8
task = CnnDailymail(self.MODEL_NAME)
task.evaluate(llm)

@skip_pre_ada
@skip_post_blackwell
def test_fp8_rowwise(self):
quant_config = QuantConfig(QuantAlgo.FP8_PER_CHANNEL_PER_TOKEN)
with LLM(self.MODEL_PATH, quant_config=quant_config) as llm:
task = CnnDailymail(self.MODEL_NAME)
task = MMLU(self.MODEL_NAME)
task.evaluate(llm)


@@ -1250,6 +1220,19 @@ def test_auto_dtype(self):
task = GSM8K(self.MODEL_NAME)
task.evaluate(llm)

@skip_pre_ada
def test_reasoning_fp8_prequantized(self):
kv_cache_config = KvCacheConfig(enable_block_reuse=False)
with LLM(f"{llm_models_root()}/Nemotron-H-8B-Reasoning-128K-FP8",
kv_cache_config=kv_cache_config,
max_batch_size=256) as llm:
assert llm.args.quant_config.quant_algo == QuantAlgo.FP8
assert llm.args.quant_config.kv_cache_quant_algo == QuantAlgo.FP8
task = MMLU(self.MODEL_NAME)
task.evaluate(llm)
task = GSM8K(self.MODEL_NAME)
task.evaluate(llm)


class TestQwen2_7BInstruct(LlmapiAccuracyTestHarness):
MODEL_NAME = "Qwen/Qwen2-7B-Instruct"
@@ -1476,3 +1459,21 @@ def test_auto_dtype(self):
task = GPQADiamond(self.MODEL_NAME)
task.evaluate(llm,
extra_evaluator_kwargs=dict(apply_chat_template=True))


class TestKanana_Instruct(LlmapiAccuracyTestHarness):
MODEL_NAME = "kanana-1.5-2.1b-instruct-2505"
MODEL_PATH = f"{llm_models_root()}/kanana-1.5-2.1b-instruct-2505"

@pytest.mark.skip_not_contain(["H20", "H100"])
def test_auto_dtype(self):
"RCCA: https://nvbugspro.nvidia.com/bug/5310520"
pytorch_config = dict(use_cuda_graph=True,
cuda_graph_padding_enabled=True,
cuda_graph_max_batch_size=384)
with LLM(self.MODEL_PATH, **pytorch_config,
enable_attention_dp=True) as llm:
task = MMLU(self.MODEL_NAME)
task.evaluate(llm)
task = GSM8K(self.MODEL_NAME)
task.evaluate(llm)
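
The CUDA-graph options above are forwarded to the LLM constructor as plain keyword arguments in the PyTorch backend. A minimal standalone sketch, reusing the keyword names from the test; the checkpoint path is a placeholder:

from tensorrt_llm.llmapi import LLM

# Enable CUDA graphs with padding and attention data parallelism, as in
# TestKanana_Instruct::test_auto_dtype above.
pytorch_config = dict(use_cuda_graph=True,
                      cuda_graph_padding_enabled=True,
                      cuda_graph_max_batch_size=384)
with LLM("/path/to/kanana-1.5-2.1b-instruct-2505",  # placeholder path
         enable_attention_dp=True,
         **pytorch_config) as llm:
    print(llm.generate(["Hello"])[0].outputs[0].text)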
22 changes: 15 additions & 7 deletions tests/integration/test_lists/qa/examples_test_list.txt
@@ -376,13 +376,16 @@ accuracy/test_cli_flow.py::TestLlama3_2_1B::test_weight_streaming[1.0]
accuracy/test_cli_flow.py::TestLlama3_2_1B::test_cyclic_kv_cache
accuracy/test_cli_flow.py::TestLlama3_2_1B::test_cyclic_kv_cache_beam_search
accuracy/test_llm_api_pytorch.py::TestLlama3_2_1B::test_auto_dtype
accuracy/test_llm_api_pytorch.py::TestLlama3_2_1B::test_smooth_quant
accuracy/test_llm_api_pytorch.py::TestLlama3_2_1B::test_smooth_quant_ootb
accuracy/test_llm_api_pytorch.py::TestLlama3_2_1B::test_int4_awq
accuracy/test_llm_api_pytorch.py::TestLlama3_2_1B::test_int4_awq_int8_kv_cache
accuracy/test_llm_api_pytorch.py::TestLlama3_2_1B::test_fp8
accuracy/test_llm_api_pytorch.py::TestLlama3_2_1B::test_fp8_pp2
accuracy/test_llm_api_pytorch.py::TestLlama3_2_1B::test_fp8_rowwise
accuracy/test_llm_api_pytorch.py::TestLlama3_2_1B::test_fp8_prequantized
accuracy/test_llm_api.py::TestLlama3_2_1B::test_auto_dtype
accuracy/test_llm_api.py::TestLlama3_2_1B::test_smooth_quant
accuracy/test_llm_api.py::TestLlama3_2_1B::test_smooth_quant_ootb
accuracy/test_llm_api.py::TestLlama3_2_1B::test_int4_awq
accuracy/test_llm_api.py::TestLlama3_2_1B::test_int4_awq_int8_kv_cache
accuracy/test_llm_api.py::TestLlama3_2_1B::test_fp8_pp2
accuracy/test_llm_api.py::TestLlama3_2_1B::test_fp8_rowwise
accuracy/test_llm_api_pytorch.py::TestLlama3_2_3B::test_auto_dtype
accuracy/test_llm_api_pytorch.py::TestLlama3_2_3B::test_fp8_prequantized
accuracy/test_cli_flow.py::TestMistral7B::test_beam_search
accuracy/test_cli_flow.py::TestMistral7B::test_fp8_tp4pp2
accuracy/test_cli_flow.py::TestMistral7B::test_smooth_quant_tp4pp1
@@ -462,6 +465,10 @@ accuracy/test_llm_api_pytorch.py::TestNemotronNas::test_auto_dtype_tp8
accuracy/test_llm_api_pytorch.py::TestLlama3_3NemotronSuper49Bv1::test_fp8_prequantized_tp2
accuracy/test_llm_api_pytorch.py::TestLlama3_1NemotronNano8Bv1::test_fp8_prequantized
accuracy/test_llm_api_pytorch.py::TestNemotronUltra::test_fp8_prequantized[tp8ep4-cuda_graph=True]
accuracy/test_cli_flow.py::TestNemotronUltra::test_auto_dtype[tp8-cuda_graph=True] TIMEOUT (240)
accuracy/test_cli_flow.py::TestNemotronUltra::test_fp8_prequantized[tp8-cuda_graph=True]
accuracy/test_llm_api_pytorch.py::TestNemotronH::test_auto_dtype
accuracy/test_llm_api_pytorch.py::TestNemotronH::test_reasoning_fp8_prequantized
accuracy/test_llm_api_pytorch.py::TestQwen2_7BInstruct::test_auto_dtype
accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_8gpus[latency]
accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_8gpus[latency_trtllmgen]
@@ -479,6 +486,7 @@ accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_fp8[throughput_latency]
accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_fp8[latency]
accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_nvfp4[throughput_latency]
accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_nvfp4[latency]
accuracy/test_llm_api_pytorch.py::TestKanana_Instruct::test_auto_dtype

test_e2e.py::test_llama_e2e[use_cpp_session-remove_input_padding-]
test_e2e.py::test_llama_e2e[use_py_session-remove_input_padding-]
2 changes: 2 additions & 0 deletions tests/integration/test_lists/qa/llm_sanity_test.txt
@@ -134,6 +134,8 @@ accuracy/test_llm_api_pytorch.py::TestNemotronUltra::test_auto_dtype[tp8-cuda_graph=True]
accuracy/test_llm_api_pytorch.py::TestNemotronUltra::test_fp8_prequantized[tp8-cuda_graph=True]
accuracy/test_cli_flow.py::TestNemotronUltra::test_auto_dtype[tp8-cuda_graph=True]
accuracy/test_cli_flow.py::TestNemotronUltra::test_fp8_prequantized[tp8-cuda_graph=True]
accuracy/test_llm_api_pytorch.py::TestNemotronH::test_auto_dtype
accuracy/test_llm_api_pytorch.py::TestNemotronH::test_reasoning_fp8_prequantized
accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=0-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False]
accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales[mtp_nextn=0-fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False]
accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False]
1 change: 0 additions & 1 deletion tests/integration/test_lists/waives.txt
@@ -407,7 +407,6 @@ accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_auto_dtype[mtp_
accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_auto_dtype[mtp_nextn=2-overlap_scheduler=False] SKIP (https://nvbugs/5322354)
accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_auto_dtype[mtp_nextn=2-overlap_scheduler=True] SKIP (https://nvbugs/5322354)
test_e2e.py::test_ptp_quickstart_advanced[Nemotron-H-8B-Nemotron-H-8B-Base-8K] SKIP (https://nvbugs/5325284)
accuracy/test_llm_api_pytorch.py::TestLlama3_2_1B::test_fp8_pp2 SKIP (https://nvbugspro.nvidia.com/bug/5312750)
test_e2e.py::test_ptp_quickstart_advanced[Llama3.1-70B-NVFP4-nvfp4-quantized/Meta-Llama-3.1-70B] SKIP (https://nvbugs/5323316)
disaggregated/test_disaggregated.py::test_disaggregated_single_gpu_with_mpirun[TinyLlama-1.1B-Chat-v1.0] SKIP (https://nvbugs/5328160)
test_e2e.py::test_trtllm_bench_llmapi_launch[trt_backend-llama-v3-llama3-8b] SKIP (https://nvbugs/5320234)