2 changes: 1 addition & 1 deletion tests/integration/defs/accuracy/references/gsm8k.yaml
@@ -1,6 +1,6 @@
 meta-llama/Llama-3.1-8B-Instruct:
 - accuracy: 74.20
-- spec_dec_algo: NGRAM
+- spec_dec_algo: NGram
   accuracy: 74.20
 - quant_algo: FP8
   accuracy: 74.30
4 changes: 2 additions & 2 deletions tests/integration/defs/accuracy/references/mmlu.yaml
@@ -20,9 +20,9 @@ meta-llama/Llama-3.1-8B:
   accuracy: 64.99
 meta-llama/Llama-3.1-8B-Instruct:
 - accuracy: 68.17
-- spec_dec_algo: EAGLE3
+- spec_dec_algo: Eagle
   accuracy: 68.20
-- spec_dec_algo: NGRAM
+- spec_dec_algo: NGram
   accuracy: 68.17
 - quant_algo: FP8
   accuracy: 67.93
3 changes: 2 additions & 1 deletion tests/integration/defs/accuracy/test_llm_api.py
@@ -137,7 +137,8 @@ def test_fp8_pp2(self):
         with LLM(self.MODEL_PATH,
                  pipeline_parallel_size=2,
                  quant_config=quant_config,
-                 kv_cache_config=kv_cache_config) as llm:
+                 kv_cache_config=kv_cache_config,
+                 max_batch_size=64) as llm:
             task = CnnDailymail(self.MODEL_NAME)
             task.evaluate(llm)
 
18 changes: 12 additions & 6 deletions tests/integration/defs/accuracy/test_llm_api_pytorch.py
@@ -204,6 +204,7 @@ def test_fp8_llm_sampler(self):
                           sampling_params=sampling_params,
                           extra_acc_spec="temperature=0.8,top_p=0.95")
 
+    @skip_pre_hopper
     def test_fp8_beam_search(self):
         model_path = f"{llm_models_root()}/llama-3.1-model/Llama-3.1-8B-Instruct-FP8"
         pytorch_config = dict(disable_overlap_scheduler=True)
@@ -228,6 +229,7 @@ def test_fp8_beam_search(self):
                           sampling_params=sampling_params,
                           extra_acc_spec="beam_width=4")
 
+    @skip_pre_hopper
     def test_eagle3(self):
         pytorch_config = dict(
             disable_overlap_scheduler=True,
@@ -250,15 +252,18 @@ def test_eagle3(self):
         task = MMLU(self.MODEL_NAME)
         task.evaluate(llm)
 
+    @skip_pre_hopper
     def test_ngram(self):
-        pytorch_config = dict(disable_overlap_scheduler=True)
+        pytorch_config = dict(
+            disable_overlap_scheduler=True,
+            cuda_graph_config=CudaGraphConfig(batch_sizes=[1]),
+        )
 
         kv_cache_config = KvCacheConfig(enable_block_reuse=False)
 
-        draft_len = 4
         spec_config = NGramDecodingConfig(
-            max_draft_len=draft_len,
-            max_matching_ngram_size=draft_len,
+            max_draft_len=4,
+            max_matching_ngram_size=2,
             is_keep_all=True,
             is_use_oldest=True,
             is_public_pool=True,
@@ -267,7 +272,8 @@ def test_ngram(self):
         with LLM(model=self.MODEL_PATH,
                  **pytorch_config,
                  kv_cache_config=kv_cache_config,
-                 speculative_config=spec_config) as llm:
+                 speculative_config=spec_config,
+                 max_batch_size=16) as llm:
             task = MMLU(self.MODEL_NAME)
             task.evaluate(llm)
             task = GSM8K(self.MODEL_NAME)
@@ -307,7 +313,7 @@ def test_auto_dtype(self):
         task = CnnDailymail(self.MODEL_NAME)
         task.evaluate(llm)
 
-    @skip_pre_hopper
+    @skip_pre_ada
     def test_fp8_prequantized(self):
         model_path = f"{llm_models_root()}/llama-3.2-models/Llama-3.2-1B-FP8"
         with LLM(model_path) as llm:
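For reference, here is a minimal standalone sketch of the NGram speculative-decoding setup that the updated test_ngram exercises, assuming the tensorrt_llm.llmapi import paths for the config classes; the model name and the generate() call at the end are illustrative, not part of the diff.

# Minimal sketch of the updated test_ngram configuration (assumed imports).
from tensorrt_llm import LLM
from tensorrt_llm.llmapi import (CudaGraphConfig, KvCacheConfig,
                                 NGramDecodingConfig)

# CUDA graphs are now captured for batch size 1 only.
pytorch_config = dict(
    disable_overlap_scheduler=True,
    cuda_graph_config=CudaGraphConfig(batch_sizes=[1]),
)

# KV block reuse stays disabled, matching the test.
kv_cache_config = KvCacheConfig(enable_block_reuse=False)

# Draft length and n-gram match size are decoupled now (4 vs. 2),
# rather than both deriving from a single draft_len variable.
spec_config = NGramDecodingConfig(
    max_draft_len=4,
    max_matching_ngram_size=2,
    is_keep_all=True,
    is_use_oldest=True,
    is_public_pool=True,
)

with LLM(model="meta-llama/Llama-3.1-8B-Instruct",  # illustrative model
         **pytorch_config,
         kv_cache_config=kv_cache_config,
         speculative_config=spec_config,
         max_batch_size=16) as llm:
    for output in llm.generate(["The capital of France is"]):
        print(output.outputs[0].text)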