diff --git a/tests/integration/defs/accuracy/references/gsm8k.yaml b/tests/integration/defs/accuracy/references/gsm8k.yaml
index 850f27389b8..d9a64947b14 100644
--- a/tests/integration/defs/accuracy/references/gsm8k.yaml
+++ b/tests/integration/defs/accuracy/references/gsm8k.yaml
@@ -1,6 +1,6 @@
 meta-llama/Llama-3.1-8B-Instruct:
   - accuracy: 74.20
-  - spec_dec_algo: NGRAM
+  - spec_dec_algo: NGram
     accuracy: 74.20
   - quant_algo: FP8
     accuracy: 74.30
diff --git a/tests/integration/defs/accuracy/references/mmlu.yaml b/tests/integration/defs/accuracy/references/mmlu.yaml
index 86a07220237..26ff0c23d80 100644
--- a/tests/integration/defs/accuracy/references/mmlu.yaml
+++ b/tests/integration/defs/accuracy/references/mmlu.yaml
@@ -20,9 +20,9 @@ meta-llama/Llama-3.1-8B:
   accuracy: 64.99
 meta-llama/Llama-3.1-8B-Instruct:
   - accuracy: 68.17
-  - spec_dec_algo: EAGLE3
+  - spec_dec_algo: Eagle
     accuracy: 68.20
-  - spec_dec_algo: NGRAM
+  - spec_dec_algo: NGram
     accuracy: 68.17
   - quant_algo: FP8
     accuracy: 67.93
diff --git a/tests/integration/defs/accuracy/test_llm_api.py b/tests/integration/defs/accuracy/test_llm_api.py
index 6033eae3b6a..f34bcdb5be4 100644
--- a/tests/integration/defs/accuracy/test_llm_api.py
+++ b/tests/integration/defs/accuracy/test_llm_api.py
@@ -137,7 +137,8 @@ def test_fp8_pp2(self):
         with LLM(self.MODEL_PATH,
                  pipeline_parallel_size=2,
                  quant_config=quant_config,
-                 kv_cache_config=kv_cache_config) as llm:
+                 kv_cache_config=kv_cache_config,
+                 max_batch_size=64) as llm:
             task = CnnDailymail(self.MODEL_NAME)
             task.evaluate(llm)

diff --git a/tests/integration/defs/accuracy/test_llm_api_pytorch.py b/tests/integration/defs/accuracy/test_llm_api_pytorch.py
index 6fd9ed09677..11a2ff3236a 100644
--- a/tests/integration/defs/accuracy/test_llm_api_pytorch.py
+++ b/tests/integration/defs/accuracy/test_llm_api_pytorch.py
@@ -204,6 +204,7 @@ def test_fp8_llm_sampler(self):
             sampling_params=sampling_params,
             extra_acc_spec="temperature=0.8,top_p=0.95")

+    @skip_pre_hopper
     def test_fp8_beam_search(self):
         model_path = f"{llm_models_root()}/llama-3.1-model/Llama-3.1-8B-Instruct-FP8"
         pytorch_config = dict(disable_overlap_scheduler=True)
@@ -228,6 +229,7 @@ def test_fp8_beam_search(self):
             sampling_params=sampling_params,
             extra_acc_spec="beam_width=4")

+    @skip_pre_hopper
     def test_eagle3(self):
         pytorch_config = dict(
             disable_overlap_scheduler=True,
@@ -250,15 +252,18 @@ def test_eagle3(self):
         task = MMLU(self.MODEL_NAME)
         task.evaluate(llm)

+    @skip_pre_hopper
     def test_ngram(self):
-        pytorch_config = dict(disable_overlap_scheduler=True)
+        pytorch_config = dict(
+            disable_overlap_scheduler=True,
+            cuda_graph_config=CudaGraphConfig(batch_sizes=[1]),
+        )

         kv_cache_config = KvCacheConfig(enable_block_reuse=False)

-        draft_len = 4
         spec_config = NGramDecodingConfig(
-            max_draft_len=draft_len,
-            max_matching_ngram_size=draft_len,
+            max_draft_len=4,
+            max_matching_ngram_size=2,
             is_keep_all=True,
             is_use_oldest=True,
             is_public_pool=True,
@@ -267,7 +272,8 @@ def test_ngram(self):
         with LLM(model=self.MODEL_PATH,
                  **pytorch_config,
                  kv_cache_config=kv_cache_config,
-                 speculative_config=spec_config) as llm:
+                 speculative_config=spec_config,
+                 max_batch_size=16) as llm:
             task = MMLU(self.MODEL_NAME)
             task.evaluate(llm)
             task = GSM8K(self.MODEL_NAME)
             task.evaluate(llm)
@@ -307,7 +313,7 @@ def test_auto_dtype(self):
         task = CnnDailymail(self.MODEL_NAME)
         task.evaluate(llm)

-    @skip_pre_hopper
+    @skip_pre_ada
     def test_fp8_prequantized(self):
         model_path = f"{llm_models_root()}/llama-3.2-models/Llama-3.2-1B-FP8"
         with LLM(model_path) as llm:
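
For reference, below is a standalone sketch of the NGram speculative-decoding setup that the updated test_ngram exercises, outside the test harness. It is illustrative only: the model path is a placeholder, and the import locations assume CudaGraphConfig, KvCacheConfig, and NGramDecodingConfig are exported from tensorrt_llm.llmapi, as the test file's usage suggests.

# Standalone sketch (not part of the patch): NGram speculative decoding
# with the settings test_ngram now uses. Model path and import locations
# are assumptions, not confirmed by the diff.
from tensorrt_llm import LLM, SamplingParams
from tensorrt_llm.llmapi import (CudaGraphConfig, KvCacheConfig,
                                 NGramDecodingConfig)

spec_config = NGramDecodingConfig(
    max_draft_len=4,            # propose up to 4 draft tokens per step
    max_matching_ngram_size=2,  # match on at most the trailing 2-gram
    is_keep_all=True,
    is_use_oldest=True,
    is_public_pool=True,
)

with LLM(model="/path/to/Llama-3.1-8B-Instruct",  # placeholder path
         disable_overlap_scheduler=True,
         cuda_graph_config=CudaGraphConfig(batch_sizes=[1]),
         kv_cache_config=KvCacheConfig(enable_block_reuse=False),
         speculative_config=spec_config,
         max_batch_size=16) as llm:
    out = llm.generate("The capital of France is",
                       SamplingParams(max_tokens=8))
    print(out.outputs[0].text)

Decoupling max_matching_ngram_size (2) from max_draft_len (4), as the patch does, presumably lets the shorter matching window fire more often while still proposing multi-token drafts; disabling KV-cache block reuse and the overlap scheduler mirrors the test's configuration.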