diff --git a/tests/integration/defs/accuracy/references/gsm8k.yaml b/tests/integration/defs/accuracy/references/gsm8k.yaml
index dbbd6eb79f4..a0086fc2a47 100644
--- a/tests/integration/defs/accuracy/references/gsm8k.yaml
+++ b/tests/integration/defs/accuracy/references/gsm8k.yaml
@@ -96,6 +96,16 @@ nvidia/Nemotron-H-8B-Base-8K:
   - quant_algo: FP8
     kv_cache_quant_algo: FP8
     accuracy: 85.78
+nvidia/Nemotron-H-47B-Base-8K:
+  - accuracy: 88.82
+  - quant_algo: FP8
+    kv_cache_quant_algo: FP8
+    accuracy: 88.55
+nvidia/Nemotron-H-56B-Base-8K:
+  - accuracy: 89.27
+  - quant_algo: FP8
+    kv_cache_quant_algo: FP8
+    accuracy: 89.27
 nvidia/Llama-3.1-Nemotron-Nano-8B-v1:
   - accuracy: 37.15
   - quant_algo: FP8
diff --git a/tests/integration/defs/accuracy/references/mmlu.yaml b/tests/integration/defs/accuracy/references/mmlu.yaml
index d86ebb0ce39..4e91d222bf0 100644
--- a/tests/integration/defs/accuracy/references/mmlu.yaml
+++ b/tests/integration/defs/accuracy/references/mmlu.yaml
@@ -185,6 +185,16 @@ nvidia/Nemotron-H-8B-Base-8K:
   - quant_algo: FP8
     kv_cache_quant_algo: FP8
     accuracy: 69.180
+nvidia/Nemotron-H-47B-Base-8K:
+  - accuracy: 83.26
+  - quant_algo: FP8
+    kv_cache_quant_algo: FP8
+    accuracy: 82.68
+nvidia/Nemotron-H-56B-Base-8K:
+  - accuracy: 83.82
+  - quant_algo: FP8
+    kv_cache_quant_algo: FP8
+    accuracy: 83.82
 microsoft/Phi-4-mini-instruct:
   - accuracy: 68.98
   # Created a dummy accuracy to track tp_size=2 for phi4-mini model.
diff --git a/tests/integration/defs/accuracy/test_llm_api_pytorch.py b/tests/integration/defs/accuracy/test_llm_api_pytorch.py
index ce1e1cc1367..0dae6f7e97a 100644
--- a/tests/integration/defs/accuracy/test_llm_api_pytorch.py
+++ b/tests/integration/defs/accuracy/test_llm_api_pytorch.py
@@ -1602,24 +1602,30 @@ class TestNemotronH(LlmapiAccuracyTestHarness):
     MODEL_NAME = "nvidia/Nemotron-H-8B-Base-8K"
     MODEL_PATH = f"{llm_models_root()}/Nemotron-H-8B-Base-8K"
 
-    def test_auto_dtype(self):
+    @parametrize_with_ids("cuda_graph", [False, True])
+    def test_auto_dtype(self, cuda_graph):
         # TODO: remove max_batch_size after mamba cache manager is supported
-        # ToDo: check 47b and 56b model
+        # If max_batch_size is removed, the test will OOM
         kv_cache_config = KvCacheConfig(enable_block_reuse=False)
         with LLM(self.MODEL_PATH,
                  kv_cache_config=kv_cache_config,
-                 max_batch_size=128) as llm:
+                 max_batch_size=128,
+                 cuda_graph_config=CudaGraphConfig()
+                 if cuda_graph else None) as llm:
             task = MMLU(self.MODEL_NAME)
             task.evaluate(llm)
             task = GSM8K(self.MODEL_NAME)
             task.evaluate(llm)
 
     @skip_pre_ada
-    def test_reasoning_fp8_prequantized(self):
+    @parametrize_with_ids("cuda_graph", [False, True])
+    def test_reasoning_fp8_prequantized(self, cuda_graph):
         kv_cache_config = KvCacheConfig(enable_block_reuse=False)
         with LLM(f"{llm_models_root()}/Nemotron-H-8B-Reasoning-128K-FP8",
                  kv_cache_config=kv_cache_config,
-                 max_batch_size=256) as llm:
+                 max_batch_size=256,
+                 cuda_graph_config=CudaGraphConfig()
+                 if cuda_graph else None) as llm:
             assert llm.args.quant_config.quant_algo == QuantAlgo.FP8
             task = MMLU(self.MODEL_NAME)
             task.evaluate(llm)
@@ -1627,6 +1633,83 @@ def test_reasoning_fp8_prequantized(self):
             task.evaluate(llm)
 
 
+@pytest.mark.skip_less_device(8)
+@pytest.mark.skip_less_device_memory(80000)
+class TestNemotronH_47B_Base(LlmapiAccuracyTestHarness):
+    MODEL_NAME = "nvidia/Nemotron-H-47B-Base-8K"
+    MODEL_PATH = f"{llm_models_root()}/Nemotron-H-47B-Base-8K"
+
+    @parametrize_with_ids("cuda_graph", [False, True])
+    @pytest.mark.parametrize("tp_size,pp_size,ep_size", [(8, 1, 1), (8, 1, 4),
+                                                         (8, 1, 8)],
+                             ids=["tp8", "tp8ep4", "tp8ep8"])
+    def test_auto_dtype(self, cuda_graph, tp_size, pp_size, ep_size):
+        kv_cache_config = KvCacheConfig(enable_block_reuse=False,
+                                        free_gpu_memory_fraction=0.6)
+        with LLM(self.MODEL_PATH,
+                 tensor_parallel_size=tp_size,
+                 pipeline_parallel_size=pp_size,
+                 moe_expert_parallel_size=ep_size,
+                 kv_cache_config=kv_cache_config,
+                 max_batch_size=256,
+                 cuda_graph_config=CudaGraphConfig()
+                 if cuda_graph else None) as llm:
+            task = MMLU(self.MODEL_NAME)
+            task.evaluate(llm)
+            task = GSM8K(self.MODEL_NAME)
+            task.evaluate(llm)
+
+    @skip_pre_ada
+    @parametrize_with_ids("cuda_graph", [False, True])
+    @pytest.mark.parametrize("tp_size,pp_size,ep_size", [(8, 1, 1), (8, 1, 4),
+                                                         (8, 1, 8)],
+                             ids=["tp8", "tp8ep4", "tp8ep8"])
+    def test_reasoning_fp8_prequantized(self, cuda_graph, tp_size, pp_size,
+                                        ep_size):
+        kv_cache_config = KvCacheConfig(enable_block_reuse=False,
+                                        free_gpu_memory_fraction=0.6)
+        with LLM(f"{llm_models_root()}/Nemotron-H-47B-Reasoning-128K-FP8",
+                 kv_cache_config=kv_cache_config,
+                 tensor_parallel_size=tp_size,
+                 pipeline_parallel_size=pp_size,
+                 moe_expert_parallel_size=ep_size,
+                 max_batch_size=256,
+                 cuda_graph_config=CudaGraphConfig()
+                 if cuda_graph else None) as llm:
+            assert llm.args.quant_config.quant_algo == QuantAlgo.FP8
+            task = MMLU(self.MODEL_NAME)
+            task.evaluate(llm)
+            task = GSM8K(self.MODEL_NAME)
+            task.evaluate(llm)
+
+
+@pytest.mark.skip_less_device(8)
+@pytest.mark.skip_less_device_memory(80000)
+class TestNemotronH_56B_Base(LlmapiAccuracyTestHarness):
+    MODEL_NAME = "nvidia/Nemotron-H-56B-Base-8K"
+    MODEL_PATH = f"{llm_models_root()}/Nemotron-H-56B-Base-8K"
+
+    @parametrize_with_ids("cuda_graph", [False, True])
+    @pytest.mark.parametrize("tp_size,pp_size,ep_size", [(8, 1, 1), (8, 1, 4),
+                                                         (8, 1, 8)],
+                             ids=["tp8", "tp8ep4", "tp8ep8"])
+    def test_auto_dtype(self, cuda_graph, tp_size, pp_size, ep_size):
+        kv_cache_config = KvCacheConfig(enable_block_reuse=False,
+                                        free_gpu_memory_fraction=0.6)
+        with LLM(self.MODEL_PATH,
+                 tensor_parallel_size=tp_size,
+                 pipeline_parallel_size=pp_size,
+                 moe_expert_parallel_size=ep_size,
+                 kv_cache_config=kv_cache_config,
+                 max_batch_size=256,
+                 cuda_graph_config=CudaGraphConfig()
+                 if cuda_graph else None) as llm:
+            task = MMLU(self.MODEL_NAME)
+            task.evaluate(llm)
+            task = GSM8K(self.MODEL_NAME)
+            task.evaluate(llm)
+
+
 class TestQwen2_7BInstruct(LlmapiAccuracyTestHarness):
     MODEL_NAME = "Qwen/Qwen2-7B-Instruct"
     MODEL_PATH = f"{llm_models_root()}/Qwen2-7B-Instruct"
diff --git a/tests/integration/test_lists/qa/benchmark_test_list.txt b/tests/integration/test_lists/qa/benchmark_test_list.txt
index 45f223a326e..4fe1fd3ab00 100644
--- a/tests/integration/test_lists/qa/benchmark_test_list.txt
+++ b/tests/integration/test_lists/qa/benchmark_test_list.txt
@@ -12,8 +12,12 @@ accuracy/test_cli_flow.py::TestNemotronUltra::test_fp8_prequantized[tp8-cuda_gra
 accuracy/test_llm_api_pytorch.py::TestLlama3_3NemotronSuper49Bv1::test_fp8_prequantized_tp2
 accuracy/test_llm_api_pytorch.py::TestLlama3_1NemotronNano8Bv1::test_auto_dtype
 accuracy/test_llm_api_pytorch.py::TestLlama3_1NemotronNano8Bv1::test_fp8_prequantized
-accuracy/test_llm_api_pytorch.py::TestNemotronH::test_auto_dtype
-accuracy/test_llm_api_pytorch.py::TestNemotronH::test_reasoning_fp8_prequantized
+accuracy/test_llm_api_pytorch.py::TestNemotronH::test_auto_dtype[cuda_graph=True]
+accuracy/test_llm_api_pytorch.py::TestNemotronH::test_auto_dtype[cuda_graph=False]
+accuracy/test_llm_api_pytorch.py::TestNemotronH::test_reasoning_fp8_prequantized[cuda_graph=True]
+accuracy/test_llm_api_pytorch.py::TestNemotronH_47B_Base::test_auto_dtype[tp8ep4-cuda_graph=True]
+accuracy/test_llm_api_pytorch.py::TestNemotronH_47B_Base::test_reasoning_fp8_prequantized[tp8ep8-cuda_graph=True]
+accuracy/test_llm_api_pytorch.py::TestNemotronH_56B_Base::test_auto_dtype[tp8-cuda_graph=True]
 accuracy/test_llm_api_pytorch.py::TestNemotronUltra::test_auto_dtype[tp8ep4-cuda_graph=True]
 accuracy/test_llm_api_pytorch.py::TestNemotronUltra::test_fp8_prequantized[tp8ep4-cuda_graph=True]
 accuracy/test_llm_api_pytorch.py::TestNemotronUltra::test_fp8_prequantized[tp8-cuda_graph=True]
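Note (illustrative sketch, not part of the patch): the benchmark_test_list.txt entries above reference pytest IDs such as TestNemotronH_47B_Base::test_auto_dtype[tp8ep4-cuda_graph=True]. Those IDs come from the two stacked parametrize decorators on the new tests. The snippet below reproduces the ID shape with plain pytest.mark.parametrize; it assumes the repo's parametrize_with_ids helper emits name=value style IDs, and test_id_shape is a placeholder name.

import pytest

# Decorator order matches the new tests: the bottom parametrize is applied
# first, so its id component ("tp8", "tp8ep4", "tp8ep8") comes first and the
# cuda_graph component is appended, e.g. test_id_shape[tp8ep4-cuda_graph=True].
@pytest.mark.parametrize("cuda_graph", [False, True],
                         ids=lambda v: f"cuda_graph={v}")
@pytest.mark.parametrize("tp_size,pp_size,ep_size",
                         [(8, 1, 1), (8, 1, 4), (8, 1, 8)],
                         ids=["tp8", "tp8ep4", "tp8ep8"])
def test_id_shape(cuda_graph, tp_size, pp_size, ep_size):
    # Placeholder body; the real tests construct an LLM and run MMLU/GSM8K.
    # Only a subset of the generated IDs is listed in the QA test list.
    assert pp_size == 1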