10 changes: 10 additions & 0 deletions tests/integration/defs/accuracy/references/gsm8k.yaml
@@ -96,6 +96,16 @@ nvidia/Nemotron-H-8B-Base-8K:
- quant_algo: FP8
kv_cache_quant_algo: FP8
accuracy: 85.78
nvidia/Nemotron-H-47B-Base-8K:
- accuracy: 88.82
- quant_algo: FP8
kv_cache_quant_algo: FP8
accuracy: 88.55
nvidia/Nemotron-H-56B-Base-8K:
- accuracy: 89.27
- quant_algo: FP8
kv_cache_quant_algo: FP8
accuracy: 89.27
nvidia/Llama-3.1-Nemotron-Nano-8B-v1:
- accuracy: 37.15
- quant_algo: FP8
10 changes: 10 additions & 0 deletions tests/integration/defs/accuracy/references/mmlu.yaml
@@ -185,6 +185,16 @@ nvidia/Nemotron-H-8B-Base-8K:
- quant_algo: FP8
kv_cache_quant_algo: FP8
accuracy: 69.180
nvidia/Nemotron-H-47B-Base-8K:
- accuracy: 83.26
- quant_algo: FP8
kv_cache_quant_algo: FP8
accuracy: 82.68
nvidia/Nemotron-H-56B-Base-8K:
- accuracy: 83.82
- quant_algo: FP8
kv_cache_quant_algo: FP8
accuracy: 83.82
microsoft/Phi-4-mini-instruct:
- accuracy: 68.98
# Created a dummy accuracy to track tp_size=2 for phi4-mini model.
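
As a side note on how these reference YAMLs are laid out: each model name maps to a list of entries, where a bare accuracy value is the unquantized baseline and additional entries are keyed by quant_algo / kv_cache_quant_algo. The lookup below is only a minimal sketch of that structure (an assumed helper, not the accuracy harness's actual code); the name reference_accuracy is hypothetical.

import yaml  # PyYAML

def reference_accuracy(path, model, quant_algo=None):
    # Each model maps to a list of entries; pick the one whose quant_algo
    # matches. Entries without a quant_algo key are the unquantized baseline.
    with open(path) as f:
        entries = yaml.safe_load(f)[model]
    for entry in entries:
        if entry.get("quant_algo") == quant_algo:
            return entry["accuracy"]
    raise KeyError(f"no reference for {model} with quant_algo={quant_algo}")

# With the entries added above, reference_accuracy("mmlu.yaml",
# "nvidia/Nemotron-H-47B-Base-8K", "FP8") would return 82.68.
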
93 changes: 88 additions & 5 deletions tests/integration/defs/accuracy/test_llm_api_pytorch.py
@@ -1602,31 +1602,114 @@ class TestNemotronH(LlmapiAccuracyTestHarness):
MODEL_NAME = "nvidia/Nemotron-H-8B-Base-8K"
MODEL_PATH = f"{llm_models_root()}/Nemotron-H-8B-Base-8K"

def test_auto_dtype(self):
@parametrize_with_ids("cuda_graph", [False, True])
def test_auto_dtype(self, cuda_graph):
# TODO: remove max_batch_size once the mamba cache manager is supported;
# removing it now makes the test OOM.
# TODO: check the 47B and 56B models.
kv_cache_config = KvCacheConfig(enable_block_reuse=False)
with LLM(self.MODEL_PATH,
kv_cache_config=kv_cache_config,
max_batch_size=128) as llm:
max_batch_size=128,
cuda_graph_config=CudaGraphConfig()
if cuda_graph else None) as llm:
task = MMLU(self.MODEL_NAME)
task.evaluate(llm)
task = GSM8K(self.MODEL_NAME)
task.evaluate(llm)

@skip_pre_ada
def test_reasoning_fp8_prequantized(self):
@parametrize_with_ids("cuda_graph", [False, True])
def test_reasoning_fp8_prequantized(self, cuda_graph):
kv_cache_config = KvCacheConfig(enable_block_reuse=False)
with LLM(f"{llm_models_root()}/Nemotron-H-8B-Reasoning-128K-FP8",
kv_cache_config=kv_cache_config,
max_batch_size=256) as llm:
max_batch_size=256,
cuda_graph_config=CudaGraphConfig()
if cuda_graph else None) as llm:
assert llm.args.quant_config.quant_algo == QuantAlgo.FP8
task = MMLU(self.MODEL_NAME)
task.evaluate(llm)
task = GSM8K(self.MODEL_NAME)
task.evaluate(llm)


@pytest.mark.skip_less_device(8)
@pytest.mark.skip_less_device_memory(80000)
class TestNemotronH_47B_Base(LlmapiAccuracyTestHarness):
MODEL_NAME = "nvidia/Nemotron-H-47B-Base-8K"
MODEL_PATH = f"{llm_models_root()}/Nemotron-H-47B-Base-8K"

@parametrize_with_ids("cuda_graph", [False, True])
@pytest.mark.parametrize("tp_size,pp_size,ep_size", [(8, 1, 1), (8, 1, 4),
(8, 1, 8)],
ids=["tp8", "tp8ep4", "tp8ep8"])
def test_auto_dtype(self, cuda_graph, tp_size, pp_size, ep_size):
kv_cache_config = KvCacheConfig(enable_block_reuse=False,
free_gpu_memory_fraction=0.6)
with LLM(self.MODEL_PATH,
tensor_parallel_size=tp_size,
pipeline_parallel_size=pp_size,
moe_expert_parallel_size=ep_size,
kv_cache_config=kv_cache_config,
max_batch_size=256,
cuda_graph_config=CudaGraphConfig()
if cuda_graph else None) as llm:
task = MMLU(self.MODEL_NAME)
task.evaluate(llm)
task = GSM8K(self.MODEL_NAME)
task.evaluate(llm)

@skip_pre_ada
@parametrize_with_ids("cuda_graph", [False, True])
@pytest.mark.parametrize("tp_size,pp_size,ep_size", [(8, 1, 1), (8, 1, 4),
(8, 1, 8)],
ids=["tp8", "tp8ep4", "tp8ep8"])
def test_reasoning_fp8_prequantized(self, cuda_graph, tp_size, pp_size,
ep_size):
kv_cache_config = KvCacheConfig(enable_block_reuse=False,
free_gpu_memory_fraction=0.6)
with LLM(f"{llm_models_root()}/Nemotron-H-47B-Reasoning-128K-FP8",
kv_cache_config=kv_cache_config,
tensor_parallel_size=tp_size,
pipeline_parallel_size=pp_size,
moe_expert_parallel_size=ep_size,
max_batch_size=256,
cuda_graph_config=CudaGraphConfig()
if cuda_graph else None) as llm:
assert llm.args.quant_config.quant_algo == QuantAlgo.FP8
task = MMLU(self.MODEL_NAME)
task.evaluate(llm)
task = GSM8K(self.MODEL_NAME)
task.evaluate(llm)


@pytest.mark.skip_less_device(8)
@pytest.mark.skip_less_device_memory(80000)
class TestNemotronH_56B_Base(LlmapiAccuracyTestHarness):
MODEL_NAME = "nvidia/Nemotron-H-56B-Base-8K"
MODEL_PATH = f"{llm_models_root()}/Nemotron-H-56B-Base-8K"

@parametrize_with_ids("cuda_graph", [False, True])
@pytest.mark.parametrize("tp_size,pp_size,ep_size", [(8, 1, 1), (8, 1, 4),
(8, 1, 8)],
ids=["tp8", "tp8ep4", "tp8ep8"])
def test_auto_dtype(self, cuda_graph, tp_size, pp_size, ep_size):
kv_cache_config = KvCacheConfig(enable_block_reuse=False,
free_gpu_memory_fraction=0.6)
with LLM(self.MODEL_PATH,
tensor_parallel_size=tp_size,
pipeline_parallel_size=pp_size,
moe_expert_parallel_size=ep_size,
kv_cache_config=kv_cache_config,
max_batch_size=256,
cuda_graph_config=CudaGraphConfig()
if cuda_graph else None) as llm:
task = MMLU(self.MODEL_NAME)
task.evaluate(llm)
task = GSM8K(self.MODEL_NAME)
task.evaluate(llm)


class TestQwen2_7BInstruct(LlmapiAccuracyTestHarness):
MODEL_NAME = "Qwen/Qwen2-7B-Instruct"
MODEL_PATH = f"{llm_models_root()}/Qwen2-7B-Instruct"
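
The tests above rely on parametrize_with_ids to render each parameter as name=value in the pytest node ID; combined with the plain pytest.mark.parametrize over (tp_size, pp_size, ep_size), this yields IDs such as tp8ep4-cuda_graph=True. A minimal sketch of that ID convention follows (a hypothetical reimplementation and a dummy TestSketch class for illustration, not the repository's actual helper or tests):

import pytest

def parametrize_with_ids(name, values):
    # Render each parametrized value as "name=value" in the test ID.
    return pytest.mark.parametrize(name, values,
                                   ids=[f"{name}={v}" for v in values])

class TestSketch:

    @parametrize_with_ids("cuda_graph", [False, True])
    @pytest.mark.parametrize("tp_size,pp_size,ep_size", [(8, 1, 4)],
                             ids=["tp8ep4"])
    def test_auto_dtype(self, cuda_graph, tp_size, pp_size, ep_size):
        # Collected as test_auto_dtype[tp8ep4-cuda_graph=False] and
        # test_auto_dtype[tp8ep4-cuda_graph=True], matching the QA list below.
        assert (tp_size, pp_size, ep_size) == (8, 1, 4)
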
8 changes: 6 additions & 2 deletions tests/integration/test_lists/qa/benchmark_test_list.txt
@@ -12,8 +12,12 @@ accuracy/test_cli_flow.py::TestNemotronUltra::test_fp8_prequantized[tp8-cuda_gra
accuracy/test_llm_api_pytorch.py::TestLlama3_3NemotronSuper49Bv1::test_fp8_prequantized_tp2
accuracy/test_llm_api_pytorch.py::TestLlama3_1NemotronNano8Bv1::test_auto_dtype
accuracy/test_llm_api_pytorch.py::TestLlama3_1NemotronNano8Bv1::test_fp8_prequantized
accuracy/test_llm_api_pytorch.py::TestNemotronH::test_auto_dtype
accuracy/test_llm_api_pytorch.py::TestNemotronH::test_reasoning_fp8_prequantized
accuracy/test_llm_api_pytorch.py::TestNemotronH::test_auto_dtype[cuda_graph=True]
accuracy/test_llm_api_pytorch.py::TestNemotronH::test_auto_dtype[cuda_graph=False]
accuracy/test_llm_api_pytorch.py::TestNemotronH::test_reasoning_fp8_prequantized[cuda_graph=True]
accuracy/test_llm_api_pytorch.py::TestNemotronH_47B_Base::test_auto_dtype[tp8ep4-cuda_graph=True]
accuracy/test_llm_api_pytorch.py::TestNemotronH_47B_Base::test_reasoning_fp8_prequantized[tp8ep8-cuda_graph=True]
accuracy/test_llm_api_pytorch.py::TestNemotronH_56B_Base::test_auto_dtype[tp8-cuda_graph=True]
accuracy/test_llm_api_pytorch.py::TestNemotronUltra::test_auto_dtype[tp8ep4-cuda_graph=True]
accuracy/test_llm_api_pytorch.py::TestNemotronUltra::test_fp8_prequantized[tp8ep4-cuda_graph=True]
accuracy/test_llm_api_pytorch.py::TestNemotronUltra::test_fp8_prequantized[tp8-cuda_graph=True]
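
Each entry above is a plain pytest node ID, so a single configuration can be run on its own; a minimal sketch, assuming the usual working directory for these defs (e.g. tests/integration/defs):

import pytest

# Run one of the newly listed Nemotron-H configurations by node ID.
pytest.main([
    "accuracy/test_llm_api_pytorch.py::TestNemotronH_47B_Base::"
    "test_auto_dtype[tp8ep4-cuda_graph=True]",
])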