Commit 9e50514

add TestNemotronH_47B and TestNemotronH_56B
Signed-off-by: Xin He (SW-GPU) <[email protected]>
1 parent c589ece · commit 9e50514

File tree

2 files changed: +74 -2 lines changed

tests/integration/defs/accuracy/test_llm_api_pytorch.py

Lines changed: 71 additions & 2 deletions
@@ -1588,8 +1588,6 @@ class TestNemotronH(LlmapiAccuracyTestHarness):
 
     @parametrize_with_ids("cuda_graph", [False, True])
     def test_auto_dtype(self, cuda_graph):
-        # TODO: remove max_batch_size after mamba cache manager is supported
-        # ToDo: check 47b and 56b model
        kv_cache_config = KvCacheConfig(enable_block_reuse=False)
        with LLM(self.MODEL_PATH,
                 kv_cache_config=kv_cache_config,
@@ -1615,6 +1613,77 @@ def test_reasoning_fp8_prequantized(self, cuda_graph):
             task.evaluate(llm)
 
 
+@pytest.mark.skip_less_device(8)
+@pytest.mark.skip_less_device_memory(80000)
+class TestNemotronH_47B_Base(LlmapiAccuracyTestHarness):
+    MODEL_NAME = "nvidia/Nemotron-H-47B-Base-8K"
+    MODEL_PATH = f"{llm_models_root()}/Nemotron-H-47B-Base-8K"
+
+    @parametrize_with_ids("cuda_graph", [False, True])
+    @pytest.mark.parametrize("tp_size,pp_size,ep_size", [(8, 1, 1), (8, 1, 4),
+                                                         (8, 1, 8)],
+                             ids=["tp8", "tp8ep4", "tp8ep8"])
+    def test_auto_dtype(self, cuda_graph, tp_size, pp_size, ep_size):
+        kv_cache_config = KvCacheConfig(enable_block_reuse=False)
+        with LLM(self.MODEL_PATH,
+                 tensor_parallel_size=tp_size,
+                 pipeline_parallel_size=pp_size,
+                 moe_expert_parallel_size=ep_size,
+                 kv_cache_config=kv_cache_config,
+                 cuda_graph_config=CudaGraphConfig()
+                 if cuda_graph else None) as llm:
+            task = MMLU(self.MODEL_NAME)
+            task.evaluate(llm)
+            task = GSM8K(self.MODEL_NAME)
+            task.evaluate(llm)
+
+    @skip_pre_hopper
+    @parametrize_with_ids("cuda_graph", [False, True])
+    @pytest.mark.parametrize("tp_size,pp_size,ep_size", [(8, 1, 1), (8, 1, 4),
+                                                         (8, 1, 8)],
+                             ids=["tp8", "tp8ep4", "tp8ep8"])
+    def test_reasoning_fp8_prequantized(self, cuda_graph, tp_size, pp_size,
+                                        ep_size):
+        kv_cache_config = KvCacheConfig(enable_block_reuse=False)
+        with LLM(f"{llm_models_root()}/Nemotron-H-47B-Reasoning-128K-FP8",
+                 kv_cache_config=kv_cache_config,
+                 tensor_parallel_size=tp_size,
+                 pipeline_parallel_size=pp_size,
+                 moe_expert_parallel_size=ep_size,
+                 cuda_graph_config=CudaGraphConfig()
+                 if cuda_graph else None) as llm:
+            assert llm.args.quant_config.quant_algo == QuantAlgo.FP8
+            task = MMLU(self.MODEL_NAME)
+            task.evaluate(llm)
+            task = GSM8K(self.MODEL_NAME)
+            task.evaluate(llm)
+
+
+@pytest.mark.skip_less_device(8)
+@pytest.mark.skip_less_device_memory(80000)
+class TestNemotronH_56B_Base(LlmapiAccuracyTestHarness):
+    MODEL_NAME = "nvidia/Nemotron-H-56B-Base-8K"
+    MODEL_PATH = f"{llm_models_root()}/Nemotron-H-56B-Base-8K"
+
+    @parametrize_with_ids("cuda_graph", [False, True])
+    @pytest.mark.parametrize("tp_size,pp_size,ep_size", [(8, 1, 1), (8, 1, 4),
+                                                         (8, 1, 8)],
+                             ids=["tp8", "tp8ep4", "tp8ep8"])
+    def test_auto_dtype(self, cuda_graph, tp_size, pp_size, ep_size):
+        kv_cache_config = KvCacheConfig(enable_block_reuse=False)
+        with LLM(self.MODEL_PATH,
+                 tensor_parallel_size=tp_size,
+                 pipeline_parallel_size=pp_size,
+                 moe_expert_parallel_size=ep_size,
+                 kv_cache_config=kv_cache_config,
+                 cuda_graph_config=CudaGraphConfig()
+                 if cuda_graph else None) as llm:
+            task = MMLU(self.MODEL_NAME)
+            task.evaluate(llm)
+            task = GSM8K(self.MODEL_NAME)
+            task.evaluate(llm)
+
+
 class TestQwen2_7BInstruct(LlmapiAccuracyTestHarness):
     MODEL_NAME = "Qwen/Qwen2-7B-Instruct"
     MODEL_PATH = f"{llm_models_root()}/Qwen2-7B-Instruct"
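For cross-reference with the QA list below: the bracketed selectors such as test_auto_dtype[tp8ep4-cuda_graph=True] are generated by the stacked parametrize decorators above. A minimal, self-contained sketch (a toy test, not the real harness; the repo's parametrize_with_ids helper is approximated here with a plain ids= callable):

import pytest


# Hypothetical toy test (not from the repo) showing how stacked
# parametrizations produce node IDs like
# "test_auto_dtype[tp8ep4-cuda_graph=True]".  The decorator closest to
# the function expands first, so its ID ("tp8", "tp8ep4", "tp8ep8")
# appears first inside the brackets.
@pytest.mark.parametrize("cuda_graph", [False, True],
                         ids=lambda v: f"cuda_graph={v}")
@pytest.mark.parametrize("tp_size,pp_size,ep_size",
                         [(8, 1, 1), (8, 1, 4), (8, 1, 8)],
                         ids=["tp8", "tp8ep4", "tp8ep8"])
def test_auto_dtype(cuda_graph, tp_size, pp_size, ep_size):
    assert tp_size == 8  # placeholder body; the real tests build an LLM

Collecting this file with pytest --collect-only -q should list six IDs, among them test_auto_dtype[tp8ep4-cuda_graph=True], which is the form the selectors in benchmark_test_list.txt use.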

tests/integration/test_lists/qa/benchmark_test_list.txt

Lines changed: 3 additions & 0 deletions
@@ -15,6 +15,9 @@ accuracy/test_llm_api_pytorch.py::TestLlama3_1NemotronNano8Bv1::test_fp8_prequan
 accuracy/test_llm_api_pytorch.py::TestNemotronH::test_auto_dtype[cuda_graph=True]
 accuracy/test_llm_api_pytorch.py::TestNemotronH::test_auto_dtype[cuda_graph=False]
 accuracy/test_llm_api_pytorch.py::TestNemotronH::test_reasoning_fp8_prequantized[cuda_graph=True]
+accuracy/test_llm_api_pytorch.py::TestNemotronH_47B_Base::test_auto_dtype[tp8ep4-cuda_graph=True]
+accuracy/test_llm_api_pytorch.py::TestNemotronH_47B_Base::test_reasoning_fp8_prequantized[tp8ep8-cuda_graph=True]
+accuracy/test_llm_api_pytorch.py::TestNemotronH_56B_Base::test_auto_dtype[tp8-cuda_graph=True]
 accuracy/test_llm_api_pytorch.py::TestNemotronUltra::test_auto_dtype[tp8ep4-cuda_graph=True]
 accuracy/test_llm_api_pytorch.py::TestNemotronUltra::test_fp8_prequantized[tp8ep4-cuda_graph=True]
 accuracy/test_llm_api_pytorch.py::TestNemotronUltra::test_fp8_prequantized[tp8-cuda_graph=True]
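A hedged usage sketch (hypothetical driver, not part of the repo): judging by the file paths in the diff above, each entry in benchmark_test_list.txt is a pytest node ID relative to tests/integration/defs, so a runner consuming the list can hand an entry straight to pytest.main:

import sys

import pytest

# Hypothetical driver (not from the repo): run one entry from
# benchmark_test_list.txt by passing its node ID to pytest.
ENTRY = ("accuracy/test_llm_api_pytorch.py"
         "::TestNemotronH_56B_Base::test_auto_dtype[tp8-cuda_graph=True]")

if __name__ == "__main__":
    # Assumes the current working directory is tests/integration/defs.
    sys.exit(pytest.main(["-q", ENTRY]))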
