@@ -1588,8 +1588,6 @@ class TestNemotronH(LlmapiAccuracyTestHarness):
 
     @parametrize_with_ids("cuda_graph", [False, True])
     def test_auto_dtype(self, cuda_graph):
-        # TODO: remove max_batch_size after mamba cache manager is supported
-        # ToDo: check 47b and 56b model
         kv_cache_config = KvCacheConfig(enable_block_reuse=False)
         with LLM(self.MODEL_PATH,
                  kv_cache_config=kv_cache_config,
@@ -1615,6 +1613,77 @@ def test_reasoning_fp8_prequantized(self, cuda_graph):
             task.evaluate(llm)
 
 
+@pytest.mark.skip_less_device(8)
+@pytest.mark.skip_less_device_memory(80000)
+class TestNemotronH_47B_Base(LlmapiAccuracyTestHarness):
+    MODEL_NAME = "nvidia/Nemotron-H-47B-Base-8K"
+    MODEL_PATH = f"{llm_models_root()}/Nemotron-H-47B-Base-8K"
+
+    @parametrize_with_ids("cuda_graph", [False, True])
+    @pytest.mark.parametrize("tp_size,pp_size,ep_size", [(8, 1, 1), (8, 1, 4),
+                                                         (8, 1, 8)],
+                             ids=["tp8", "tp8ep4", "tp8ep8"])
+    def test_auto_dtype(self, cuda_graph, tp_size, pp_size, ep_size):
+        kv_cache_config = KvCacheConfig(enable_block_reuse=False)
+        with LLM(self.MODEL_PATH,
+                 tensor_parallel_size=tp_size,
+                 pipeline_parallel_size=pp_size,
+                 moe_expert_parallel_size=ep_size,
+                 kv_cache_config=kv_cache_config,
+                 cuda_graph_config=CudaGraphConfig()
+                 if cuda_graph else None) as llm:
+            task = MMLU(self.MODEL_NAME)
+            task.evaluate(llm)
+            task = GSM8K(self.MODEL_NAME)
+            task.evaluate(llm)
+
+    @skip_pre_hopper
+    @parametrize_with_ids("cuda_graph", [False, True])
+    @pytest.mark.parametrize("tp_size,pp_size,ep_size", [(8, 1, 1), (8, 1, 4),
+                                                         (8, 1, 8)],
+                             ids=["tp8", "tp8ep4", "tp8ep8"])
+    def test_reasoning_fp8_prequantized(self, cuda_graph, tp_size, pp_size,
+                                        ep_size):
+        kv_cache_config = KvCacheConfig(enable_block_reuse=False)
+        with LLM(f"{llm_models_root()}/Nemotron-H-47B-Reasoning-128K-FP8",
+                 kv_cache_config=kv_cache_config,
+                 tensor_parallel_size=tp_size,
+                 pipeline_parallel_size=pp_size,
+                 moe_expert_parallel_size=ep_size,
+                 cuda_graph_config=CudaGraphConfig()
+                 if cuda_graph else None) as llm:
+            assert llm.args.quant_config.quant_algo == QuantAlgo.FP8
+            task = MMLU(self.MODEL_NAME)
+            task.evaluate(llm)
+            task = GSM8K(self.MODEL_NAME)
+            task.evaluate(llm)
+
+
+@pytest.mark.skip_less_device(8)
+@pytest.mark.skip_less_device_memory(80000)
+class TestNemotronH_56B_Base(LlmapiAccuracyTestHarness):
+    MODEL_NAME = "nvidia/Nemotron-H-56B-Base-8K"
+    MODEL_PATH = f"{llm_models_root()}/Nemotron-H-56B-Base-8K"
+
+    @parametrize_with_ids("cuda_graph", [False, True])
+    @pytest.mark.parametrize("tp_size,pp_size,ep_size", [(8, 1, 1), (8, 1, 4),
+                                                         (8, 1, 8)],
+                             ids=["tp8", "tp8ep4", "tp8ep8"])
+    def test_auto_dtype(self, cuda_graph, tp_size, pp_size, ep_size):
+        kv_cache_config = KvCacheConfig(enable_block_reuse=False)
+        with LLM(self.MODEL_PATH,
+                 tensor_parallel_size=tp_size,
+                 pipeline_parallel_size=pp_size,
+                 moe_expert_parallel_size=ep_size,
+                 kv_cache_config=kv_cache_config,
+                 cuda_graph_config=CudaGraphConfig()
+                 if cuda_graph else None) as llm:
+            task = MMLU(self.MODEL_NAME)
+            task.evaluate(llm)
+            task = GSM8K(self.MODEL_NAME)
+            task.evaluate(llm)
+
+
 class TestQwen2_7BInstruct(LlmapiAccuracyTestHarness):
     MODEL_NAME = "Qwen/Qwen2-7B-Instruct"
     MODEL_PATH = f"{llm_models_root()}/Qwen2-7B-Instruct"
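
For reference, the pattern the new tests exercise can be sketched as a standalone script. This is a minimal sketch, assuming the tensorrt_llm LLM API used in the diff (KvCacheConfig, CudaGraphConfig, tensor/pipeline/expert parallel sizes); the hard-coded model path and the single generate() smoke check are illustrative stand-ins for the harness's MMLU/GSM8K tasks and are not part of the actual tests.

# Minimal sketch of the "tp8ep4" configuration with CUDA graphs enabled.
# Assumptions: a local checkout of nvidia/Nemotron-H-47B-Base-8K at the path
# below (hypothetical), and 8 GPUs with >= 80 GB each, mirroring the
# skip_less_device / skip_less_device_memory markers above.
from tensorrt_llm import LLM, SamplingParams
from tensorrt_llm.llmapi import CudaGraphConfig, KvCacheConfig

kv_cache_config = KvCacheConfig(enable_block_reuse=False)  # block reuse off, as in the tests
with LLM("/models/Nemotron-H-47B-Base-8K",  # hypothetical local path
         tensor_parallel_size=8,
         pipeline_parallel_size=1,
         moe_expert_parallel_size=4,
         kv_cache_config=kv_cache_config,
         cuda_graph_config=CudaGraphConfig()) as llm:
    # In the tests, the accuracy harness's MMLU and GSM8K tasks drive the
    # model; a single generate() call serves here only as a smoke check.
    outputs = llm.generate(["The capital of France is"],
                           SamplingParams(max_tokens=16))
    print(outputs[0].outputs[0].text)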