@@ -499,7 +499,7 @@ def test_eagle3_tp8(self, eagle3_one_model):
499499 @pytest .mark .skip_less_device (4 )
500500 @skip_pre_hopper
501501 def test_fp8_tp4 (self ):
502- model_path = f"{ llm_models_root ()} /modelopt-hf-model-hub /Llama-3.3-70B-Instruct-fp8 "
502+ model_path = f"{ llm_models_root ()} /llama-3.3-models /Llama-3.3-70B-Instruct-FP8 "
503503 kv_cache_config = KvCacheConfig (free_gpu_memory_fraction = 0.5 )
504504 with LLM (model_path ,
505505 tensor_parallel_size = 4 ,
@@ -508,6 +508,7 @@ def test_fp8_tp4(self):
508508 kv_cache_config = kv_cache_config ) as llm :
509509 assert llm .args .quant_config .quant_algo == QuantAlgo .FP8
510510 sampling_params = SamplingParams (
511+ max_tokens = 256 ,
511512 temperature = 0.0 ,
512513 add_special_tokens = False ,
513514 )
@@ -517,16 +518,20 @@ def test_fp8_tp4(self):
517518 task .evaluate (llm , sampling_params = sampling_params )
518519 task = GPQADiamond (self .MODEL_NAME )
519520 task .evaluate (llm ,
520- sampling_params = sampling_params ,
521521 extra_evaluator_kwargs = dict (apply_chat_template = True ))
522522
523523 @pytest .mark .skip_less_device (4 )
524524 @skip_pre_blackwell
525525 def test_nvfp4_tp4 (self ):
526- model_path = f"{ llm_models_root ()} /modelopt-hf-model-hub/Llama-3.3-70B-Instruct-fp4"
527- with LLM (model_path , tensor_parallel_size = 4 ) as llm :
526+ model_path = f"{ llm_models_root ()} /llama-3.3-models/Llama-3.3-70B-Instruct-FP4"
527+ kv_cache_config = KvCacheConfig (free_gpu_memory_fraction = 0.5 )
528+ with LLM (model_path ,
529+ tensor_parallel_size = 4 ,
530+ max_batch_size = 32 ,
531+ kv_cache_config = kv_cache_config ) as llm :
528532 assert llm .args .quant_config .quant_algo == QuantAlgo .NVFP4
529533 sampling_params = SamplingParams (
534+ max_tokens = 256 ,
530535 temperature = 0.0 ,
531536 add_special_tokens = False ,
532537 )
@@ -536,7 +541,6 @@ def test_nvfp4_tp4(self):
536541 task .evaluate (llm , sampling_params = sampling_params )
537542 task = GPQADiamond (self .MODEL_NAME )
538543 task .evaluate (llm ,
539- sampling_params = sampling_params ,
540544 extra_evaluator_kwargs = dict (apply_chat_template = True ))
541545
542546
0 commit comments