@@ -996,16 +996,18 @@ def test_no_kv_cache_reuse(self, quant_dtype, mtp_nextn, fp8kv,
                            (False, False, False, True),
                            (False, True, True, True), (True, True, True, True)])
     @parametrize_with_ids("mtp_nextn", [0, 2])
+    @parametrize_with_ids("kv_cache_reuse", [True, False])
     @parametrize_with_ids(
         "quant_dtype",
         [
-            pytest.param("none", marks=skip_pre_hopper),
+            pytest.param("none", marks=skip_pre_blackwell),
             # pytest.param("fp8", marks=skip_pre_hopper),
             # pytest.param("nvfp4", marks=skip_pre_blackwell)
         ])
     # currently, chunked prefill is not supported for fp8 and nvfp4
-    def test_chunked_prefill(self, quant_dtype, mtp_nextn, fp8kv, attention_dp,
-                             cuda_graph, overlap_scheduler):
+    def test_chunked_prefill(self, quant_dtype, mtp_nextn, kv_cache_reuse,
+                             fp8kv, attention_dp, cuda_graph,
+                             overlap_scheduler):
         if quant_dtype == "nvfp4" and mtp_nextn > 0:
             pytest.skip("MTP is not supported for NVFP4")
         if fp8kv:
@@ -1018,7 +1020,7 @@ def test_chunked_prefill(self, quant_dtype, mtp_nextn, fp8kv, attention_dp,
         model_path = f"{llm_models_root()}/DeepSeek-V3-Lite/nvfp4_moe_only"
 
         kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.6,
-                                        enable_block_reuse=True)
+                                        enable_block_reuse=kv_cache_reuse)
         pytorch_config = dict(
             disable_overlap_scheduler=not overlap_scheduler,
             use_cuda_graph=cuda_graph,
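
The change follows a common pytest pattern: a boolean parameter is threaded straight into KvCacheConfig.enable_block_reuse, so the same chunked-prefill test body now covers both the cache-reuse and no-reuse paths. Below is a minimal sketch of that wiring, assuming KvCacheConfig is importable from tensorrt_llm.llmapi (the import sits outside these hunks) and substituting plain pytest.mark.parametrize for the suite's parametrize_with_ids helper; the test name is hypothetical.

import pytest

# Assumption: KvCacheConfig lives in tensorrt_llm.llmapi; the diff above
# does not show the actual import.
from tensorrt_llm.llmapi import KvCacheConfig


# Plain pytest.mark.parametrize stands in for parametrize_with_ids.
@pytest.mark.parametrize("kv_cache_reuse", [True, False])
def test_kv_cache_reuse_wiring(kv_cache_reuse):
    # The boolean parameter is forwarded directly to enable_block_reuse,
    # so one test body exercises both KV-cache reuse code paths.
    kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.6,
                                    enable_block_reuse=kv_cache_reuse)
    assert kv_cache_config.enable_block_reuse is kv_cache_reuse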