Commit c0c6398

test: add test case when only chunked prefill is enabled

Signed-off-by: Mingyang Jiang <[email protected]>

1 parent: 347a669

3 files changed, +11 -6 lines

tensorrt_llm/_torch/pyexecutor/model_engine.py (3 additions, 2 deletions)

@@ -787,8 +787,9 @@ def disable_optimization(backend: Backend):
 
     def _set_up_attn_metadata(self, kv_cache_manager: KVCacheManager):
         enable_paged_context_mla = is_mla(
-            self.model.model_config.pretrained_config
-        ) and self.attn_runtime_features.cache_reuse
+            self.model.model_config.pretrained_config) and (
+                self.attn_runtime_features.cache_reuse
+                or self.attn_runtime_features.chunked_prefill)
         if kv_cache_manager is None:
             return self.attn_backend.Metadata(
                 max_num_requests=self.batch_size,
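In effect, paged context MLA is now enabled for MLA models whenever either KV cache reuse or chunked prefill is requested, not only when cache reuse is on. Below is a minimal, illustrative sketch of that gating logic; the flag names follow the diff, but the standalone helper and dataclass are assumptions for illustration, not the repository's API.

# Illustrative sketch only; assumes a simple features object carrying
# the two boolean flags visible in the diff above.
from dataclasses import dataclass

@dataclass
class AttnRuntimeFeatures:
    cache_reuse: bool = False
    chunked_prefill: bool = False

def needs_paged_context_mla(is_mla_model: bool,
                            features: AttnRuntimeFeatures) -> bool:
    # Before this commit: is_mla_model and features.cache_reuse
    # After this commit: chunked prefill alone is also sufficient.
    return is_mla_model and (features.cache_reuse
                             or features.chunked_prefill)

# Example: chunked prefill enabled, cache reuse disabled -> True
assert needs_paged_context_mla(True, AttnRuntimeFeatures(chunked_prefill=True))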

tests/integration/defs/accuracy/test_llm_api_pytorch.py (6 additions, 4 deletions)

@@ -996,16 +996,18 @@ def test_no_kv_cache_reuse(self, quant_dtype, mtp_nextn, fp8kv,
                               (False, False, False, True),
                               (False, True, True, True), (True, True, True, True)])
     @parametrize_with_ids("mtp_nextn", [0, 2])
+    @parametrize_with_ids("kv_cache_reuse", [True, False])
     @parametrize_with_ids(
         "quant_dtype",
         [
-            pytest.param("none", marks=skip_pre_hopper),
+            pytest.param("none", marks=skip_pre_blackwell),
             # pytest.param("fp8", marks=skip_pre_hopper),
             # pytest.param("nvfp4", marks=skip_pre_blackwell)
         ])
     # currently, chunked prefill is not supported for fp8 and nvfp4
-    def test_chunked_prefill(self, quant_dtype, mtp_nextn, fp8kv, attention_dp,
-                             cuda_graph, overlap_scheduler):
+    def test_chunked_prefill(self, quant_dtype, mtp_nextn, kv_cache_reuse,
+                             fp8kv, attention_dp, cuda_graph,
+                             overlap_scheduler):
         if quant_dtype == "nvfp4" and mtp_nextn > 0:
             pytest.skip("MTP is not supported for NVFP4")
         if fp8kv:

@@ -1018,7 +1020,7 @@ def test_chunked_prefill(self, quant_dtype, mtp_nextn, fp8kv, attention_dp,
             model_path = f"{llm_models_root()}/DeepSeek-V3-Lite/nvfp4_moe_only"

         kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.6,
-                                        enable_block_reuse=True)
+                                        enable_block_reuse=kv_cache_reuse)
         pytorch_config = dict(
             disable_overlap_scheduler=not overlap_scheduler,
             use_cuda_graph=cuda_graph,
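The new kv_cache_reuse parameter now drives enable_block_reuse directly, so the chunked-prefill test covers both the reuse-on and reuse-off paths instead of hard-coding reuse to True. A minimal sketch of how the parameter feeds the KV cache config, using only the constructor arguments visible in the diff (the import path is the LLM API's; the helper function is illustrative):

# Minimal sketch, assuming only the KvCacheConfig arguments shown in the diff.
from tensorrt_llm.llmapi import KvCacheConfig

def make_kv_cache_config(kv_cache_reuse: bool) -> KvCacheConfig:
    # Block reuse is toggled by the test parameter instead of being
    # hard-coded to True, so chunked prefill is also exercised alone.
    return KvCacheConfig(free_gpu_memory_fraction=0.6,
                         enable_block_reuse=kv_cache_reuse)

cfg_reuse = make_kv_cache_config(True)      # kv_cache_reuse=True variant
cfg_no_reuse = make_kv_cache_config(False)  # kv_cache_reuse=False variant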

tests/integration/test_lists/test-db/l0_b200.yml (2 additions, 0 deletions)

@@ -40,6 +40,8 @@ l0_b200:
   - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[fp8kv=True-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False]
   - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_no_kv_cache_reuse[quant_dtype=none-mtp_nextn=2-fp8kv=False-attention_dp=True-cuda_graph=True-overlap_scheduler=True]
   - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_no_kv_cache_reuse[quant_dtype=nvfp4-mtp_nextn=0-fp8kv=True-attention_dp=True-cuda_graph=True-overlap_scheduler=True]
+  - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_chunked_prefill[quant_dtype=none-mtp_nextn=2-fp8kv=False-attention_dp=True-cuda_graph=True-overlap_scheduler=True-kv_cache_reuse=True]
+  - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_chunked_prefill[quant_dtype=none-mtp_nextn=2-fp8kv=False-attention_dp=True-cuda_graph=True-overlap_scheduler=True-kv_cache_reuse=False]
   - accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_nvfp4[latency_moe_cutlass]
   - accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_nvfp4[latency_moe_trtllm]
   - test_e2e.py::test_ptp_quickstart_advanced[Llama3.1-8B-NVFP4-nvfp4-quantized/Meta-Llama-3.1-8B]
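The two new list entries are the kv_cache_reuse=True and kv_cache_reuse=False variants produced by the added parametrization; the bracketed suffix is the pytest node id that the test database references. A small, generic pytest illustration of how such ids are formed (plain pytest.mark.parametrize here; the repository's parametrize_with_ids helper is assumed to yield equivalent name=value ids):

# Generic illustration: pytest appends each parameter set's id to the
# test name, which is the string the YAML test list refers to.
import pytest

@pytest.mark.parametrize("kv_cache_reuse", [True, False],
                         ids=lambda v: f"kv_cache_reuse={v}")
def test_chunked_prefill(kv_cache_reuse):
    assert kv_cache_reuse in (True, False)

# Collected node ids (e.g. via `pytest --collect-only -q`):
#   test_chunked_prefill[kv_cache_reuse=True]
#   test_chunked_prefill[kv_cache_reuse=False]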
