Commit c0c6398

test: add test case when only chunked prefill is enabled

Signed-off-by: Mingyang Jiang <[email protected]>

1 parent: 347a669

3 files changed, +11 -6 lines

tensorrt_llm/_torch/pyexecutor/model_engine.py (3 additions, 2 deletions)

@@ -787,8 +787,9 @@ def disable_optimization(backend: Backend):
 
     def _set_up_attn_metadata(self, kv_cache_manager: KVCacheManager):
         enable_paged_context_mla = is_mla(
-            self.model.model_config.pretrained_config
-        ) and self.attn_runtime_features.cache_reuse
+            self.model.model_config.pretrained_config) and (
+                self.attn_runtime_features.cache_reuse
+                or self.attn_runtime_features.chunked_prefill)
         if kv_cache_manager is None:
             return self.attn_backend.Metadata(
                 max_num_requests=self.batch_size,
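In effect, paged context MLA is now enabled for MLA models whenever either KV cache reuse or chunked prefill is requested, not only when cache reuse is on. Below is a minimal, illustrative sketch of that gating logic; the flag names follow the diff, but the standalone helper and dataclass are assumptions for illustration, not the repository's API.

# Illustrative sketch only; assumes a simple features object carrying
# the two boolean flags visible in the diff above.
from dataclasses import dataclass

@dataclass
class AttnRuntimeFeatures:
    cache_reuse: bool = False
    chunked_prefill: bool = False

def needs_paged_context_mla(is_mla_model: bool,
                            features: AttnRuntimeFeatures) -> bool:
    # Before this commit: is_mla_model and features.cache_reuse
    # After this commit: chunked prefill alone is also sufficient.
    return is_mla_model and (features.cache_reuse
                             or features.chunked_prefill)

# Example: chunked prefill enabled, cache reuse disabled -> True
assert needs_paged_context_mla(True, AttnRuntimeFeatures(chunked_prefill=True))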

tests/integration/defs/accuracy/test_llm_api_pytorch.py (6 additions, 4 deletions)

@@ -996,16 +996,18 @@ def test_no_kv_cache_reuse(self, quant_dtype, mtp_nextn, fp8kv,
                               (False, False, False, True),
                               (False, True, True, True), (True, True, True, True)])
     @parametrize_with_ids("mtp_nextn", [0, 2])
+    @parametrize_with_ids("kv_cache_reuse", [True, False])
     @parametrize_with_ids(
         "quant_dtype",
         [
-            pytest.param("none", marks=skip_pre_hopper),
+            pytest.param("none", marks=skip_pre_blackwell),
             # pytest.param("fp8", marks=skip_pre_hopper),
             # pytest.param("nvfp4", marks=skip_pre_blackwell)
         ])
     # currently, chunked prefill is not supported for fp8 and nvfp4
-    def test_chunked_prefill(self, quant_dtype, mtp_nextn, fp8kv, attention_dp,
-                             cuda_graph, overlap_scheduler):
+    def test_chunked_prefill(self, quant_dtype, mtp_nextn, kv_cache_reuse,
+                             fp8kv, attention_dp, cuda_graph,
+                             overlap_scheduler):
         if quant_dtype == "nvfp4" and mtp_nextn > 0:
             pytest.skip("MTP is not supported for NVFP4")
         if fp8kv:

@@ -1018,7 +1020,7 @@ def test_chunked_prefill(self, quant_dtype, mtp_nextn, fp8kv, attention_dp,
             model_path = f"{llm_models_root()}/DeepSeek-V3-Lite/nvfp4_moe_only"

         kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.6,
-                                        enable_block_reuse=True)
+                                        enable_block_reuse=kv_cache_reuse)
         pytorch_config = dict(
             disable_overlap_scheduler=not overlap_scheduler,
             use_cuda_graph=cuda_graph,
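The new kv_cache_reuse parameter now drives enable_block_reuse directly, so the chunked-prefill test covers both the reuse-on and reuse-off paths instead of hard-coding reuse to True. A minimal sketch of how the parameter feeds the KV cache config, using only the constructor arguments visible in the diff (the import path is the LLM API's; the helper function is illustrative):

# Minimal sketch, assuming only the KvCacheConfig arguments shown in the diff.
from tensorrt_llm.llmapi import KvCacheConfig

def make_kv_cache_config(kv_cache_reuse: bool) -> KvCacheConfig:
    # Block reuse is toggled by the test parameter instead of being
    # hard-coded to True, so chunked prefill is also exercised alone.
    return KvCacheConfig(free_gpu_memory_fraction=0.6,
                         enable_block_reuse=kv_cache_reuse)

cfg_reuse = make_kv_cache_config(True)      # kv_cache_reuse=True variant
cfg_no_reuse = make_kv_cache_config(False)  # kv_cache_reuse=False variant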

tests/integration/test_lists/test-db/l0_b200.yml (2 additions, 0 deletions)

@@ -40,6 +40,8 @@ l0_b200:
   - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[fp8kv=True-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False]
   - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_no_kv_cache_reuse[quant_dtype=none-mtp_nextn=2-fp8kv=False-attention_dp=True-cuda_graph=True-overlap_scheduler=True]
   - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_no_kv_cache_reuse[quant_dtype=nvfp4-mtp_nextn=0-fp8kv=True-attention_dp=True-cuda_graph=True-overlap_scheduler=True]
+  - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_chunked_prefill[quant_dtype=none-mtp_nextn=2-fp8kv=False-attention_dp=True-cuda_graph=True-overlap_scheduler=True-kv_cache_reuse=True]
+  - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_chunked_prefill[quant_dtype=none-mtp_nextn=2-fp8kv=False-attention_dp=True-cuda_graph=True-overlap_scheduler=True-kv_cache_reuse=False]
   - accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_nvfp4[latency_moe_cutlass]
   - accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_nvfp4[latency_moe_trtllm]
   - test_e2e.py::test_ptp_quickstart_advanced[Llama3.1-8B-NVFP4-nvfp4-quantized/Meta-Llama-3.1-8B]
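The two new list entries are the kv_cache_reuse=True and kv_cache_reuse=False variants produced by the added parametrization; the bracketed suffix is the pytest node id that the test database references. A small, generic pytest illustration of how such ids are formed (plain pytest.mark.parametrize here; the repository's parametrize_with_ids helper is assumed to yield equivalent name=value ids):

# Generic illustration: pytest appends each parameter set's id to the
# test name, which is the string the YAML test list refers to.
import pytest

@pytest.mark.parametrize("kv_cache_reuse", [True, False],
                         ids=lambda v: f"kv_cache_reuse={v}")
def test_chunked_prefill(kv_cache_reuse):
    assert kv_cache_reuse in (True, False)

# Collected node ids (e.g. via `pytest --collect-only -q`):
#   test_chunked_prefill[kv_cache_reuse=True]
#   test_chunked_prefill[kv_cache_reuse=False]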
