diff --git a/docs/source/torch/features/feature_combination_matrix.md b/docs/source/torch/features/feature_combination_matrix.md index f25d4bc487d..eee4ca7e155 100644 --- a/docs/source/torch/features/feature_combination_matrix.md +++ b/docs/source/torch/features/feature_combination_matrix.md @@ -8,8 +8,8 @@ | Disaggregated Serving | Yes | Yes | Yes | --- | | | | | | | | | | | | Chunked Prefill | Yes | Yes | Yes | Untested | --- | | | | | | | | | | | MTP | Yes | Yes | Yes | Yes | Untested | --- | | | | | | | | | -| EAGLE-3(One Model Engine) | Yes | Yes | Yes | No | Yes | No | --- | | | | | | | | -| EAGLE-3(Two Model Engine) | NO | Yes | Yes | No | Yes | No | No | --- | | | | | | | +| EAGLE-3(One Model Engine) | Yes | Yes | Yes | Yes | Yes | No | --- | | | | | | | | +| EAGLE-3(Two Model Engine) | No | Yes | Yes | Yes | Yes | No | No | --- | | | | | | | | Torch Sampler | Yes | Yes | Yes | Yes | Yes | Yes | Yes | Yes | --- | | | | | | | TLLM C++ Sampler | Yes | Yes | Yes | Yes | Yes | No | No | No | No | --- | | | | | | KV Cache Reuse | Yes | Yes | Yes | Untested | Yes | Untested | Yes | No | Yes | Yes | --- | | | | diff --git a/tests/integration/defs/disaggregated/test_disaggregated_single_gpu.py b/tests/integration/defs/disaggregated/test_disaggregated_single_gpu.py index 0ed814963df..93611de040b 100644 --- a/tests/integration/defs/disaggregated/test_disaggregated_single_gpu.py +++ b/tests/integration/defs/disaggregated/test_disaggregated_single_gpu.py @@ -349,13 +349,15 @@ def test_disaggregated_llama_context_capacity(model, enable_cuda_graph, @pytest.mark.parametrize("model", ["Llama-3.1-8B-Instruct"]) @pytest.mark.parametrize("spec_dec_model_path", ["EAGLE3-LLaMA3.1-Instruct-8B"]) @pytest.mark.parametrize("generation_overlap", [False]) +@pytest.mark.parametrize("eagle3_one_model", [True, False]) def test_disaggregated_spec_dec_batch_slot_limit(model, spec_dec_model_path, - generation_overlap): + generation_overlap, + eagle3_one_model): # Test whether the batch 
slots are properly released when using speculative decoding # with disaggregated serving. spec_dec_config = EagleDecodingConfig( speculative_model_dir=model_path(spec_dec_model_path), - eagle3_one_model=False, + eagle3_one_model=eagle3_one_model, max_draft_len=3) worker_pytorch_configs = [] diff --git a/tests/integration/test_lists/test-db/l0_h100.yml b/tests/integration/test_lists/test-db/l0_h100.yml index f3ab4b75238..481c5f709ff 100644 --- a/tests/integration/test_lists/test-db/l0_h100.yml +++ b/tests/integration/test_lists/test-db/l0_h100.yml @@ -85,7 +85,8 @@ l0_h100: - disaggregated/test_workers.py::test_workers_kv_cache_aware_router[TinyLlama-1.1B-Chat-v1.0] - disaggregated/test_workers.py::test_workers_kv_cache_aware_router_eviction[TinyLlama-1.1B-Chat-v1.0] - disaggregated/test_disaggregated_single_gpu.py::test_disaggregated_llama_context_capacity[False-False-DeepSeek-V3-Lite-fp8/fp8] - - disaggregated/test_disaggregated_single_gpu.py::test_disaggregated_spec_dec_batch_slot_limit[False-EAGLE3-LLaMA3.1-Instruct-8B-Llama-3.1-8B-Instruct] + - disaggregated/test_disaggregated_single_gpu.py::test_disaggregated_spec_dec_batch_slot_limit[True-False-EAGLE3-LLaMA3.1-Instruct-8B-Llama-3.1-8B-Instruct] + - disaggregated/test_disaggregated_single_gpu.py::test_disaggregated_spec_dec_batch_slot_limit[False-False-EAGLE3-LLaMA3.1-Instruct-8B-Llama-3.1-8B-Instruct] - test_e2e.py::test_trtllm_bench_iteration_log[PyTorch-streaming-meta-llama/Llama-3.1-8B-llama-3.1-model/Meta-Llama-3.1-8B] - test_e2e.py::test_trtllm_bench_iteration_log[PyTorch-non-streaming-meta-llama/Llama-3.1-8B-llama-3.1-model/Meta-Llama-3.1-8B] - test_e2e.py::test_trtllm_bench_request_rate_and_concurrency[enable_concurrency-]