from tensorrt_llm import LLM, DisaggregatedParams, SamplingParams
from tensorrt_llm._utils import set_mpi_comm
from tensorrt_llm.llmapi import CudaGraphConfig, KvCacheConfig, MpiCommSession
+from tensorrt_llm.llmapi.llm_args import EagleDecodingConfig

cloudpickle.register_pickle_by_value(sys.modules[__name__])
MPI.pickle.__init__(
@@ -33,6 +34,11 @@ def model_path(model_name):
    elif 'TinyLlama-1.1B-Chat-v1.0' in model_name:
        return os.path.join(llm_models_root, 'llama-models-v2',
                            'TinyLlama-1.1B-Chat-v1.0')
+    elif 'Llama-3.1-8B-Instruct' in model_name:
+        return os.path.join(llm_models_root, 'llama-3.1-model',
+                            'Llama-3.1-8B-Instruct/')
+    elif 'EAGLE3-LLaMA3.1-Instruct-8B' in model_name:
+        return os.path.join(llm_models_root, 'EAGLE3-LLaMA3.1-Instruct-8B')
    else:
        raise ValueError(f"Unknown model: {model_name}")
@@ -317,5 +323,106 @@ def test_disaggregated_llama_context_capacity(model, enable_cuda_graph,
    print("All workers terminated.")


+@pytest.mark.parametrize("model", ["Llama-3.1-8B-Instruct"])
+@pytest.mark.parametrize("spec_dec_model_path", ["EAGLE3-LLaMA3.1-Instruct-8B"])
+@pytest.mark.parametrize("generation_overlap", [False])
+def test_disaggregated_spec_dec_batch_slot_limit(model, spec_dec_model_path,
+                                                 generation_overlap):
+    # Test whether the batch slots are properly released when using speculative
+    # decoding with disaggregated serving.
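+    # EAGLE3 in two-model mode (eagle3_one_model=False): a separate draft model
+    # proposes up to max_draft_len tokens per step for the target to verify.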
+    spec_dec_config = EagleDecodingConfig(
+        speculative_model_dir=model_path(spec_dec_model_path),
+        eagle3_one_model=False,
+        max_draft_len=3)
+
+    worker_pytorch_configs = []
+
+    # Context worker
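+    # Runs with the overlap scheduler disabled; max_batch_size=1 leaves a
+    # single batch slot for requests to contend for.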
+    worker_pytorch_configs.append(
+        dict(disable_overlap_scheduler=True,
+             kv_cache_dtype="auto",
+             speculative_config=spec_dec_config,
+             max_batch_size=1))
+
+    # Generation worker
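+    # Overlap scheduling here follows the generation_overlap parameter (only
+    # False is exercised above).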
+    worker_pytorch_configs.append(
+        dict(disable_overlap_scheduler=not generation_overlap,
+             kv_cache_dtype="auto",
+             speculative_config=spec_dec_config,
+             max_batch_size=1))
+
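+    # A small KV cache with block reuse disabled keeps capacity tight, so a
+    # leaked batch slot or KV block would surface quickly.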
+    kv_cache_configs = [
+        KvCacheConfig(max_tokens=128, enable_block_reuse=False)
+        for _ in range(2)
+    ]
+    model_names = [model_path(model) for _ in range(2)]
+    ranks = [0, 1]
+    worker_args = list(
+        zip(kv_cache_configs, worker_pytorch_configs, model_names, ranks))
+
+    port_name = MPI.Open_port()
+    MPI.Publish_name('my_port', port_name)
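+    # Publish the port so the workers can connect back; the intercommunicator
+    # accepted below carries requests and results.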
+
+    prompt = "What is the capital of Germany?"
+
+    with MPIPoolExecutor(max_workers=2,
+                         env={"TRTLLM_USE_MPI_KVCACHE": "1"}) as executor:
+        futures = []
+        try:
+            for worker_arg in worker_args:
+                future = executor.submit(worker_entry_point, *worker_arg)
+                futures.append(future)
+        except Exception as e:
+            print(f"Error in worker {worker_arg}: {e}")
+            raise e
+
+        try:
+            print("Launched all the workers.")
+            intercomm = MPI.COMM_SELF.Accept(port_name)
+
+            for _ in range(2):
+                intercomm.recv(tag=MPI_READY)
+                print("Received ready signal.")
+            max_tokens = 25
+
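+            # Ten context-only requests funnel through the single batch slot
+            # on worker 0; each must release its slot on completion or the
+            # later requests would stall.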
+            requests = []
+            for _ in range(10):
+                requests.append(
+                    (prompt, SamplingParams(max_tokens=1, ignore_eos=True),
+                     DisaggregatedParams(request_type="context_only")))
+
+            intercomm.send(requests, dest=0, tag=MPI_REQUEST)
+
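+            # Each context-only result should carry the disaggregated handoff
+            # state plus exactly one token (the first token).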
+            for _ in range(len(requests)):
+                output = intercomm.recv(source=0, tag=MPI_RESULT)
+                assert output[0].disaggregated_params is not None
+                assert output[0].disaggregated_params.request_type == \
+                    "context_only"
+                assert len(output[0].token_ids) == 1
+
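+            # Reuse the params from the last context response to continue the
+            # same request as generation-only on worker 1.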
+            generation_request_disagg_params = output[0].disaggregated_params
+            generation_request_disagg_params.request_type = "generation_only"
+            requests = []
+            requests.append((prompt,
+                             SamplingParams(max_tokens=max_tokens,
+                                            ignore_eos=True),
+                             generation_request_disagg_params))
+
+            intercomm.send(requests, dest=1, tag=MPI_REQUEST)
+            output = intercomm.recv(source=1, tag=MPI_RESULT)
+
+        finally:
+            # Send termination requests
+            intercomm.send(None, dest=0, tag=MPI_REQUEST)
+            intercomm.send(None, dest=1, tag=MPI_REQUEST)
+            print("Sent termination requests to the workers.")
+
+            # Wait for all futures to complete
+            for future in futures:
+                future.result()
+            print("All workers terminated.")
+
+
if __name__ == "__main__":
    pytest.main()