Commit 86ce03f

fix: Update the KVBM <> TRT-LLM integration interface to match the latest TRT-LLM connector API (#2979)
Signed-off-by: richardhuo-nv <[email protected]>
Parent: cd81437

5 files changed (+40, -24)

components/backends/trtllm/README.md

Lines changed: 6 additions & 0 deletions
```diff
@@ -303,3 +303,9 @@ sampling_params.logits_processor = create_trtllm_adapters(processors)
 ## Performance Sweep
 
 For detailed instructions on running comprehensive performance sweeps across both aggregated and disaggregated serving configurations, see the [TensorRT-LLM Benchmark Scripts for DeepSeek R1 model](./performance_sweeps/README.md). This guide covers recommended benchmarking setups, usage of provided scripts, and best practices for evaluating system performance.
+
+## Dynamo KV Block Manager Integration
+
+Dynamo with TensorRT-LLM currently supports integration with the Dynamo KV Block Manager. This integration can significantly reduce time-to-first-token (TTFT) latency, particularly in usage patterns such as multi-turn conversations and repeated long-context requests.
+
+For setup instructions, see [Running KVBM in TensorRT-LLM](./../../../docs/guides/run_kvbm_in_trtllm.md).
```
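The TTFT win described in the new README section comes from block-level reuse of previously computed KV cache. The toy sketch below is illustrative only, not the KVBM API: `reusable_blocks`, the block size, and the token counts are all made up for the example, but the arithmetic mirrors why resending a long prefix (as in multi-turn chat) skips most of the prefill.

```python
# Illustrative only (not the KVBM API): block-aligned prefix reuse.
# Token sequences are split into fixed-size blocks; a new request only
# needs prefill for the leading blocks that are not already cached.

def reusable_blocks(cached: list[int], request: list[int], block_size: int) -> int:
    """Count leading blocks of `request` that fully match `cached`."""
    full = 0
    for start in range(0, len(request) - block_size + 1, block_size):
        block = request[start : start + block_size]
        if cached[start : start + block_size] == block:
            full += 1
        else:
            break
    return full

# A multi-turn conversation resends the entire prior context:
turn_1 = list(range(96))            # 96 tokens already processed and cached
turn_2 = turn_1 + list(range(40))   # same prefix plus a new user turn
hits = reusable_blocks(turn_1, turn_2, block_size=32)
print(f"{hits} blocks reused; prefill only {len(turn_2) - hits * 32} tokens")
```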

container/Dockerfile.trtllm

Lines changed: 11 additions & 4 deletions
```diff
@@ -4,6 +4,7 @@
 ARG BASE_IMAGE="nvcr.io/nvidia/pytorch"
 ARG BASE_IMAGE_TAG="25.06-py3"
 ARG RELEASE_BUILD
+ARG ENABLE_KVBM=false
 ARG RUNTIME_IMAGE="nvcr.io/nvidia/cuda"
 ARG RUNTIME_IMAGE_TAG="12.9.1-runtime-ubuntu24.04"
 
@@ -234,6 +235,7 @@ ARG ARCH_ALT
 FROM quay.io/pypa/manylinux_2_28_${ARCH_ALT} AS wheel_builder
 ARG RELEASE_BUILD
 ARG CARGO_BUILD_JOBS
+ARG ENABLE_KVBM
 # Set CARGO_BUILD_JOBS to 16 if not provided
 # This is to prevent cargo from building $(nproc) jobs in parallel,
 # which might exceed the number of opened files limit.
@@ -279,16 +281,21 @@ COPY launch /workspace/launch
 RUN cargo build \
     --release \
     --locked \
-    --features dynamo-llm/block-manager \
+    --features block-manager \
     --workspace
 
 # Build dynamo wheels
 RUN uv build --wheel --out-dir /workspace/dist && \
     cd /workspace/lib/bindings/python && \
-    uv build --wheel --out-dir /workspace/dist --python 3.12 && \
+    uv pip install maturin[patchelf] && \
+    if [ "$ENABLE_KVBM" = "true" ]; then \
+        maturin build --release --features block-manager --out /workspace/dist; \
+    else \
+        maturin build --release --out /workspace/dist; \
+    fi && \
     if [ "$RELEASE_BUILD" = "true" ]; then \
-    uv build --wheel --out-dir /workspace/dist --python 3.11 && \
-    uv build --wheel --out-dir /workspace/dist --python 3.10; \
+        uv run --python 3.11 maturin build --release --out /workspace/dist && \
+        uv run --python 3.10 maturin build --release --out /workspace/dist; \
+    fi
 
 ########################################
```
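The new `ENABLE_KVBM` build argument, together with `RELEASE_BUILD`, determines which `maturin` invocations run in the wheel-builder stage. Below is a hedged restatement of that branch logic in plain Python: the command strings are copied from the Dockerfile above, while `wheel_build_commands` itself is illustrative and not part of the repo.

```python
# Illustrative helper: enumerate the maturin commands the Dockerfile's
# wheel-build step would execute for a given configuration.

def wheel_build_commands(enable_kvbm: bool, release_build: bool) -> list[str]:
    cmds = ["uv pip install maturin[patchelf]"]
    if enable_kvbm:
        # KVBM requires the block-manager feature compiled into the bindings.
        cmds.append("maturin build --release --features block-manager --out /workspace/dist")
    else:
        cmds.append("maturin build --release --out /workspace/dist")
    if release_build:
        # Release builds also produce wheels for older Python versions.
        cmds.append("uv run --python 3.11 maturin build --release --out /workspace/dist")
        cmds.append("uv run --python 3.10 maturin build --release --out /workspace/dist")
    return cmds

for cmd in wheel_build_commands(enable_kvbm=True, release_build=False):
    print(cmd)
```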

docs/guides/run_kvbm_in_trtllm.md

Lines changed: 6 additions & 2 deletions
```diff
@@ -38,8 +38,12 @@ To use KVBM in TensorRT-LLM, you can follow the steps below:
 # start up etcd for KVBM leader/worker registration and discovery
 docker compose -f deploy/docker-compose.yml up -d
 
-# Build a container that includes TensorRT-LLM and KVBM. Note: KVBM integration is only available in TensorRT-LLM commit ce580ce4f52af3ad0043a800b3f9469e1f1109f6 or newer.
-./container/build.sh --framework trtllm --tensorrtllm-commit ce580ce4f52af3ad0043a800b3f9469e1f1109f6 --enable-kvbm
+# Build a container that includes TensorRT-LLM and KVBM. Note: KVBM integration is only available in TensorRT-LLM commit dcd110cfac07e577ce01343c455917832b0f3d5e or newer.
+# When building with the --tensorrtllm-commit option, you may notice that https://github.com keeps prompting for a username and password.
+# This happens because cloning TensorRT-LLM can hit GitHub's rate limit.
+# To work around this, you can keep pressing "Enter" or "Return" at the prompt.
+# Setting "export GIT_LFS_SKIP_SMUDGE=1" may also reduce the number of prompts.
+./container/build.sh --framework trtllm --tensorrtllm-commit dcd110cfac07e577ce01343c455917832b0f3d5e --enable-kvbm
 
 # launch the container
 ./container/run.sh --framework trtllm -it --mount-workspace --use-nixl-gds
```
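For scripted builds, the `GIT_LFS_SKIP_SMUDGE` workaround from the guide can also be applied from Python. A minimal sketch, assuming it runs from the repository root and uses only the flags shown in the guide above:

```python
# Minimal sketch: run the documented build command with
# GIT_LFS_SKIP_SMUDGE=1 exported into the child environment to reduce
# the GitHub credential prompts mentioned in the guide.
import os
import subprocess

env = dict(os.environ, GIT_LFS_SKIP_SMUDGE="1")
subprocess.run(
    [
        "./container/build.sh",
        "--framework", "trtllm",
        "--tensorrtllm-commit", "dcd110cfac07e577ce01343c455917832b0f3d5e",
        "--enable-kvbm",
    ],
    env=env,
    check=True,  # raise if the build script exits non-zero
)
```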

lib/bindings/python/src/dynamo/llm/trtllm_integration/connector/kvbm_connector_leader.py

Lines changed: 9 additions & 9 deletions
```diff
@@ -8,8 +8,8 @@
     KvCacheConnectorScheduler,
     SchedulerOutput,
 )
-from tensorrt_llm.bindings.executor import ExecutorConfig
 from tensorrt_llm.bindings.internal.batch_manager import LlmRequest
+from tensorrt_llm.llmapi.llm_args import TorchLlmArgs
 
 from dynamo.llm import KvbmLeader
 from dynamo.llm.trtllm_integration.rust import KvbmRequest
@@ -21,21 +21,21 @@
 
 
 class DynamoKVBMConnectorLeader(KvCacheConnectorScheduler):
-    def __init__(self, executor_config: ExecutorConfig):
-        super().__init__(executor_config)
+    def __init__(self, llm_args: TorchLlmArgs):
+        super().__init__(llm_args)
         self.drt = DistributedRuntime.detached()
 
-        world_size = self._config.mapping.world_size
-        self.block_size = self._config.tokens_per_block
+        mappings = self._llm_args.parallel_config.to_mapping()
+
+        world_size = mappings.world_size
+        self.block_size = self._llm_args.kv_cache_config.tokens_per_block
 
         # Set bytes_per_block to 0, because we will retrieve the actual value from the worker side.
         leader = KvbmLeader(world_size, drt=self.drt)
 
-        print(
-            f"KvConnectorLeader initialized with rank: {executor_config.mapping.rank}"
-        )
+        print(f"KvConnectorLeader initialized with rank: {mappings.rank}")
         self._connector = RustKvConnectorLeader(
-            executor_config.mapping.rank, self.drt, self.block_size, leader
+            mappings.rank, self.drt, self.block_size, leader
         )
 
     def build_connector_meta(self, scheduler_output: SchedulerOutput) -> bytes:
```
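The leader now takes `TorchLlmArgs` instead of the removed `ExecutorConfig`, reading the mapping via `parallel_config.to_mapping()` and the block size via `kv_cache_config.tokens_per_block`. A hedged sketch of a test double that fakes only those attributes (`make_stub_llm_args` is hypothetical and not part of the repo; it is just a convenient way to exercise the new wiring without a full TRT-LLM install):

```python
# Hypothetical stub mirroring the only TorchLlmArgs attributes the
# updated connector constructors read in this commit.
from types import SimpleNamespace

def make_stub_llm_args(world_size: int, rank: int, tokens_per_block: int):
    # parallel_config.to_mapping() must yield an object exposing
    # world_size and rank, as DynamoKVBMConnectorLeader expects.
    mapping = SimpleNamespace(world_size=world_size, rank=rank)
    return SimpleNamespace(
        parallel_config=SimpleNamespace(to_mapping=lambda: mapping),
        kv_cache_config=SimpleNamespace(tokens_per_block=tokens_per_block),
    )

args = make_stub_llm_args(world_size=2, rank=0, tokens_per_block=32)
mapping = args.parallel_config.to_mapping()
assert (mapping.world_size, mapping.rank) == (2, 0)
assert args.kv_cache_config.tokens_per_block == 32
```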

lib/bindings/python/src/dynamo/llm/trtllm_integration/connector/kvbm_connector_worker.py

Lines changed: 8 additions & 9 deletions
```diff
@@ -4,7 +4,7 @@
 import torch
 from tensorrt_llm import logger
 from tensorrt_llm._torch.pyexecutor.kv_cache_connector import KvCacheConnectorWorker
-from tensorrt_llm.bindings.executor import ExecutorConfig
+from tensorrt_llm.llmapi.llm_args import TorchLlmArgs
 
 from dynamo.llm.trtllm_integration.rust import (
     KvConnectorWorker as RustKvConnectorWorker,
@@ -13,16 +13,15 @@
 
 
 class DynamoKVBMConnectorWorker(KvCacheConnectorWorker):
-    def __init__(self, executor_config: ExecutorConfig):
-        super().__init__(executor_config)
+    def __init__(self, llm_args: TorchLlmArgs):
+        super().__init__(llm_args)
 
         self.drt = DistributedRuntime.detached()
 
-        self.rank = executor_config.mapping.rank
+        mappings = self._llm_args.parallel_config.to_mapping()
+        self.rank = mappings.rank
 
-        self._connector = RustKvConnectorWorker(
-            self.drt, str(executor_config.mapping.rank)
-        )
+        self._connector = RustKvConnectorWorker(self.drt, str(self.rank))
 
     def register_kv_caches(self, kv_cache_tensor: torch.Tensor):
         """
@@ -33,11 +32,11 @@ def register_kv_caches(self, kv_cache_tensor: torch.Tensor):
         """
         print(f"Register KV Caches on rank {self.rank}")
         logger.info(
-            f"KvConnectorWorker started registering the kv caches on rank {self._config.mapping.rank}"
+            f"KvConnectorWorker started registering the kv caches on rank {self.rank}"
        )
 
         num_device_blocks = kv_cache_tensor.shape[0]
-        page_size = self._config.tokens_per_block
+        page_size = self._llm_args.kv_cache_config.tokens_per_block
         device_id = kv_cache_tensor.device.index
         kv_cache_dtype = kv_cache_tensor.dtype
 
```
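On the worker side, everything handed to the Rust worker in `register_kv_caches` is derived from the KV cache tensor plus `tokens_per_block`. An illustrative sketch of that derivation (the tensor dimensions beyond dim 0 are made up, and a CPU tensor is used so the snippet runs anywhere; on a real worker the tensor is on a GPU and `device.index` is the CUDA device id rather than `None`):

```python
# Illustrative only: how register_kv_caches derives its parameters.
import torch

# dim 0 is the number of device blocks; the remaining dims are made up here
kv_cache_tensor = torch.zeros(16, 2, 32, 8, 64, dtype=torch.float16)

num_device_blocks = kv_cache_tensor.shape[0]   # 16 blocks on this device
device_id = kv_cache_tensor.device.index       # None on CPU, int on CUDA
kv_cache_dtype = kv_cache_tensor.dtype         # torch.float16
page_size = 32                                 # kv_cache_config.tokens_per_block

print(num_device_blocks, device_id, kv_cache_dtype, page_size)
```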