13 changes: 7 additions & 6 deletions tensorrt_llm/_torch/pyexecutor/model_engine.py
@@ -540,16 +540,17 @@ def get_autotune_warmup_request():
 
             available_blocks = kv_cache_manager.get_num_free_blocks()
 
+            maximum_tunable_num_tokens = min(
+                self.batch_size * num_tokens_per_request, self.max_num_tokens,
+                available_blocks * kv_cache_manager.tokens_per_block)
+
             # Calculate number of full-length requests and remaining tokens
             # Each request has num_tokens_per_request tokens, except possibly the last one
-            full_len_request_num = self.max_num_tokens // num_tokens_per_request
-            remaining_tokens = self.max_num_tokens % num_tokens_per_request
+            full_len_request_num = maximum_tunable_num_tokens // num_tokens_per_request
+            remaining_tokens = maximum_tunable_num_tokens % num_tokens_per_request
 
             request_num = full_len_request_num if remaining_tokens == 0 else full_len_request_num + 1
 
-            if self.max_num_tokens > available_blocks * kv_cache_manager.tokens_per_block:
-                return None, None
-
             requests = kv_cache_manager.add_dummy_requests(
                 request_ids=list(range(full_len_request_num)),
                 token_nums=[num_tokens_per_request] * full_len_request_num,
@@ -573,7 +574,7 @@ def get_autotune_warmup_request():
             result.context_requests = requests
             result.generation_requests = []
 
-            return result, _create_extra_inputs(1, self.max_num_tokens)
+            return result, _create_extra_inputs(1, maximum_tunable_num_tokens)
 
         @contextlib.contextmanager
         def release_batch(result):
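For intuition: the change replaces the hard bail-out (returning None, None when self.max_num_tokens exceeded the free KV-cache capacity) with a clamped warmup budget. The number of tunable tokens is capped by the batch capacity, the engine's max_num_tokens, and the free KV-cache blocks, and the dummy-request count is derived from that budget. The standalone sketch below mirrors that arithmetic with purely illustrative numbers (none of them are real defaults):

# Standalone sketch of the clamping arithmetic above; all values are illustrative.
batch_size = 8
num_tokens_per_request = 512
max_num_tokens = 8192
available_blocks = 100      # stand-in for kv_cache_manager.get_num_free_blocks()
tokens_per_block = 32       # stand-in for kv_cache_manager.tokens_per_block

# Budget is the smallest of: batch capacity, engine limit, free KV-cache capacity.
maximum_tunable_num_tokens = min(
    batch_size * num_tokens_per_request,        # 4096
    max_num_tokens,                             # 8192
    available_blocks * tokens_per_block)        # 3200  -> budget = 3200

# Split the budget into full-length dummy requests plus one shorter remainder request.
full_len_request_num = maximum_tunable_num_tokens // num_tokens_per_request   # 6
remaining_tokens = maximum_tunable_num_tokens % num_tokens_per_request        # 128
request_num = full_len_request_num if remaining_tokens == 0 else full_len_request_num + 1  # 7

print(maximum_tunable_num_tokens, full_len_request_num, remaining_tokens, request_num)

Because the budget can never exceed available_blocks * tokens_per_block, the warmup request always fits in the free KV cache, which is why the early return that previously skipped autotuning is removed.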
@@ -224,18 +224,23 @@ def test_disaggregated_llama_context_capacity(model, enable_cuda_graph,
                                               generation_overlap):
     # Test the case where the context worker capacity is exceeded and
     # needs to wait for the generation worker to complete.
+    # TODO: The autotuner is disabled due to an illegal CUDA instruction error on H100.
+    # H200 does not have this issue, possibly because of its larger GPU memory.
+    # This should be investigated further.
     worker_pytorch_configs = []
 
     # Context worker
     worker_pytorch_configs.append(
         PyTorchConfig(disable_overlap_scheduler=True,
                       kv_cache_dtype="auto",
+                      autotuner_enabled=False,
                       use_cuda_graph=enable_cuda_graph))
 
     # Generation worker
     worker_pytorch_configs.append(
         PyTorchConfig(disable_overlap_scheduler=not generation_overlap,
                       kv_cache_dtype="auto",
+                      autotuner_enabled=False,
                       use_cuda_graph=enable_cuda_graph))
 
     kv_cache_configs = [KvCacheConfig(max_tokens=128) for _ in range(2)]
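For reference, a minimal sketch of the worker configuration as it reads after this change; the import paths are assumptions for illustration and may differ from the test module's actual imports:

# Import paths are assumed for illustration; the test module may import these differently.
from tensorrt_llm._torch.pyexecutor.config import PyTorchConfig
from tensorrt_llm.llmapi import KvCacheConfig

enable_cuda_graph = False     # illustrative stand-ins for the pytest parameters
generation_overlap = False

worker_pytorch_configs = [
    # Context worker: overlap scheduler off, autotuner off (see the TODO above).
    PyTorchConfig(disable_overlap_scheduler=True,
                  kv_cache_dtype="auto",
                  autotuner_enabled=False,
                  use_cuda_graph=enable_cuda_graph),
    # Generation worker: overlap scheduling controlled by the test parameter.
    PyTorchConfig(disable_overlap_scheduler=not generation_overlap,
                  kv_cache_dtype="auto",
                  autotuner_enabled=False,
                  use_cuda_graph=enable_cuda_graph),
]

# A small KV cache (128 tokens) per worker forces the context worker to exceed capacity.
kv_cache_configs = [KvCacheConfig(max_tokens=128) for _ in range(2)]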