diff --git a/tensorrt_llm/_torch/pyexecutor/model_engine.py b/tensorrt_llm/_torch/pyexecutor/model_engine.py
index 352f4ec2496..fd69a8e385a 100644
--- a/tensorrt_llm/_torch/pyexecutor/model_engine.py
+++ b/tensorrt_llm/_torch/pyexecutor/model_engine.py
@@ -540,16 +540,17 @@ def get_autotune_warmup_request():
 
             available_blocks = kv_cache_manager.get_num_free_blocks()
 
+            maximum_tunable_num_tokens = min(
+                self.batch_size * num_tokens_per_request, self.max_num_tokens,
+                available_blocks * kv_cache_manager.tokens_per_block)
+
             # Calculate number of full-length requests and remaining tokens
             # Each request has num_tokens_per_request tokens, except possibly the last one
-            full_len_request_num = self.max_num_tokens // num_tokens_per_request
-            remaining_tokens = self.max_num_tokens % num_tokens_per_request
+            full_len_request_num = maximum_tunable_num_tokens // num_tokens_per_request
+            remaining_tokens = maximum_tunable_num_tokens % num_tokens_per_request
 
             request_num = full_len_request_num if remaining_tokens == 0 else full_len_request_num + 1
 
-            if self.max_num_tokens > available_blocks * kv_cache_manager.tokens_per_block:
-                return None, None
-
             requests = kv_cache_manager.add_dummy_requests(
                 request_ids=list(range(full_len_request_num)),
                 token_nums=[num_tokens_per_request] * full_len_request_num,
@@ -573,7 +574,7 @@ def get_autotune_warmup_request():
             result.context_requests = requests
             result.generation_requests = []
 
-            return result, _create_extra_inputs(1, self.max_num_tokens)
+            return result, _create_extra_inputs(1, maximum_tunable_num_tokens)
 
         @contextlib.contextmanager
         def release_batch(result):
diff --git a/tests/integration/defs/disaggregated/test_disaggregated_single_gpu.py b/tests/integration/defs/disaggregated/test_disaggregated_single_gpu.py
index d8d0e2979ab..9f02b68b0fc 100644
--- a/tests/integration/defs/disaggregated/test_disaggregated_single_gpu.py
+++ b/tests/integration/defs/disaggregated/test_disaggregated_single_gpu.py
@@ -224,18 +224,23 @@ def test_disaggregated_llama_context_capacity(model, enable_cuda_graph,
                                               generation_overlap):
     # Test the case where the context worker capacity is exceeded and
     # needs to wait for the generation worker to complete.
+    # TODO: Autotuner is disabled due to an illegal CUDA instruction error on H100.
+    # H200 does not have this issue, possibly due to its larger GPU memory.
+    # This should be investigated further.
     worker_pytorch_configs = []
 
     # Context worker
     worker_pytorch_configs.append(
         PyTorchConfig(disable_overlap_scheduler=True,
                       kv_cache_dtype="auto",
+                      autotuner_enabled=False,
                       use_cuda_graph=enable_cuda_graph))
 
     # Generation worker
     worker_pytorch_configs.append(
         PyTorchConfig(disable_overlap_scheduler=not generation_overlap,
                       kv_cache_dtype="auto",
+                      autotuner_enabled=False,
                       use_cuda_graph=enable_cuda_graph))
 
     kv_cache_configs = [KvCacheConfig(max_tokens=128) for _ in range(2)]
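
For illustration, a minimal standalone sketch of the token-capping arithmetic the model_engine.py hunk introduces. All values below are hypothetical placeholders, not taken from the patch: the warmup budget is clamped to the smallest of the batch capacity, the engine token limit, and the available KV-cache capacity, and is then split into full-length dummy requests plus an optional shorter remainder request.

# Minimal sketch of the new capping arithmetic (hypothetical placeholder values).
batch_size = 8                 # placeholder for self.batch_size
num_tokens_per_request = 256   # placeholder warmup request length
max_num_tokens = 8192          # placeholder for self.max_num_tokens
available_blocks = 20          # placeholder free KV-cache block count
tokens_per_block = 64          # placeholder KV-cache block granularity

# Clamp the tunable budget to batch capacity, engine limit, and KV-cache capacity.
maximum_tunable_num_tokens = min(
    batch_size * num_tokens_per_request,    # 2048
    max_num_tokens,                         # 8192
    available_blocks * tokens_per_block)    # 1280

# Split the budget into full-length dummy requests plus an optional remainder.
full_len_request_num = maximum_tunable_num_tokens // num_tokens_per_request
remaining_tokens = maximum_tunable_num_tokens % num_tokens_per_request
request_num = full_len_request_num if remaining_tokens == 0 else full_len_request_num + 1

print(maximum_tunable_num_tokens, full_len_request_num, remaining_tokens, request_num)
# -> 1280 5 0 5

This also shows why the old early-return guard is no longer needed: when the KV cache is tight, the warmup batch is shrunk to fit instead of skipping autotune warmup entirely.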