13 changes: 7 additions & 6 deletions tensorrt_llm/_torch/pyexecutor/model_engine.py
@@ -540,16 +540,17 @@ def get_autotune_warmup_request():
 
             available_blocks = kv_cache_manager.get_num_free_blocks()
 
+            maximum_tunable_num_tokens = min(
+                self.batch_size * num_tokens_per_request, self.max_num_tokens,
+                available_blocks * kv_cache_manager.tokens_per_block)
+
             # Calculate number of full-length requests and remaining tokens
             # Each request has num_tokens_per_request tokens, except possibly the last one
-            full_len_request_num = self.max_num_tokens // num_tokens_per_request
-            remaining_tokens = self.max_num_tokens % num_tokens_per_request
+            full_len_request_num = maximum_tunable_num_tokens // num_tokens_per_request
+            remaining_tokens = maximum_tunable_num_tokens % num_tokens_per_request
 
             request_num = full_len_request_num if remaining_tokens == 0 else full_len_request_num + 1
 
-            if self.max_num_tokens > available_blocks * kv_cache_manager.tokens_per_block:
-                return None, None
-
             requests = kv_cache_manager.add_dummy_requests(
                 request_ids=list(range(full_len_request_num)),
                 token_nums=[num_tokens_per_request] * full_len_request_num,
@@ -573,7 +574,7 @@ def get_autotune_warmup_request():
             result.context_requests = requests
             result.generation_requests = []
 
-            return result, _create_extra_inputs(1, self.max_num_tokens)
+            return result, _create_extra_inputs(1, maximum_tunable_num_tokens)
 
         @contextlib.contextmanager
         def release_batch(result):
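For intuition: the change replaces the hard bail-out (returning None, None when self.max_num_tokens exceeded the free KV-cache capacity) with a clamped warmup budget. The number of tunable tokens is capped by the batch capacity, the engine's max_num_tokens, and the free KV-cache blocks, and the dummy-request count is derived from that budget. The standalone sketch below mirrors that arithmetic with purely illustrative numbers (none of them are real defaults):

# Standalone sketch of the clamping arithmetic above; all values are illustrative.
batch_size = 8
num_tokens_per_request = 512
max_num_tokens = 8192
available_blocks = 100      # stand-in for kv_cache_manager.get_num_free_blocks()
tokens_per_block = 32       # stand-in for kv_cache_manager.tokens_per_block

# Budget is the smallest of: batch capacity, engine limit, free KV-cache capacity.
maximum_tunable_num_tokens = min(
    batch_size * num_tokens_per_request,        # 4096
    max_num_tokens,                             # 8192
    available_blocks * tokens_per_block)        # 3200  -> budget = 3200

# Split the budget into full-length dummy requests plus one shorter remainder request.
full_len_request_num = maximum_tunable_num_tokens // num_tokens_per_request   # 6
remaining_tokens = maximum_tunable_num_tokens % num_tokens_per_request        # 128
request_num = full_len_request_num if remaining_tokens == 0 else full_len_request_num + 1  # 7

print(maximum_tunable_num_tokens, full_len_request_num, remaining_tokens, request_num)

Because the budget can never exceed available_blocks * tokens_per_block, the warmup request always fits in the free KV cache, which is why the early return that previously skipped autotuning is removed.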
@@ -224,18 +224,23 @@ def test_disaggregated_llama_context_capacity(model, enable_cuda_graph,
                                               generation_overlap):
     # Test the case where the context worker capacity is exceeded and
     # needs to wait for the generation worker to complete.
+    # TODO: The autotuner is disabled due to an illegal CUDA instruction error on H100.
+    # H200 does not have this issue, possibly because of its larger GPU memory.
+    # This should be investigated further.
     worker_pytorch_configs = []
 
     # Context worker
     worker_pytorch_configs.append(
         PyTorchConfig(disable_overlap_scheduler=True,
                       kv_cache_dtype="auto",
+                      autotuner_enabled=False,
                       use_cuda_graph=enable_cuda_graph))
 
     # Generation worker
     worker_pytorch_configs.append(
         PyTorchConfig(disable_overlap_scheduler=not generation_overlap,
                       kv_cache_dtype="auto",
+                      autotuner_enabled=False,
                       use_cuda_graph=enable_cuda_graph))
 
     kv_cache_configs = [KvCacheConfig(max_tokens=128) for _ in range(2)]
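For reference, a minimal sketch of the worker configuration as it reads after this change; the import paths are assumptions for illustration and may differ from the test module's actual imports:

# Import paths are assumed for illustration; the test module may import these differently.
from tensorrt_llm._torch.pyexecutor.config import PyTorchConfig
from tensorrt_llm.llmapi import KvCacheConfig

enable_cuda_graph = False     # illustrative stand-ins for the pytest parameters
generation_overlap = False

worker_pytorch_configs = [
    # Context worker: overlap scheduler off, autotuner off (see the TODO above).
    PyTorchConfig(disable_overlap_scheduler=True,
                  kv_cache_dtype="auto",
                  autotuner_enabled=False,
                  use_cuda_graph=enable_cuda_graph),
    # Generation worker: overlap scheduling controlled by the test parameter.
    PyTorchConfig(disable_overlap_scheduler=not generation_overlap,
                  kv_cache_dtype="auto",
                  autotuner_enabled=False,
                  use_cuda_graph=enable_cuda_graph),
]

# A small KV cache (128 tokens) per worker forces the context worker to exceed capacity.
kv_cache_configs = [KvCacheConfig(max_tokens=128) for _ in range(2)]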