21
21
from .config_utils import is_mla , is_nemotron_hybrid
22
22
from .kv_cache_transceiver import AttentionTypeCpp , create_kv_cache_transceiver
23
23
from .llm_request import ExecutorResponse
24
- from .model_engine import (DRAFT_KV_CACHE_MANAGER_KEY , KV_CACHE_MANAGER_KEY ,
25
- PyTorchModelEngine )
24
+ from .model_engine import PyTorchModelEngine
26
25
from .py_executor import PyExecutor
27
26
from .resource_manager import (KVCacheManager , MambaHybridCacheManager ,
28
- PeftCacheManager , ResourceManager )
27
+ PeftCacheManager , ResourceManager ,
28
+ ResourceManagerType )
29
29
from .sampler import (EarlyStopSampler , TorchSampler , TorchStarAttentionSampler ,
30
30
TRTLLMSampler )
31
31
from .scheduler import (BindCapacityScheduler , BindMicroBatchScheduler ,
@@ -245,7 +245,7 @@ def estimate_max_tokens(self, py_executor: PyExecutor) -> None:
245
245
f"Memory used outside torch (e.g., NCCL and CUDA graphs) in memory usage profiling: { extra_cost / (GB ):.2f} GiB"
246
246
)
247
247
kv_stats = py_executor .resource_manager .resource_managers .get (
248
- "kv_cache_manager" ).get_kv_cache_stats ()
248
+ ResourceManagerType . KV_CACHE_MANAGER ).get_kv_cache_stats ()
249
249
250
250
kv_cache_max_tokens = self ._cal_max_tokens (
251
251
peak_memory , total_gpu_memory , fraction ,
@@ -349,7 +349,7 @@ def _create_kv_cache_manager(
349
349
spec_config = spec_config ,
350
350
)
351
351
# KVCacheManager (Non-draft) modifies the max_seq_len field, update it to executor_config
352
- if model_engine .kv_cache_manager_key == KV_CACHE_MANAGER_KEY :
352
+ if model_engine .kv_cache_manager_key == ResourceManagerType . KV_CACHE_MANAGER :
353
353
executor_config .max_seq_len = kv_cache_manager .max_seq_len
354
354
355
355
return kv_cache_manager
@@ -360,17 +360,19 @@ def build_managers(self, resources: Dict) -> None:
360
360
draft_kv_cache_manager = self ._create_kv_cache_manager (
361
361
self ._draft_model_engine
362
362
) if self ._draft_model_engine is not None else None
363
- resources [KV_CACHE_MANAGER_KEY ] = kv_cache_manager
364
- resources [DRAFT_KV_CACHE_MANAGER_KEY ] = draft_kv_cache_manager
363
+ resources [ResourceManagerType .KV_CACHE_MANAGER ] = kv_cache_manager
364
+ resources [
365
+ ResourceManagerType .DRAFT_KV_CACHE_MANAGER ] = draft_kv_cache_manager
365
366
366
367
def teardown_managers(self, resources: Dict) -> None:
    """Clean up KV caches for model and draft model (if applicable).

    Args:
        resources: Mapping of resource managers. It must contain entries
            under ``ResourceManagerType.KV_CACHE_MANAGER`` and
            ``ResourceManagerType.DRAFT_KV_CACHE_MANAGER``; the draft entry
            may hold a falsy value (e.g. ``None``) when no draft manager
            was created.

    Raises:
        KeyError: If either of the two entries is missing from
            ``resources``.
    """
    main_key = ResourceManagerType.KV_CACHE_MANAGER
    draft_key = ResourceManagerType.DRAFT_KV_CACHE_MANAGER

    # Shut the main KV cache manager down first, then drop its entry.
    resources[main_key].shutdown()
    del resources[main_key]

    # Only call shutdown() on the draft manager when one is actually stored.
    draft_kv_cache_manager = resources[draft_key]
    if draft_kv_cache_manager:
        draft_kv_cache_manager.shutdown()
    # NOTE(review): indentation was lost in the diff view this block was
    # recovered from; the draft entry is removed unconditionally here so no
    # placeholder is left behind -- confirm against the committed file.
    del resources[draft_key]
374
376
375
377
376
378
def create_py_executor_instance (
@@ -386,7 +388,7 @@ def create_py_executor_instance(
386
388
sampler ,
387
389
lora_config : Optional [LoraConfig ] = None ,
388
390
garbage_collection_gen0_threshold : Optional [int ] = None ) -> PyExecutor :
389
- kv_cache_manager = resources .get (KV_CACHE_MANAGER_KEY , None )
391
+ kv_cache_manager = resources .get (ResourceManagerType . KV_CACHE_MANAGER , None )
390
392
391
393
spec_config = model_engine .spec_config
392
394
if mapping .is_last_pp_rank (
@@ -463,22 +465,23 @@ def create_py_executor_instance(
463
465
model_config = model_binding_config ,
464
466
world_config = world_config ,
465
467
)
466
- resources ["peft_cache_manager" ] = peft_cache_manager
468
+ resources [ResourceManagerType . PEFT_CACHE_MANAGER ] = peft_cache_manager
467
469
model_engine .set_lora_model_config (
468
470
lora_config .lora_target_modules ,
469
471
lora_config .trtllm_modules_to_hf_modules )
470
472
471
473
max_num_sequences = executor_config .max_batch_size * mapping .pp_size
472
474
473
- resources ["seq_slot_manager" ] = SeqSlotManager (max_num_sequences )
475
+ resources [ResourceManagerType .SEQ_SLOT_MANAGER ] = SeqSlotManager (
476
+ max_num_sequences )
474
477
475
478
resource_manager = ResourceManager (resources )
476
479
477
480
# Make sure the kv cache manager is always invoked last as it could
478
481
# depend on the results of other resource managers.
479
482
if kv_cache_manager is not None :
480
- resource_manager .resource_managers .move_to_end ("kv_cache_manager" ,
481
- last = True )
483
+ resource_manager .resource_managers .move_to_end (
484
+ ResourceManagerType . KV_CACHE_MANAGER , last = True )
482
485
483
486
capacity_scheduler = BindCapacityScheduler (
484
487
max_num_sequences ,
0 commit comments