Commit 293854a

remove PyTorchConfig completely

Signed-off-by: junq <[email protected]>
1 parent 89e0117 · commit 293854a

File tree

11 files changed: +82, -305 lines

examples/llm-eval/lm-eval-harness/lm_eval_tensorrt_llm.py

Lines changed: 2 additions & 4 deletions
@@ -34,10 +34,10 @@
 import tensorrt_llm
 from tensorrt_llm import LLM as TORCH_LLM
 from tensorrt_llm._tensorrt_engine import LLM as TRT_LLM
-from tensorrt_llm._torch.pyexecutor.config import PyTorchConfig
 from tensorrt_llm.bindings.executor import DecodingConfig
 from tensorrt_llm.llmapi import KvCacheConfig as TRT_KvCacheConfig
 from tensorrt_llm.llmapi import RequestOutput, SamplingParams
+from tensorrt_llm.llmapi.llm_args import MoeConfig

 logger = logging.getLogger(__name__)

@@ -98,10 +98,8 @@ def __init__(
         pytorch_config_params = {
             'cuda_graph_config': {} if use_cuda_graph else None,
             "print_iter_log": False,
+            'moe_config': MoeConfig(backend=self.moe_backend)
         }
-        if hasattr(PyTorchConfig, "moe_backend"):
-            pytorch_config_params["moe_backend"] = self.moe_backend
-            print(f"Info: moe_backend is set to {self.moe_backend}")

         # stop words not currently supported by torch backend
         self.use_stop_words = False
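For reference, a minimal sketch of what the harness builds after this change. The values below are illustrative stand-ins for the evaluator's own arguments (they are not part of the diff), and the dict is presumably forwarded as keyword arguments to the torch-backend LLM:

# Sketch only: use_cuda_graph and moe_backend are assumed example values;
# in the harness they come from the evaluator's own configuration.
from tensorrt_llm.llmapi.llm_args import MoeConfig

use_cuda_graph = True
moe_backend = "CUTLASS"  # assumed example value

pytorch_config_params = {
    'cuda_graph_config': {} if use_cuda_graph else None,  # enable CUDA graphs or not
    "print_iter_log": False,
    'moe_config': MoeConfig(backend=moe_backend),  # replaces the old hasattr(PyTorchConfig, "moe_backend") probe
}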

tensorrt_llm/_torch/auto_deploy/llm.py

Lines changed: 1 addition & 1 deletion
@@ -175,7 +175,7 @@ def __init__(self, **kwargs):
         self._executor = DemoGenerationExecutor(
             world_size=self.args.world_size,
             tokenizer=self.tokenizer,
-            ad_config=self.args.get_pytorch_backend_config(),
+            ad_config=self.args,
         )

     def __del__(self):
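With get_pytorch_backend_config() removed (see the llm_args.py hunk below), the AutoDeploy LLM hands its args object to the executor directly. A hedged sketch of the before/after, mirroring the hunks in this commit; the removed helper effectively rebuilt a copy of the args object, so passing it through avoids that round-trip:

# Sketch only: comments describe the change; `self` stands for the AutoDeploy LLM above.
# Before (removed in this commit): a copied LlmArgs object was created,
# roughly type(self.args)(**self.args.to_llm_kwargs()).
ad_config = self.args.get_pytorch_backend_config()
# After: the AutoDeploy LlmArgs instance is passed through unchanged.
ad_config = self.args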

tensorrt_llm/_torch/auto_deploy/llm_args.py

Lines changed: 0 additions & 7 deletions
@@ -403,13 +403,6 @@ def validate_and_init_tokenizer(self):
         """Skip tokenizer initialization in config. We do this in the AutoDeploy LLM class."""
         return self

-    ### UTILITY METHODS ############################################################################
-    # TODO: Remove this after the PyTorch backend is fully migrated to LlmArgs from ExecutorConfig
-    def get_pytorch_backend_config(self) -> "LlmArgs":
-        """Return the LlmArgs (self) object."""
-        # TODO: can we just pass through self directly??
-        return type(self)(**self.to_llm_kwargs())
-
     def to_dict(self) -> Dict:
         """Convert model to a dictionary such that cls(**self.to_dict()) == self."""
         self_dict = super().to_dict()

tensorrt_llm/_torch/auto_deploy/shim/ad_executor.py

Lines changed: 0 additions & 2 deletions
@@ -326,8 +326,6 @@ def create_autodeploy_executor(ad_config: LlmArgs, tokenizer: Optional[Tokenizer
     dist.initialize_or_skip(rank, world_size, port)

     # some config
-    msg = "pytorch_backend_config must be an AD LlmArgs object"
-    assert isinstance(ad_config, LlmArgs), msg
     assert ad_config.max_beam_width <= 1, "_autodeploy + beam_search is not supported"

     max_num_sequences = ad_config.max_batch_size * dist_mapping.pp_size

tensorrt_llm/_torch/pyexecutor/_util.py

Lines changed: 22 additions & 22 deletions
@@ -25,7 +25,6 @@
 from ..attention_backend import get_sparse_attn_kv_cache_manager
 from ..model_config import ModelConfig
 from ..speculative import get_num_extra_kv_tokens, get_spec_decoder
-from .config import PyTorchConfig
 from .config_utils import is_mla, is_nemotron_hybrid, is_qwen3_next
 from .guided_decoder import GuidedDecoder
 from .kv_cache_connector import KvCacheConnectorManager
@@ -73,7 +72,7 @@ def __init__(
         max_seq_len: int,
         max_batch_size: int,
         kv_cache_config: KvCacheConfig,
-        pytorch_backend_config: PyTorchConfig,
+        llm_args: TorchLlmArgs,
         speculative_config: SpeculativeConfig,
         sparse_attention_config: SparseAttentionConfig,
         profiling_stage_data: Optional[dict],
@@ -86,7 +85,7 @@ def __init__(
         self._max_num_tokens = max_num_tokens
         self._max_beam_width = max_beam_width
         self._kv_connector_manager = kv_connector_manager
-        self._pytorch_backend_config = pytorch_backend_config
+        self._llm_args = llm_args
         self._speculative_config = speculative_config
         self._sparse_attention_config = sparse_attention_config
         self._tokens_per_block = tokens_per_block
@@ -248,9 +247,8 @@ def _get_token_num_for_estimation(self) -> int:
         # estimate_max_kv_cache_tokens submits self._dummy_reqs
         num_cache_blocks = 0
         num_extra_tokens_per_seq = 1  # account for generated tokens
-        pytorch_backend_config = self._pytorch_backend_config
         spec_cfg = self._speculative_config
-        if not pytorch_backend_config.disable_overlap_scheduler:
+        if not self._llm_args.disable_overlap_scheduler:
             num_extra_tokens_per_seq = num_extra_tokens_per_seq + 1
         if spec_cfg is not None:
             num_extra_tokens_per_seq += spec_cfg.max_total_draft_tokens
@@ -653,7 +651,7 @@ def create_py_executor_instance(
     dist,
     resources,
     mapping,
-    pytorch_backend_config,
+    llm_args,
     ctx_chunk_config,
     model_engine,
     start_worker,
@@ -679,7 +677,7 @@ def create_py_executor_instance(
         f"max_seq_len={max_seq_len}, max_num_requests={max_batch_size}, max_num_tokens={max_num_tokens}, max_batch_size={max_batch_size}"
     )

-    for key, value in pytorch_backend_config.extra_resource_managers.items():
+    for key, value in llm_args.extra_resource_managers.items():
         if key in resources:
             raise ValueError(
                 f"Cannot overwrite existing resource manager {key}.")
@@ -804,8 +802,7 @@ def create_py_executor_instance(
         drafter=drafter,
         dist=dist,
         max_num_sequences=max_num_sequences,
-        disable_overlap_scheduler=pytorch_backend_config.
-        disable_overlap_scheduler,
+        disable_overlap_scheduler=llm_args.disable_overlap_scheduler,
         max_batch_size=max_batch_size,
         max_beam_width=max_beam_width,
         max_draft_len=spec_config.max_draft_len
@@ -840,13 +837,11 @@ def create_torch_sampler_args(mapping: Mapping, *, max_seq_len: int,
     )


-def instantiate_sampler(engine: PyTorchModelEngine,
-                        pytorch_backend_config: PyTorchConfig, mapping: Mapping,
-                        max_batch_size: int, max_beam_width: int,
-                        max_seq_len: int, mm_encoder_only: bool,
-                        speculative_config: SpeculativeConfig,
-                        decoding_config: trtllm.DecodingConfig,
-                        kv_cache_config: KvCacheConfig):
+def instantiate_sampler(
+        engine: PyTorchModelEngine, llm_args: TorchLlmArgs, mapping: Mapping,
+        max_batch_size: int, max_beam_width: int, max_seq_len: int,
+        mm_encoder_only: bool, speculative_config: SpeculativeConfig,
+        decoding_config: trtllm.DecodingConfig, kv_cache_config: KvCacheConfig):
     sampler_args = create_torch_sampler_args(
         mapping,
         max_seq_len=engine.max_seq_len,
@@ -856,7 +851,7 @@ def instantiate_sampler(engine: PyTorchModelEngine,
     decoding_mode = get_decoding_mode(decoding_config=decoding_config,
                                       max_beam_width=max_beam_width)
     if mapping.cp_config.get('cp_type') == CpType.STAR:
-        assert pytorch_backend_config.attn_backend == "FLASHINFER_STAR_ATTENTION", "attention backend of star attention should be 'FLASHINFER_STAR_ATTENTION'"
+        assert llm_args.attn_backend == "FLASHINFER_STAR_ATTENTION", "attention backend of star attention should be 'FLASHINFER_STAR_ATTENTION'"
         return TorchSampler(sampler_args)
     if engine.spec_config is not None and engine.spec_config.spec_dec_mode.has_spec_decoder(
     ):
@@ -865,15 +860,15 @@ def instantiate_sampler(engine: PyTorchModelEngine,
     if mm_encoder_only:
         # NOTE: handle model outputs specially for mm encoder executor/engine
         return EarlyStopWithMMResult()
-    if pytorch_backend_config.sampler_type == SamplerType.TRTLLMSampler or (
-            pytorch_backend_config.sampler_type == SamplerType.auto
+    if llm_args.sampler_type == SamplerType.TRTLLMSampler or (
+            llm_args.sampler_type == SamplerType.auto
             and decoding_mode.isBeamSearch()):
         logger.debug(f"DecodingMode: {decoding_mode.name}")
         return TRTLLMSampler(engine.model,
                              engine.dtype,
                              mapping,
                              decoding_mode,
-                             pytorch_backend_config.disable_overlap_scheduler,
+                             llm_args.disable_overlap_scheduler,
                              max_seq_len=max_seq_len,
                              max_batch_size=max_batch_size,
                              max_beam_width=max_beam_width,
@@ -935,7 +930,12 @@ def _try_infer_num_experts(model_config: ModelConfig) -> int:
     return num_experts


-def _adjust_torch_mem_fraction(pytorch_backend_config: PyTorchConfig):
+def _adjust_torch_mem_fraction():
+    # If true, adjust PyTorch CUDA memory fraction to correspond to the
+    # total GPU memory minus the statically allocated engine memory.
+    # If false, set the PyTorch CUDA memory fraction to 1.0.
+    _limit_torch_cuda_mem_fraction: bool = True
+
     # FIXME: PyTorch only uses the garbage_collection_threshold setting
     # if a memory fraction is set, cf.
     # https://github.com/pytorch/pytorch/blob/cd995bfb2aac8891465809be3ce29543bd524287/c10/cuda/CUDACachingAllocator.cpp#L1357
@@ -964,7 +964,7 @@ def _adjust_torch_mem_fraction(pytorch_backend_config: PyTorchConfig):
     # lead PyTorch to release all unused memory before hitting the set fraction. This
     # still mitigates OOM, although at a higher performance impact, because it
     # effectively resets the allocator cache.
-    if not pytorch_backend_config._limit_torch_cuda_mem_fraction:
+    if not _limit_torch_cuda_mem_fraction:
         return
     mem_reserved = torch.cuda.memory_reserved()
     mem_free, mem_total = torch.cuda.mem_get_info()
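Taken together, the _util.py changes make the TorchLlmArgs object the single carrier of executor knobs (disable_overlap_scheduler, extra_resource_managers, sampler_type, attn_backend). A hedged sketch of the new instantiate_sampler call shape; every name other than the parameters shown in the hunks above is illustrative, and the assumed call site is the executor-creation path:

# Sketch only: assumes the surrounding objects (model_engine, llm_args, mapping,
# the config objects, and the size limits) were already built by the caller.
sampler = instantiate_sampler(
    model_engine,   # PyTorchModelEngine
    llm_args,       # TorchLlmArgs replaces the deleted PyTorchConfig
    mapping,
    max_batch_size, max_beam_width, max_seq_len,
    mm_encoder_only, speculative_config, decoding_config, kv_cache_config)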

tensorrt_llm/_torch/pyexecutor/config.py

Lines changed: 0 additions & 139 deletions
This file was deleted.

tensorrt_llm/_torch/pyexecutor/model_engine.py

Lines changed: 1 addition & 2 deletions
@@ -54,13 +54,12 @@
 from ..utils import (get_model_extra_attrs,
                      set_per_request_piecewise_cuda_graph_flag,
                      set_torch_compiling, with_model_extra_attrs)
-from .config import _construct_checkpoint_loader
 from .config_utils import is_mla
 from .cuda_graph_runner import CUDAGraphRunner
 from .guided_decoder import CapturableGuidedDecoder
 from .layerwise_nvtx_marker import LayerwiseNvtxMarker
 from .llm_request import get_draft_token_length
-from .model_loader import ModelLoader
+from .model_loader import ModelLoader, _construct_checkpoint_loader
 from .resource_manager import (BaseResourceManager, KVCacheManager,
                                ResourceManager, ResourceManagerType)
 from .sampler import SampleStateTensors
