6 changes: 3 additions & 3 deletions docs/source/torch.md
@@ -11,7 +11,7 @@ The PyTorch backend of TensorRT-LLM is available in version 0.17 and later. You

## Quick Start

-Here is a simple example to show how to use `tensorrt_llm._torch.LLM` API with Llama model.
+Here is a simple example to show how to use `tensorrt_llm.LLM` API with Llama model.

```{literalinclude} ../../examples/pytorch/quickstart.py
:language: python
@@ -24,7 +24,7 @@ The PyTorch backend supports FP8 and NVFP4 quantization. You can pass quantized
which are generated by [TensorRT Model Optimizer](https://github.com/NVIDIA/TensorRT-Model-Optimizer).

```python
-from tensorrt_llm._torch import LLM
+from tensorrt_llm import LLM
llm = LLM(model='nvidia/Llama-3.1-8B-Instruct-FP8')
llm.generate("Hello, my name is")
```
@@ -44,7 +44,7 @@ The PyTorch backend supports most of the sampling features that are supported on
In order to use this feature, it is necessary to enable option `enable_trtllm_sampler` in the `LLM` class, and pass a `SamplingParams` object with the desired options as well. The following example prepares two identical prompts which will give different results due to the sampling parameters chosen:

```python
-from tensorrt_llm._torch import LLM
+from tensorrt_llm import LLM
llm = LLM(model='nvidia/Llama-3.1-8B-Instruct-FP8',
enable_trtllm_sampler=True)
sampling_params = SamplingParams(
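For reference, a minimal sketch of the updated snippet in full, since the hunk above truncates at `SamplingParams(`; the sampling values shown are illustrative assumptions, not part of this change:

```python
from tensorrt_llm import LLM, SamplingParams

llm = LLM(model='nvidia/Llama-3.1-8B-Instruct-FP8',
          enable_trtllm_sampler=True)

# Two identical prompts; non-greedy sampling makes their outputs diverge.
sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
outputs = llm.generate(["Hello, my name is", "Hello, my name is"],
                       sampling_params)
for output in outputs:
    print(output.outputs[0].text)
```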
2 changes: 1 addition & 1 deletion docs/source/torch/adding_new_model.md
@@ -186,7 +186,7 @@ __all__ = [
Alternatively, you can register the new model as an out-of-tree model, so that you can use the new model without touching the TensorRT-LLM codebase. To do so, place `modeling_mymodel.py` (and potentially `configuration_mymodel.py`) in your working directory, and import the modeling code in your script:

```python
-from tensorrt_llm._torch import LLM
+from tensorrt_llm import LLM
import modeling_mymodel

def main():
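The hunk cuts off at `def main():`; a sketch of how such an out-of-tree script typically continues, with the checkpoint path and generation call as assumptions:

```python
from tensorrt_llm import LLM

import modeling_mymodel  # importing this module registers the model class


def main():
    # Placeholder path; point this at your own model checkpoint.
    llm = LLM(model='path/to/mymodel_checkpoint')
    for output in llm.generate(["Hello, my name is"]):
        print(output.outputs[0].text)


if __name__ == '__main__':
    main()
```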
4 changes: 2 additions & 2 deletions docs/source/torch/arch_overview.md
@@ -5,10 +5,10 @@ Besides TensorRT, PyTorch can also serve as the backend for TensorRT-LLM. This d

## Top Level API

-The interface for PyTorch backend is `tensorrt._torch.LLM`.
+The interface for PyTorch backend is `tensorrt_llm.LLM`.

```python
-from tensorrt_llm._torch import LLM
+from tensorrt_llm import LLM
llm = LLM(model=<path_to_llama_from_hf>)
```

3 changes: 2 additions & 1 deletion examples/apps/chat.py
@@ -5,7 +5,8 @@
import colorama
from transformers import AutoTokenizer, PreTrainedTokenizer

-from tensorrt_llm.llmapi import LLM, BuildConfig, KvCacheConfig, SamplingParams
+from tensorrt_llm._tensorrt_engine import LLM
+from tensorrt_llm.llmapi import BuildConfig, KvCacheConfig, SamplingParams


class LlmConsole(code.InteractiveConsole):
3 changes: 2 additions & 1 deletion examples/apps/fastapi_server.py
@@ -18,8 +18,9 @@
from fastapi import FastAPI, Request
from fastapi.responses import JSONResponse, Response, StreamingResponse

+from tensorrt_llm._tensorrt_engine import LLM
from tensorrt_llm.executor import CppExecutorError, RequestError
-from tensorrt_llm.llmapi import LLM, BuildConfig, KvCacheConfig, SamplingParams
+from tensorrt_llm.llmapi import BuildConfig, KvCacheConfig, SamplingParams

TIMEOUT_KEEP_ALIVE = 5 # seconds.

3 changes: 2 additions & 1 deletion examples/auto_deploy/build_and_run_ad.py
@@ -7,11 +7,12 @@
import torch
from simple_config import SimpleConfig

+from tensorrt_llm._tensorrt_engine import LLM
from tensorrt_llm._torch.auto_deploy.models import ModelFactoryRegistry
from tensorrt_llm._torch.auto_deploy.shim import DemoLLM
from tensorrt_llm._torch.auto_deploy.utils.benchmark import benchmark, store_benchmark_results
from tensorrt_llm._torch.auto_deploy.utils.logger import ad_logger
-from tensorrt_llm.llmapi.llm import LLM, RequestOutput
+from tensorrt_llm.llmapi.llm import RequestOutput
from tensorrt_llm.llmapi.llm_args import TorchCompileConfig
from tensorrt_llm.sampling_params import SamplingParams

3 changes: 2 additions & 1 deletion examples/llm-api/llm_auto_parallel.py
@@ -1,5 +1,6 @@
### Automatic Parallelism with LLM
-from tensorrt_llm import LLM, SamplingParams
+from tensorrt_llm import SamplingParams
+from tensorrt_llm._tensorrt_engine import LLM


def main():
4 changes: 2 additions & 2 deletions examples/llm-api/llm_eagle2_decoding.py
@@ -1,7 +1,7 @@
### Generate Text Using Eagle2 Decoding

-from tensorrt_llm import LLM, SamplingParams
-from tensorrt_llm.llmapi import (LLM, EagleDecodingConfig, KvCacheConfig,
+from tensorrt_llm._tensorrt_engine import LLM
+from tensorrt_llm.llmapi import (EagleDecodingConfig, KvCacheConfig,
SamplingParams)


6 changes: 3 additions & 3 deletions examples/llm-api/llm_eagle_decoding.py
@@ -1,8 +1,8 @@
### Generate Text Using Eagle Decoding

-from tensorrt_llm import LLM, SamplingParams
-from tensorrt_llm.llmapi import (LLM, EagleDecodingConfig, KvCacheConfig,
-                                 SamplingParams)
+from tensorrt_llm import SamplingParams
+from tensorrt_llm._tensorrt_engine import LLM
+from tensorrt_llm.llmapi import EagleDecodingConfig, KvCacheConfig


def main():
3 changes: 2 additions & 1 deletion examples/llm-api/llm_guided_decoding.py
@@ -1,5 +1,6 @@
### Generate text with guided decoding
-from tensorrt_llm import LLM, SamplingParams
+from tensorrt_llm import SamplingParams
+from tensorrt_llm._tensorrt_engine import LLM
from tensorrt_llm.llmapi import GuidedDecodingParams


3 changes: 2 additions & 1 deletion examples/llm-api/llm_inference.py
@@ -1,7 +1,8 @@
### Generate text
import tempfile

-from tensorrt_llm import LLM, SamplingParams
+from tensorrt_llm import SamplingParams
+from tensorrt_llm._tensorrt_engine import LLM


def main():
3 changes: 2 additions & 1 deletion examples/llm-api/llm_inference_async.py
@@ -1,7 +1,8 @@
### Generate Text Asynchronously
import asyncio

-from tensorrt_llm import LLM, SamplingParams
+from tensorrt_llm import SamplingParams
+from tensorrt_llm._tensorrt_engine import LLM


def main():
3 changes: 2 additions & 1 deletion examples/llm-api/llm_inference_async_streaming.py
@@ -1,7 +1,8 @@
### Generate Text in Streaming
import asyncio

-from tensorrt_llm import LLM, SamplingParams
+from tensorrt_llm import SamplingParams
+from tensorrt_llm._tensorrt_engine import LLM


def main():
3 changes: 2 additions & 1 deletion examples/llm-api/llm_inference_customize.py
@@ -1,7 +1,8 @@
### Generate text with customization
import tempfile

-from tensorrt_llm.llmapi import LLM, BuildConfig, KvCacheConfig, SamplingParams
+from tensorrt_llm._tensorrt_engine import LLM
+from tensorrt_llm.llmapi import BuildConfig, KvCacheConfig, SamplingParams


def main():
3 changes: 2 additions & 1 deletion examples/llm-api/llm_inference_distributed.py
@@ -1,5 +1,6 @@
### Distributed LLM Generation
-from tensorrt_llm import LLM, SamplingParams
+from tensorrt_llm import SamplingParams
+from tensorrt_llm._tensorrt_engine import LLM


def main():
3 changes: 2 additions & 1 deletion examples/llm-api/llm_inference_kv_events.py
@@ -1,6 +1,7 @@
### Get KV Cache Events

-from tensorrt_llm import LLM, SamplingParams
+from tensorrt_llm import SamplingParams
+from tensorrt_llm._tensorrt_engine import LLM
from tensorrt_llm.llmapi import KvCacheConfig


2 changes: 1 addition & 1 deletion examples/llm-api/llm_logits_processor.py
@@ -3,7 +3,7 @@

import torch

-from tensorrt_llm import LLM
+from tensorrt_llm._tensorrt_engine import LLM
from tensorrt_llm.sampling_params import (BatchedLogitsProcessor,
LogitsProcessor, SamplingParams)

4 changes: 2 additions & 2 deletions examples/llm-api/llm_lookahead_decoding.py
@@ -1,6 +1,6 @@
### Generate Text Using Lookahead Decoding
-from tensorrt_llm import LLM, SamplingParams
-from tensorrt_llm.llmapi import (LLM, BuildConfig, KvCacheConfig,
+from tensorrt_llm._tensorrt_engine import LLM
+from tensorrt_llm.llmapi import (BuildConfig, KvCacheConfig,
LookaheadDecodingConfig, SamplingParams)


4 changes: 2 additions & 2 deletions examples/llm-api/llm_medusa_decoding.py
@@ -2,8 +2,8 @@
import argparse
from pathlib import Path

-from tensorrt_llm import LLM, SamplingParams
-from tensorrt_llm.llmapi import (LLM, BuildConfig, KvCacheConfig,
+from tensorrt_llm._tensorrt_engine import LLM
+from tensorrt_llm.llmapi import (BuildConfig, KvCacheConfig,
MedusaDecodingConfig, SamplingParams)


3 changes: 2 additions & 1 deletion examples/llm-api/llm_multilora.py
@@ -1,8 +1,9 @@
### Generate text with multiple LoRA adapters
from huggingface_hub import snapshot_download

-from tensorrt_llm import LLM, BuildConfig
+from tensorrt_llm._tensorrt_engine import LLM
from tensorrt_llm.executor import LoRARequest
+from tensorrt_llm.llmapi import BuildConfig
from tensorrt_llm.lora_manager import LoraConfig


3 changes: 2 additions & 1 deletion examples/llm-api/llm_quantization.py
@@ -3,7 +3,8 @@

import torch

-from tensorrt_llm import LLM, SamplingParams
+from tensorrt_llm import SamplingParams
+from tensorrt_llm._tensorrt_engine import LLM
from tensorrt_llm.llmapi import CalibConfig, QuantAlgo, QuantConfig

major, minor = torch.cuda.get_device_capability()
3 changes: 2 additions & 1 deletion examples/llm-api/quickstart_example.py
@@ -1,4 +1,5 @@
-from tensorrt_llm import LLM, SamplingParams
+from tensorrt_llm import SamplingParams
+from tensorrt_llm._tensorrt_engine import LLM


def main():
4 changes: 2 additions & 2 deletions examples/llm-eval/lm-eval-harness/lm_eval_tensorrt_llm.py
@@ -32,12 +32,12 @@
from tqdm import tqdm

import tensorrt_llm
-from tensorrt_llm._torch import LLM as TORCH_LLM
+from tensorrt_llm import LLM as TORCH_LLM
+from tensorrt_llm._tensorrt_engine import LLM as TRT_LLM
from tensorrt_llm._torch.pyexecutor.config import PyTorchConfig
from tensorrt_llm.bindings.executor import DecodingConfig
from tensorrt_llm.llmapi import KvCacheConfig as TRT_KvCacheConfig
from tensorrt_llm.llmapi import RequestOutput, SamplingParams
-from tensorrt_llm.llmapi.llm import LLM as TRT_LLM

logger = logging.getLogger(__name__)

2 changes: 1 addition & 1 deletion examples/pytorch/out_of_tree_example/main.py
@@ -1,6 +1,6 @@
import modeling_opt # noqa

-from tensorrt_llm._torch import LLM
+from tensorrt_llm import LLM


def main():
3 changes: 1 addition & 2 deletions examples/pytorch/quickstart.py
@@ -1,5 +1,4 @@
-from tensorrt_llm import SamplingParams
-from tensorrt_llm._torch import LLM
+from tensorrt_llm import LLM, SamplingParams


def main():
3 changes: 1 addition & 2 deletions examples/pytorch/quickstart_advanced.py
@@ -1,7 +1,6 @@
import argparse

-from tensorrt_llm import SamplingParams
-from tensorrt_llm._torch import LLM
+from tensorrt_llm import LLM, SamplingParams
from tensorrt_llm.llmapi import (DraftTargetDecodingConfig, EagleDecodingConfig,
KvCacheConfig, MTPDecodingConfig,
NGramDecodingConfig, TorchCompileConfig)
3 changes: 1 addition & 2 deletions examples/pytorch/star_attention.py
@@ -6,8 +6,7 @@

import torch

-from tensorrt_llm import SamplingParams
-from tensorrt_llm._torch import LLM
+from tensorrt_llm import LLM, SamplingParams
from tensorrt_llm.models.modeling_utils import QuantAlgo, QuantConfig


3 changes: 3 additions & 0 deletions tensorrt_llm/__init__.py
@@ -46,6 +46,7 @@ def _add_trt_llm_dll_directory():
from .disaggregated_params import DisaggregatedParams
from .functional import Tensor, constant
from .llmapi import LLM, LlmArgs
+from .llmapi.llm_args import LlmArgs, TorchLlmArgs, TrtLlmArgs
from .logger import logger
from .mapping import Mapping
from .models.automodel import AutoConfig, AutoModelForCausalLM
@@ -98,6 +99,8 @@ def _add_trt_llm_dll_directory():
'tools',
'LLM',
'LlmArgs',
+'TorchLlmArgs',
+'TrtLlmArgs',
'SamplingParams',
'DisaggregatedParams',
'KvCacheConfig',
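The net effect of this hunk is that the backend-specific argument classes join the package's public API; a tiny sketch, assuming nothing beyond the exports added above:

```python
from tensorrt_llm import LLM, LlmArgs, TorchLlmArgs, TrtLlmArgs

# TorchLlmArgs and TrtLlmArgs are now importable from the package root,
# alongside the existing LlmArgs and the (now PyTorch-backed) LLM class.
print(TorchLlmArgs.__module__)  # tensorrt_llm.llmapi.llm_args
```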
3 changes: 3 additions & 0 deletions tensorrt_llm/_tensorrt_engine/__init__.py
@@ -0,0 +1,3 @@
+from tensorrt_llm.llmapi.llm import _TrtLLM as LLM
+
+__all__ = ['LLM']
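A short sketch of the split this shim creates: the top-level `LLM` is the PyTorch-backend class, while the TensorRT-engine class stays reachable under an explicit import; the model id is a placeholder:

```python
from tensorrt_llm import LLM  # PyTorch backend (the new default)
from tensorrt_llm._tensorrt_engine import LLM as TrtLLM  # TensorRT engine, opt-in

# Placeholder model id, for illustration only.
pytorch_llm = LLM(model='meta-llama/Llama-3.1-8B-Instruct')
trt_llm = TrtLLM(model='meta-llama/Llama-3.1-8B-Instruct')
```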
3 changes: 2 additions & 1 deletion tensorrt_llm/_torch/__init__.py
@@ -1,3 +1,4 @@
from .llm import LLM
+from .model_config import MoeLoadBalancerConfig

__all__ = ["LLM"]
__all__ = ["LLM", "MoeLoadBalancerConfig"]
3 changes: 2 additions & 1 deletion tensorrt_llm/_torch/auto_deploy/shim/demollm.py
@@ -10,11 +10,12 @@
import torch.multiprocessing as mp
from transformers import PreTrainedTokenizerBase

+from ...._tensorrt_engine import LLM
from ....executor import GenerationExecutor
from ....executor.request import GenerationRequest
from ....executor.result import CompletionOutput, GenerationResult
from ....inputs.registry import create_input_processor
-from ....llmapi.llm import LLM, RequestOutput
+from ....llmapi.llm import RequestOutput
from ....llmapi.llm_args import _AutoDeployLlmArgs
from ....llmapi.tokenizer import TokenizerBase
from ....sampling_params import SamplingParams
12 changes: 11 additions & 1 deletion tensorrt_llm/_torch/llm.py
@@ -1,3 +1,13 @@
-from tensorrt_llm.llmapi.llm import _TorchLLM as LLM
+from tensorrt_llm.llmapi.llm import _TorchLLM

+
+class LLM(_TorchLLM):
+
+    def __init__(self, *args, **kwargs):
+        raise ImportError(
+            "_torch.llm is deprecated, please use `from tensorrt_llm import LLM` directly"
+        )
+
+
+# Keep the LLM class to guide the users to use the default LLM class
__all__ = ['LLM']
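In effect, the legacy module still imports, but constructing its `LLM` fails fast with a migration hint; a sketch of what a caller now sees:

```python
# Old path: the import itself succeeds, but instantiation raises.
try:
    from tensorrt_llm._torch.llm import LLM as DeprecatedLLM
    DeprecatedLLM()
except ImportError as err:
    print(err)  # suggests `from tensorrt_llm import LLM` instead

# New path: the top-level export is the PyTorch-backend LLM.
from tensorrt_llm import LLM
```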
3 changes: 2 additions & 1 deletion tensorrt_llm/bench/benchmark/low_latency.py
@@ -10,13 +10,14 @@
from click_option_group import (MutuallyExclusiveOptionGroup, OptionGroup,
optgroup)

+from tensorrt_llm._tensorrt_engine import LLM
from tensorrt_llm.bench.benchmark.utils.asynchronous import async_benchmark
from tensorrt_llm.bench.benchmark.utils.general import generate_warmup_dataset
from tensorrt_llm.bench.benchmark.utils.processes import IterationWriter
from tensorrt_llm.bench.dataclasses.configuration import RuntimeConfig
from tensorrt_llm.bench.dataclasses.general import BenchmarkEnvironment
from tensorrt_llm.bench.dataclasses.reporting import ReportUtility
-from tensorrt_llm.llmapi import LLM, CapacitySchedulerPolicy
+from tensorrt_llm.llmapi import CapacitySchedulerPolicy
from tensorrt_llm.models.modeling_utils import SpeculativeDecodingMode

# isort: off
5 changes: 3 additions & 2 deletions tensorrt_llm/bench/benchmark/throughput.py
@@ -17,15 +17,16 @@
from tensorrt_llm.bench.benchmark.utils.general import (
get_settings_from_engine, get_settings)
# isort: on
-from tensorrt_llm._torch.llm import LLM as PyTorchLLM
+from tensorrt_llm import LLM as PyTorchLLM
+from tensorrt_llm._tensorrt_engine import LLM
from tensorrt_llm.bench.benchmark.utils.general import generate_warmup_dataset
from tensorrt_llm.bench.dataclasses.configuration import RuntimeConfig
from tensorrt_llm.bench.dataclasses.general import BenchmarkEnvironment
from tensorrt_llm.bench.dataclasses.reporting import ReportUtility
from tensorrt_llm.bench.utils.data import (create_dataset_from_stream,
initialize_tokenizer,
update_metadata_for_multimodal)
-from tensorrt_llm.llmapi import LLM, CapacitySchedulerPolicy
+from tensorrt_llm.llmapi import CapacitySchedulerPolicy
from tensorrt_llm.logger import logger
from tensorrt_llm.sampling_params import SamplingParams

3 changes: 2 additions & 1 deletion tensorrt_llm/bench/benchmark/utils/asynchronous.py
@@ -9,7 +9,8 @@
from zmq import PUSH
from zmq.asyncio import Context

-from tensorrt_llm import LLM, SamplingParams
+from tensorrt_llm import SamplingParams
+from tensorrt_llm._tensorrt_engine import LLM
from tensorrt_llm.bench.dataclasses.general import InferenceRequest
from tensorrt_llm.bench.dataclasses.reporting import PerfItemTuple, StatsKeeper
from tensorrt_llm.executor.postproc_worker import PostprocParams
2 changes: 1 addition & 1 deletion tensorrt_llm/bench/build/build.py
@@ -9,7 +9,7 @@
from tensorrt_llm.bench.utils.data import create_dataset_from_stream, initialize_tokenizer
from tensorrt_llm.bench.utils import VALID_QUANT_ALGOS
from tensorrt_llm.builder import BuildConfig
-from tensorrt_llm.llmapi import LLM
+from tensorrt_llm._tensorrt_engine import LLM
from tensorrt_llm.llmapi.llm_utils import QuantConfig
from tensorrt_llm.logger import logger
from tensorrt_llm.quantization.mode import QuantAlgo