From 952e012cd12d00bbeff4d8ef101bdefc07a30f43 Mon Sep 17 00:00:00 2001 From: Superjomn <328693+Superjomn@users.noreply.github.com> Date: Wed, 18 Jun 2025 07:49:09 +0000 Subject: [PATCH 1/2] make PyT default Signed-off-by: Superjomn <328693+Superjomn@users.noreply.github.com> --- examples/apps/chat.py | 3 ++- examples/apps/fastapi_server.py | 3 ++- examples/auto_deploy/build_and_run_ad.py | 3 ++- examples/llm-api/llm_auto_parallel.py | 3 ++- examples/llm-api/llm_eagle2_decoding.py | 4 ++-- examples/llm-api/llm_eagle_decoding.py | 6 +++--- examples/llm-api/llm_guided_decoding.py | 3 ++- examples/llm-api/llm_inference.py | 3 ++- examples/llm-api/llm_inference_async.py | 3 ++- examples/llm-api/llm_inference_async_streaming.py | 3 ++- examples/llm-api/llm_inference_customize.py | 3 ++- examples/llm-api/llm_inference_distributed.py | 3 ++- examples/llm-api/llm_inference_kv_events.py | 3 ++- examples/llm-api/llm_logits_processor.py | 2 +- examples/llm-api/llm_lookahead_decoding.py | 4 ++-- examples/llm-api/llm_medusa_decoding.py | 4 ++-- examples/llm-api/llm_multilora.py | 3 ++- examples/llm-api/llm_quantization.py | 3 ++- examples/llm-api/quickstart_example.py | 3 ++- .../llm-eval/lm-eval-harness/lm_eval_tensorrt_llm.py | 4 ++-- tensorrt_llm/__init__.py | 3 +++ tensorrt_llm/_tensorrt_engine/__init__.py | 3 +++ tensorrt_llm/_torch/auto_deploy/shim/demollm.py | 3 ++- tensorrt_llm/_torch/llm.py | 12 +++++++++++- tensorrt_llm/bench/benchmark/low_latency.py | 3 ++- tensorrt_llm/bench/benchmark/throughput.py | 5 +++-- tensorrt_llm/bench/benchmark/utils/asynchronous.py | 3 ++- tensorrt_llm/commands/eval.py | 5 +++-- tensorrt_llm/commands/serve.py | 5 +++-- tensorrt_llm/evaluate/cnn_dailymail.py | 5 +++-- tensorrt_llm/evaluate/json_mode_eval.py | 5 +++-- tensorrt_llm/evaluate/lm_eval.py | 5 +++-- tensorrt_llm/evaluate/mmlu.py | 5 +++-- tensorrt_llm/llmapi/__init__.py | 4 +--- tensorrt_llm/llmapi/llm.py | 10 ++++------ tensorrt_llm/scaffolding/worker.py | 2 +- tensorrt_llm/serve/openai_server.py | 2 +- tests/integration/defs/accuracy/accuracy_core.py | 5 +++-- tests/integration/defs/accuracy/test_llm_api.py | 3 ++- .../defs/examples/run_llm_fp8_quant_llama_70b.py | 3 ++- .../defs/examples/run_llm_quickstart_atexit.py | 3 ++- tests/integration/defs/llmapi/_run_llmapi_llm.py | 3 ++- tests/integration/defs/llmapi/test_llm_e2e.py | 2 +- .../unit/singlegpu/shim/test_llm_config.py | 2 +- tests/unittest/api_stability/api_stability_core.py | 3 ++- tests/unittest/api_stability/test_llm_api.py | 3 ++- tests/unittest/llmapi/apps/_test_openai_metrics.py | 2 +- .../unittest/llmapi/apps/_test_openai_multi_chat.py | 2 +- tests/unittest/llmapi/run_llm.py | 3 ++- tests/unittest/llmapi/run_llm_with_postproc.py | 3 ++- tests/unittest/llmapi/test_executor.py | 3 ++- tests/unittest/llmapi/test_llm.py | 3 ++- tests/unittest/llmapi/test_llm_args.py | 2 +- tests/unittest/llmapi/test_llm_download.py | 2 +- tests/unittest/llmapi/test_llm_kv_cache_events.py | 3 ++- tests/unittest/llmapi/test_llm_multi_gpu.py | 3 ++- tests/unittest/llmapi/test_llm_quant.py | 3 ++- .../all_models/llmapi/tensorrt_llm/1/model.py | 4 ++-- 58 files changed, 128 insertions(+), 78 deletions(-) create mode 100644 tensorrt_llm/_tensorrt_engine/__init__.py diff --git a/examples/apps/chat.py b/examples/apps/chat.py index 855443f6f7e..620a3e95b77 100755 --- a/examples/apps/chat.py +++ b/examples/apps/chat.py @@ -5,7 +5,8 @@ import colorama from transformers import AutoTokenizer, PreTrainedTokenizer -from tensorrt_llm.llmapi import LLM, BuildConfig, 
KvCacheConfig, SamplingParams +from tensorrt_llm._tensorrt_engine import LLM +from tensorrt_llm.llmapi import BuildConfig, KvCacheConfig, SamplingParams class LlmConsole(code.InteractiveConsole): diff --git a/examples/apps/fastapi_server.py b/examples/apps/fastapi_server.py index 7f448d4685c..b2aa0baf2ab 100755 --- a/examples/apps/fastapi_server.py +++ b/examples/apps/fastapi_server.py @@ -18,8 +18,9 @@ from fastapi import FastAPI, Request from fastapi.responses import JSONResponse, Response, StreamingResponse +from tensorrt_llm._tensorrt_engine import LLM from tensorrt_llm.executor import CppExecutorError, RequestError -from tensorrt_llm.llmapi import LLM, BuildConfig, KvCacheConfig, SamplingParams +from tensorrt_llm.llmapi import BuildConfig, KvCacheConfig, SamplingParams TIMEOUT_KEEP_ALIVE = 5 # seconds. diff --git a/examples/auto_deploy/build_and_run_ad.py b/examples/auto_deploy/build_and_run_ad.py index 882681d2bf9..e257aec5015 100644 --- a/examples/auto_deploy/build_and_run_ad.py +++ b/examples/auto_deploy/build_and_run_ad.py @@ -7,11 +7,12 @@ import torch from simple_config import SimpleConfig +from tensorrt_llm._tensorrt_engine import LLM from tensorrt_llm._torch.auto_deploy.models import ModelFactoryRegistry from tensorrt_llm._torch.auto_deploy.shim import DemoLLM from tensorrt_llm._torch.auto_deploy.utils.benchmark import benchmark, store_benchmark_results from tensorrt_llm._torch.auto_deploy.utils.logger import ad_logger -from tensorrt_llm.llmapi.llm import LLM, RequestOutput +from tensorrt_llm.llmapi.llm import RequestOutput from tensorrt_llm.llmapi.llm_args import TorchCompileConfig from tensorrt_llm.sampling_params import SamplingParams diff --git a/examples/llm-api/llm_auto_parallel.py b/examples/llm-api/llm_auto_parallel.py index 5d1ce835127..be496b9bbe9 100644 --- a/examples/llm-api/llm_auto_parallel.py +++ b/examples/llm-api/llm_auto_parallel.py @@ -1,5 +1,6 @@ ### Automatic Parallelism with LLM -from tensorrt_llm import LLM, SamplingParams +from tensorrt_llm import SamplingParams +from tensorrt_llm._tensorrt_engine import LLM def main(): diff --git a/examples/llm-api/llm_eagle2_decoding.py b/examples/llm-api/llm_eagle2_decoding.py index 2e53a9b88fc..6a84d927092 100755 --- a/examples/llm-api/llm_eagle2_decoding.py +++ b/examples/llm-api/llm_eagle2_decoding.py @@ -1,7 +1,7 @@ ### Generate Text Using Eagle2 Decoding -from tensorrt_llm import LLM, SamplingParams -from tensorrt_llm.llmapi import (LLM, EagleDecodingConfig, KvCacheConfig, +from tensorrt_llm._tensorrt_engine import LLM +from tensorrt_llm.llmapi import (EagleDecodingConfig, KvCacheConfig, SamplingParams) diff --git a/examples/llm-api/llm_eagle_decoding.py b/examples/llm-api/llm_eagle_decoding.py index 80e3d8f7203..87a113411a0 100644 --- a/examples/llm-api/llm_eagle_decoding.py +++ b/examples/llm-api/llm_eagle_decoding.py @@ -1,8 +1,8 @@ ### Generate Text Using Eagle Decoding -from tensorrt_llm import LLM, SamplingParams -from tensorrt_llm.llmapi import (LLM, EagleDecodingConfig, KvCacheConfig, - SamplingParams) +from tensorrt_llm import SamplingParams +from tensorrt_llm._tensorrt_engine import LLM +from tensorrt_llm.llmapi import EagleDecodingConfig, KvCacheConfig def main(): diff --git a/examples/llm-api/llm_guided_decoding.py b/examples/llm-api/llm_guided_decoding.py index 6719a3d1e4d..1138c63f969 100644 --- a/examples/llm-api/llm_guided_decoding.py +++ b/examples/llm-api/llm_guided_decoding.py @@ -1,5 +1,6 @@ ### Generate text with guided decoding -from tensorrt_llm import LLM, SamplingParams +from 
tensorrt_llm import SamplingParams +from tensorrt_llm._tensorrt_engine import LLM from tensorrt_llm.llmapi import GuidedDecodingParams diff --git a/examples/llm-api/llm_inference.py b/examples/llm-api/llm_inference.py index 7eb71256406..a5335e7a375 100644 --- a/examples/llm-api/llm_inference.py +++ b/examples/llm-api/llm_inference.py @@ -1,7 +1,8 @@ ### Generate text import tempfile -from tensorrt_llm import LLM, SamplingParams +from tensorrt_llm import SamplingParams +from tensorrt_llm._tensorrt_engine import LLM def main(): diff --git a/examples/llm-api/llm_inference_async.py b/examples/llm-api/llm_inference_async.py index f025d3a1553..f7c85189ada 100644 --- a/examples/llm-api/llm_inference_async.py +++ b/examples/llm-api/llm_inference_async.py @@ -1,7 +1,8 @@ ### Generate Text Asynchronously import asyncio -from tensorrt_llm import LLM, SamplingParams +from tensorrt_llm import SamplingParams +from tensorrt_llm._tensorrt_engine import LLM def main(): diff --git a/examples/llm-api/llm_inference_async_streaming.py b/examples/llm-api/llm_inference_async_streaming.py index 00de565dbbd..c05a231b26d 100644 --- a/examples/llm-api/llm_inference_async_streaming.py +++ b/examples/llm-api/llm_inference_async_streaming.py @@ -1,7 +1,8 @@ ### Generate Text in Streaming import asyncio -from tensorrt_llm import LLM, SamplingParams +from tensorrt_llm import SamplingParams +from tensorrt_llm._tensorrt_engine import LLM def main(): diff --git a/examples/llm-api/llm_inference_customize.py b/examples/llm-api/llm_inference_customize.py index d6b17e51ea1..c18dcc4de14 100644 --- a/examples/llm-api/llm_inference_customize.py +++ b/examples/llm-api/llm_inference_customize.py @@ -1,7 +1,8 @@ ### Generate text with customization import tempfile -from tensorrt_llm.llmapi import LLM, BuildConfig, KvCacheConfig, SamplingParams +from tensorrt_llm._tensorrt_engine import LLM +from tensorrt_llm.llmapi import BuildConfig, KvCacheConfig, SamplingParams def main(): diff --git a/examples/llm-api/llm_inference_distributed.py b/examples/llm-api/llm_inference_distributed.py index 5ad9c98117c..cfdccf774ec 100644 --- a/examples/llm-api/llm_inference_distributed.py +++ b/examples/llm-api/llm_inference_distributed.py @@ -1,5 +1,6 @@ ### Distributed LLM Generation -from tensorrt_llm import LLM, SamplingParams +from tensorrt_llm import SamplingParams +from tensorrt_llm._tensorrt_engine import LLM def main(): diff --git a/examples/llm-api/llm_inference_kv_events.py b/examples/llm-api/llm_inference_kv_events.py index 009b7e0dee7..ff8d36e18cc 100644 --- a/examples/llm-api/llm_inference_kv_events.py +++ b/examples/llm-api/llm_inference_kv_events.py @@ -1,6 +1,7 @@ ### Get KV Cache Events -from tensorrt_llm import LLM, SamplingParams +from tensorrt_llm import SamplingParams +from tensorrt_llm._tensorrt_engine import LLM from tensorrt_llm.llmapi import KvCacheConfig diff --git a/examples/llm-api/llm_logits_processor.py b/examples/llm-api/llm_logits_processor.py index 516cd507107..96c81802b19 100644 --- a/examples/llm-api/llm_logits_processor.py +++ b/examples/llm-api/llm_logits_processor.py @@ -3,7 +3,7 @@ import torch -from tensorrt_llm import LLM +from tensorrt_llm._tensorrt_engine import LLM from tensorrt_llm.sampling_params import (BatchedLogitsProcessor, LogitsProcessor, SamplingParams) diff --git a/examples/llm-api/llm_lookahead_decoding.py b/examples/llm-api/llm_lookahead_decoding.py index cb5c7bb75de..ed2c94450dd 100644 --- a/examples/llm-api/llm_lookahead_decoding.py +++ b/examples/llm-api/llm_lookahead_decoding.py @@ -1,6 +1,6 
@@ ### Generate Text Using Lookahead Decoding -from tensorrt_llm import LLM, SamplingParams -from tensorrt_llm.llmapi import (LLM, BuildConfig, KvCacheConfig, +from tensorrt_llm._tensorrt_engine import LLM +from tensorrt_llm.llmapi import (BuildConfig, KvCacheConfig, LookaheadDecodingConfig, SamplingParams) diff --git a/examples/llm-api/llm_medusa_decoding.py b/examples/llm-api/llm_medusa_decoding.py index 6b4ba46434b..e7594613ec4 100644 --- a/examples/llm-api/llm_medusa_decoding.py +++ b/examples/llm-api/llm_medusa_decoding.py @@ -2,8 +2,8 @@ import argparse from pathlib import Path -from tensorrt_llm import LLM, SamplingParams -from tensorrt_llm.llmapi import (LLM, BuildConfig, KvCacheConfig, +from tensorrt_llm._tensorrt_engine import LLM +from tensorrt_llm.llmapi import (BuildConfig, KvCacheConfig, MedusaDecodingConfig, SamplingParams) diff --git a/examples/llm-api/llm_multilora.py b/examples/llm-api/llm_multilora.py index e3cfe0fe142..00eed1fe028 100644 --- a/examples/llm-api/llm_multilora.py +++ b/examples/llm-api/llm_multilora.py @@ -1,8 +1,9 @@ ### Generate text with multiple LoRA adapters from huggingface_hub import snapshot_download -from tensorrt_llm import LLM, BuildConfig +from tensorrt_llm._tensorrt_engine import LLM from tensorrt_llm.executor import LoRARequest +from tensorrt_llm.llmapi import BuildConfig from tensorrt_llm.lora_manager import LoraConfig diff --git a/examples/llm-api/llm_quantization.py b/examples/llm-api/llm_quantization.py index 99dbe8cd60a..24bcfa60e07 100644 --- a/examples/llm-api/llm_quantization.py +++ b/examples/llm-api/llm_quantization.py @@ -3,7 +3,8 @@ import torch -from tensorrt_llm import LLM, SamplingParams +from tensorrt_llm import SamplingParams +from tensorrt_llm._tensorrt_engine import LLM from tensorrt_llm.llmapi import CalibConfig, QuantAlgo, QuantConfig major, minor = torch.cuda.get_device_capability() diff --git a/examples/llm-api/quickstart_example.py b/examples/llm-api/quickstart_example.py index 8dff6a47049..4c45eec83ed 100644 --- a/examples/llm-api/quickstart_example.py +++ b/examples/llm-api/quickstart_example.py @@ -1,4 +1,5 @@ -from tensorrt_llm import LLM, SamplingParams +from tensorrt_llm import SamplingParams +from tensorrt_llm._tensorrt_engine import LLM def main(): diff --git a/examples/llm-eval/lm-eval-harness/lm_eval_tensorrt_llm.py b/examples/llm-eval/lm-eval-harness/lm_eval_tensorrt_llm.py index 715bdd116f0..d593f3380f2 100644 --- a/examples/llm-eval/lm-eval-harness/lm_eval_tensorrt_llm.py +++ b/examples/llm-eval/lm-eval-harness/lm_eval_tensorrt_llm.py @@ -32,12 +32,12 @@ from tqdm import tqdm import tensorrt_llm -from tensorrt_llm._torch import LLM as TORCH_LLM +from tensorrt_llm import LLM as TORCH_LLM +from tensorrt_llm._tensorrt_engine import LLM as TRT_LLM from tensorrt_llm._torch.pyexecutor.config import PyTorchConfig from tensorrt_llm.bindings.executor import DecodingConfig from tensorrt_llm.llmapi import KvCacheConfig as TRT_KvCacheConfig from tensorrt_llm.llmapi import RequestOutput, SamplingParams -from tensorrt_llm.llmapi.llm import LLM as TRT_LLM logger = logging.getLogger(__name__) diff --git a/tensorrt_llm/__init__.py b/tensorrt_llm/__init__.py index 9c59d5bee25..01589ebc7e3 100644 --- a/tensorrt_llm/__init__.py +++ b/tensorrt_llm/__init__.py @@ -46,6 +46,7 @@ def _add_trt_llm_dll_directory(): from .disaggregated_params import DisaggregatedParams from .functional import Tensor, constant from .llmapi import LLM, LlmArgs +from .llmapi.llm_args import LlmArgs, TorchLlmArgs, TrtLlmArgs from .logger import logger 
from .mapping import Mapping from .models.automodel import AutoConfig, AutoModelForCausalLM @@ -98,6 +99,8 @@ def _add_trt_llm_dll_directory(): 'tools', 'LLM', 'LlmArgs', + 'TorchLlmArgs', + 'TrtLlmArgs', 'SamplingParams', 'DisaggregatedParams', 'KvCacheConfig', diff --git a/tensorrt_llm/_tensorrt_engine/__init__.py b/tensorrt_llm/_tensorrt_engine/__init__.py new file mode 100644 index 00000000000..39669a168fd --- /dev/null +++ b/tensorrt_llm/_tensorrt_engine/__init__.py @@ -0,0 +1,3 @@ +from tensorrt_llm.llmapi.llm import _TrtLLM as LLM + +__all__ = ['LLM'] diff --git a/tensorrt_llm/_torch/auto_deploy/shim/demollm.py b/tensorrt_llm/_torch/auto_deploy/shim/demollm.py index 791b06761b7..13c9a7374b0 100644 --- a/tensorrt_llm/_torch/auto_deploy/shim/demollm.py +++ b/tensorrt_llm/_torch/auto_deploy/shim/demollm.py @@ -10,11 +10,12 @@ import torch.multiprocessing as mp from transformers import PreTrainedTokenizerBase +from ...._tensorrt_engine import LLM from ....executor import GenerationExecutor from ....executor.request import GenerationRequest from ....executor.result import CompletionOutput, GenerationResult from ....inputs.registry import create_input_processor -from ....llmapi.llm import LLM, RequestOutput +from ....llmapi.llm import RequestOutput from ....llmapi.llm_args import _AutoDeployLlmArgs from ....llmapi.tokenizer import TokenizerBase from ....sampling_params import SamplingParams diff --git a/tensorrt_llm/_torch/llm.py b/tensorrt_llm/_torch/llm.py index 61b4d55b0b2..2fb41570b34 100644 --- a/tensorrt_llm/_torch/llm.py +++ b/tensorrt_llm/_torch/llm.py @@ -1,3 +1,13 @@ -from tensorrt_llm.llmapi.llm import _TorchLLM as LLM +from tensorrt_llm.llmapi.llm import _TorchLLM + +class LLM(_TorchLLM): + + def __init__(self, *args, **kwargs): + raise ImportError( + "_torch.llm is deprecated, please use `from tensorrt_llm import LLM` directly" + ) + + +# Keep the LLM class to guide the users to use the default LLM class __all__ = ['LLM'] diff --git a/tensorrt_llm/bench/benchmark/low_latency.py b/tensorrt_llm/bench/benchmark/low_latency.py index f9299a7de36..490ac62f4f5 100644 --- a/tensorrt_llm/bench/benchmark/low_latency.py +++ b/tensorrt_llm/bench/benchmark/low_latency.py @@ -10,13 +10,14 @@ from click_option_group import (MutuallyExclusiveOptionGroup, OptionGroup, optgroup) +from tensorrt_llm._tensorrt_engine import LLM from tensorrt_llm.bench.benchmark.utils.asynchronous import async_benchmark from tensorrt_llm.bench.benchmark.utils.general import generate_warmup_dataset from tensorrt_llm.bench.benchmark.utils.processes import IterationWriter from tensorrt_llm.bench.dataclasses.configuration import RuntimeConfig from tensorrt_llm.bench.dataclasses.general import BenchmarkEnvironment from tensorrt_llm.bench.dataclasses.reporting import ReportUtility -from tensorrt_llm.llmapi import LLM, CapacitySchedulerPolicy +from tensorrt_llm.llmapi import CapacitySchedulerPolicy from tensorrt_llm.models.modeling_utils import SpeculativeDecodingMode # isort: off diff --git a/tensorrt_llm/bench/benchmark/throughput.py b/tensorrt_llm/bench/benchmark/throughput.py index e63aa25662a..fd9ad5016e0 100755 --- a/tensorrt_llm/bench/benchmark/throughput.py +++ b/tensorrt_llm/bench/benchmark/throughput.py @@ -17,7 +17,8 @@ from tensorrt_llm.bench.benchmark.utils.general import ( get_settings_from_engine, get_settings) # isort: on -from tensorrt_llm._torch.llm import LLM as PyTorchLLM +from tensorrt_llm import LLM as PyTorchLLM +from tensorrt_llm._tensorrt_engine import LLM from 
tensorrt_llm.bench.benchmark.utils.general import generate_warmup_dataset from tensorrt_llm.bench.dataclasses.configuration import RuntimeConfig from tensorrt_llm.bench.dataclasses.general import BenchmarkEnvironment @@ -25,7 +26,7 @@ from tensorrt_llm.bench.utils.data import (create_dataset_from_stream, initialize_tokenizer, update_metadata_for_multimodal) -from tensorrt_llm.llmapi import LLM, CapacitySchedulerPolicy +from tensorrt_llm.llmapi import CapacitySchedulerPolicy from tensorrt_llm.logger import logger from tensorrt_llm.sampling_params import SamplingParams diff --git a/tensorrt_llm/bench/benchmark/utils/asynchronous.py b/tensorrt_llm/bench/benchmark/utils/asynchronous.py index af6ae18ab5a..99fd06e4f39 100644 --- a/tensorrt_llm/bench/benchmark/utils/asynchronous.py +++ b/tensorrt_llm/bench/benchmark/utils/asynchronous.py @@ -9,7 +9,8 @@ from zmq import PUSH from zmq.asyncio import Context -from tensorrt_llm import LLM, SamplingParams +from tensorrt_llm import SamplingParams +from tensorrt_llm._tensorrt_engine import LLM from tensorrt_llm.bench.dataclasses.general import InferenceRequest from tensorrt_llm.bench.dataclasses.reporting import PerfItemTuple, StatsKeeper from tensorrt_llm.executor.postproc_worker import PostprocParams diff --git a/tensorrt_llm/commands/eval.py b/tensorrt_llm/commands/eval.py index 0c7e6956c78..eff80d1a69d 100644 --- a/tensorrt_llm/commands/eval.py +++ b/tensorrt_llm/commands/eval.py @@ -18,10 +18,11 @@ import tensorrt_llm.profiler as profiler -from .._torch.llm import LLM as PyTorchLLM +from .. import LLM as PyTorchLLM +from .._tensorrt_engine import LLM from ..evaluate import (GSM8K, MMLU, CnnDailymail, GPQADiamond, GPQAExtended, GPQAMain, JsonModeEval) -from ..llmapi import LLM, BuildConfig, KvCacheConfig +from ..llmapi import BuildConfig, KvCacheConfig from ..llmapi.llm_utils import update_llm_args_with_extra_options from ..logger import logger, severity_map diff --git a/tensorrt_llm/commands/serve.py b/tensorrt_llm/commands/serve.py index 6a970f93941..ddbcba2a115 100644 --- a/tensorrt_llm/commands/serve.py +++ b/tensorrt_llm/commands/serve.py @@ -11,10 +11,11 @@ from strenum import StrEnum from torch.cuda import device_count -from tensorrt_llm._torch.llm import LLM as PyTorchLLM +from tensorrt_llm import LLM as PyTorchLLM +from tensorrt_llm._tensorrt_engine import LLM from tensorrt_llm._utils import mpi_rank from tensorrt_llm.executor.utils import LlmLauncherEnvs -from tensorrt_llm.llmapi import (LLM, BuildConfig, CapacitySchedulerPolicy, +from tensorrt_llm.llmapi import (BuildConfig, CapacitySchedulerPolicy, DynamicBatchConfig, KvCacheConfig, SchedulerConfig) from tensorrt_llm.llmapi.disagg_utils import (CtxGenServerConfig, diff --git a/tensorrt_llm/evaluate/cnn_dailymail.py b/tensorrt_llm/evaluate/cnn_dailymail.py index e2dfe3056f1..a5bb14eadaa 100644 --- a/tensorrt_llm/evaluate/cnn_dailymail.py +++ b/tensorrt_llm/evaluate/cnn_dailymail.py @@ -18,8 +18,9 @@ import datasets import evaluate -from .._torch import LLM as PyTorchLLM -from ..llmapi import LLM, RequestOutput +from .. 
import LLM as PyTorchLLM +from .._tensorrt_engine import LLM +from ..llmapi import RequestOutput from ..logger import logger from ..sampling_params import SamplingParams from .interface import Evaluator diff --git a/tensorrt_llm/evaluate/json_mode_eval.py b/tensorrt_llm/evaluate/json_mode_eval.py index cd3b8a586ec..69c41699cd1 100644 --- a/tensorrt_llm/evaluate/json_mode_eval.py +++ b/tensorrt_llm/evaluate/json_mode_eval.py @@ -19,8 +19,9 @@ import datasets import numpy as np -from .._torch import LLM as PyTorchLLM -from ..llmapi import LLM, RequestOutput +from .. import LLM as PyTorchLLM +from .._tensorrt_engine import LLM +from ..llmapi import RequestOutput from ..logger import logger from ..sampling_params import GuidedDecodingParams, SamplingParams from .interface import Evaluator diff --git a/tensorrt_llm/evaluate/lm_eval.py b/tensorrt_llm/evaluate/lm_eval.py index 3ec1be52399..bdddbcbb736 100644 --- a/tensorrt_llm/evaluate/lm_eval.py +++ b/tensorrt_llm/evaluate/lm_eval.py @@ -28,8 +28,9 @@ except ImportError: TemplateLM = object -from .._torch import LLM as PyTorchLLM -from ..llmapi import LLM, RequestOutput +from .. import LLM as PyTorchLLM +from .._tensorrt_engine import LLM +from ..llmapi import RequestOutput from ..logger import logger from ..sampling_params import SamplingParams from .interface import Evaluator diff --git a/tensorrt_llm/evaluate/mmlu.py b/tensorrt_llm/evaluate/mmlu.py index 0e6df91df09..92d7ae1171a 100644 --- a/tensorrt_llm/evaluate/mmlu.py +++ b/tensorrt_llm/evaluate/mmlu.py @@ -40,8 +40,9 @@ import numpy as np import pandas as pd -from .._torch import LLM as PyTorchLLM -from ..llmapi import LLM, RequestOutput +from .. import LLM as PyTorchLLM +from .._tensorrt_engine import LLM +from ..llmapi import RequestOutput from ..logger import logger from ..sampling_params import SamplingParams from .interface import Evaluator diff --git a/tensorrt_llm/llmapi/__init__.py b/tensorrt_llm/llmapi/__init__.py index eb906e97668..2fe491db0ab 100644 --- a/tensorrt_llm/llmapi/__init__.py +++ b/tensorrt_llm/llmapi/__init__.py @@ -2,7 +2,7 @@ from ..executor import CompletionOutput, RequestError from ..sampling_params import GuidedDecodingParams, SamplingParams from .build_cache import BuildCacheConfig -from .llm import LLM, RequestOutput, _TorchLLM, _TrtLLM +from .llm import LLM, RequestOutput # yapf: disable from .llm_args import (BatchingType, CacheTransceiverConfig, CalibConfig, CapacitySchedulerPolicy, ContextChunkingPolicy, @@ -50,6 +50,4 @@ 'LlmArgs', 'TorchLlmArgs', 'TrtLlmArgs', - '_TrtLLM', - '_TorchLLM', ] diff --git a/tensorrt_llm/llmapi/llm.py b/tensorrt_llm/llmapi/llm.py index 5a8c68643ee..a1f1634f1b5 100644 --- a/tensorrt_llm/llmapi/llm.py +++ b/tensorrt_llm/llmapi/llm.py @@ -896,7 +896,7 @@ def __init__(self, **kwargs) -class LLM(_TrtLLM): +class LLM(_TorchLLM): def __init__(self, model: Union[str, Path], @@ -915,15 +915,13 @@ def __init__(self, revision, tokenizer_revision, **kwargs) -_LLM_REPR = "TrtLLM" +_LLM_REPR = "TorchLLM" # sphinx will ignore the LLM's docstring if it is not explicitly set LLM.__doc__ = \ f"""LLM class is the main class for running a LLM model. - This class is an alias of {_LLM_REPR}. You can switch between the TensorRT backend - and the PyTorch backend by setting the TLLM_USE_TRT_ENGINE environment to 1 or 0. - The default backend is the TensorRT backend. + This class is an alias of {_LLM_REPR}. 
Parameters: -""" + TRT_LLM_DOCSTRING +""" + TORCH_LLM_DOCSTRING diff --git a/tensorrt_llm/scaffolding/worker.py b/tensorrt_llm/scaffolding/worker.py index d9a2cd2086d..69086392648 100644 --- a/tensorrt_llm/scaffolding/worker.py +++ b/tensorrt_llm/scaffolding/worker.py @@ -4,8 +4,8 @@ import openai from transformers import AutoTokenizer +from tensorrt_llm._tensorrt_engine import LLM from tensorrt_llm.executor import GenerationExecutor -from tensorrt_llm.llmapi.llm import LLM from tensorrt_llm.llmapi.llm_args import KvCacheConfig from tensorrt_llm.sampling_params import SamplingParams diff --git a/tensorrt_llm/serve/openai_server.py b/tensorrt_llm/serve/openai_server.py index 9223c9ddd7b..edc5b5f6f62 100644 --- a/tensorrt_llm/serve/openai_server.py +++ b/tensorrt_llm/serve/openai_server.py @@ -14,12 +14,12 @@ from fastapi.responses import JSONResponse, Response, StreamingResponse from transformers import AutoConfig, AutoProcessor +from tensorrt_llm._tensorrt_engine import LLM # yapf: disable from tensorrt_llm.executor import CppExecutorError from tensorrt_llm.executor.postproc_worker import PostprocParams from tensorrt_llm.inputs import prompt_inputs from tensorrt_llm.inputs.utils import ConversationMessage, apply_chat_template -from tensorrt_llm.llmapi import LLM from tensorrt_llm.llmapi import DisaggregatedParams as LlmDisaggregatedParams from tensorrt_llm.llmapi.disagg_utils import MetadataServerConfig, ServerRole from tensorrt_llm.llmapi.llm import RequestOutput diff --git a/tests/integration/defs/accuracy/accuracy_core.py b/tests/integration/defs/accuracy/accuracy_core.py index 93f5b2198df..811bb80a109 100644 --- a/tests/integration/defs/accuracy/accuracy_core.py +++ b/tests/integration/defs/accuracy/accuracy_core.py @@ -23,10 +23,11 @@ import yaml import tensorrt_llm.evaluate -from tensorrt_llm._torch import LLM as PyTorchLLM +from tensorrt_llm import LLM as PyTorchLLM +from tensorrt_llm._tensorrt_engine import LLM from tensorrt_llm._torch.speculative import SpecConfig from tensorrt_llm.builder import BuildConfig -from tensorrt_llm.llmapi import LLM, SamplingParams +from tensorrt_llm.llmapi import SamplingParams from tensorrt_llm.llmapi.llm_args import DecodingBaseConfig from tensorrt_llm.logger import logger from tensorrt_llm.models.modeling_utils import QuantConfig diff --git a/tests/integration/defs/accuracy/test_llm_api.py b/tests/integration/defs/accuracy/test_llm_api.py index 7b2978d8096..8cdb49cf561 100644 --- a/tests/integration/defs/accuracy/test_llm_api.py +++ b/tests/integration/defs/accuracy/test_llm_api.py @@ -14,7 +14,8 @@ # limitations under the License. 
import pytest -from tensorrt_llm.llmapi import LLM, EagleDecodingConfig, KvCacheConfig +from tensorrt_llm._tensorrt_engine import LLM +from tensorrt_llm.llmapi import EagleDecodingConfig, KvCacheConfig from tensorrt_llm.models.modeling_utils import QuantConfig from tensorrt_llm.quantization import QuantAlgo diff --git a/tests/integration/defs/examples/run_llm_fp8_quant_llama_70b.py b/tests/integration/defs/examples/run_llm_fp8_quant_llama_70b.py index 5ad05ae5547..fffe95a4a46 100644 --- a/tests/integration/defs/examples/run_llm_fp8_quant_llama_70b.py +++ b/tests/integration/defs/examples/run_llm_fp8_quant_llama_70b.py @@ -1,7 +1,8 @@ import os from pathlib import Path -from tensorrt_llm import LLM, SamplingParams +from tensorrt_llm import SamplingParams +from tensorrt_llm._tensorrt_engine import LLM from tensorrt_llm.llmapi import QuantAlgo, QuantConfig prompts = [ diff --git a/tests/integration/defs/examples/run_llm_quickstart_atexit.py b/tests/integration/defs/examples/run_llm_quickstart_atexit.py index 6738d9396ee..7f2bc2f9225 100644 --- a/tests/integration/defs/examples/run_llm_quickstart_atexit.py +++ b/tests/integration/defs/examples/run_llm_quickstart_atexit.py @@ -1,7 +1,8 @@ import os from pathlib import Path -from tensorrt_llm import LLM, SamplingParams +from tensorrt_llm import SamplingParams +from tensorrt_llm._tensorrt_engine import LLM if __name__ == '__main__': prompts = [ diff --git a/tests/integration/defs/llmapi/_run_llmapi_llm.py b/tests/integration/defs/llmapi/_run_llmapi_llm.py index f8d4ae5c8aa..854af24efa7 100644 --- a/tests/integration/defs/llmapi/_run_llmapi_llm.py +++ b/tests/integration/defs/llmapi/_run_llmapi_llm.py @@ -3,7 +3,8 @@ import click -from tensorrt_llm.llmapi import LLM, BuildConfig, SamplingParams +from tensorrt_llm._tensorrt_engine import LLM +from tensorrt_llm.llmapi import BuildConfig, SamplingParams @click.command() diff --git a/tests/integration/defs/llmapi/test_llm_e2e.py b/tests/integration/defs/llmapi/test_llm_e2e.py index 30d5b71fab4..778b870f4f6 100644 --- a/tests/integration/defs/llmapi/test_llm_e2e.py +++ b/tests/integration/defs/llmapi/test_llm_e2e.py @@ -23,7 +23,7 @@ from defs.conftest import llm_models_root, unittest_path from defs.trt_test_alternative import check_call -from tensorrt_llm.llmapi import LLM +from tensorrt_llm._tensorrt_engine import LLM from tensorrt_llm.llmapi.llm_utils import BuildConfig diff --git a/tests/unittest/_torch/auto_deploy/unit/singlegpu/shim/test_llm_config.py b/tests/unittest/_torch/auto_deploy/unit/singlegpu/shim/test_llm_config.py index 49bd2e23308..a256d1f57a5 100644 --- a/tests/unittest/_torch/auto_deploy/unit/singlegpu/shim/test_llm_config.py +++ b/tests/unittest/_torch/auto_deploy/unit/singlegpu/shim/test_llm_config.py @@ -2,9 +2,9 @@ import pytest +from tensorrt_llm._tensorrt_engine import LLM from tensorrt_llm._torch.auto_deploy.shim.demollm import DemoLLM from tensorrt_llm._torch.auto_deploy.transformations.transform import InferenceOptimizer -from tensorrt_llm.llmapi.llm import LLM from tensorrt_llm.llmapi.llm_args import TorchCompileConfig, _AutoDeployLlmArgs # ================================ diff --git a/tests/unittest/api_stability/api_stability_core.py b/tests/unittest/api_stability/api_stability_core.py index 62b0d06400f..e0cde0e4af4 100644 --- a/tests/unittest/api_stability/api_stability_core.py +++ b/tests/unittest/api_stability/api_stability_core.py @@ -17,9 +17,10 @@ from pydantic import BaseModel import tensorrt_llm +from tensorrt_llm._tensorrt_engine import LLM from 
tensorrt_llm.executor import GenerationResult from tensorrt_llm.executor.result import TokenLogprobs -from tensorrt_llm.llmapi import (LLM, CalibConfig, CompletionOutput, +from tensorrt_llm.llmapi import (CalibConfig, CompletionOutput, GuidedDecodingParams, QuantConfig, RequestOutput, SamplingParams) from tensorrt_llm.llmapi.llm_utils import LlmArgs diff --git a/tests/unittest/api_stability/test_llm_api.py b/tests/unittest/api_stability/test_llm_api.py index 014bd8f13ee..1b8fc1dddf0 100644 --- a/tests/unittest/api_stability/test_llm_api.py +++ b/tests/unittest/api_stability/test_llm_api.py @@ -5,8 +5,9 @@ from api_stability_core import (ApiStabilityTestHarness, ClassSnapshot, MethodSnapshot) +from tensorrt_llm._tensorrt_engine import LLM from tensorrt_llm.bindings import executor as tllme -from tensorrt_llm.llmapi import (LLM, CalibConfig, CompletionOutput, +from tensorrt_llm.llmapi import (CalibConfig, CompletionOutput, GuidedDecodingParams, QuantConfig, RequestOutput) from tensorrt_llm.sampling_params import (BatchedLogitsProcessor, diff --git a/tests/unittest/llmapi/apps/_test_openai_metrics.py b/tests/unittest/llmapi/apps/_test_openai_metrics.py index 1b075b67565..9d207ae4e9a 100755 --- a/tests/unittest/llmapi/apps/_test_openai_metrics.py +++ b/tests/unittest/llmapi/apps/_test_openai_metrics.py @@ -4,7 +4,7 @@ from fastapi.testclient import TestClient from transformers import AutoTokenizer -from tensorrt_llm._torch.llm import LLM as PyTorchLLM +from tensorrt_llm import LLM as PyTorchLLM from tensorrt_llm.llmapi import BuildConfig, KvCacheConfig from tensorrt_llm.serve.openai_server import OpenAIServer diff --git a/tests/unittest/llmapi/apps/_test_openai_multi_chat.py b/tests/unittest/llmapi/apps/_test_openai_multi_chat.py index c5a755687f9..9ed9a654c52 100644 --- a/tests/unittest/llmapi/apps/_test_openai_multi_chat.py +++ b/tests/unittest/llmapi/apps/_test_openai_multi_chat.py @@ -10,8 +10,8 @@ from utils.util import (skip_gpu_memory_less_than_40gb, skip_pre_ada, skip_single_gpu) +from tensorrt_llm._tensorrt_engine import LLM from tensorrt_llm.llmapi import BuildConfig -from tensorrt_llm.llmapi.llm import LLM from tensorrt_llm.llmapi.llm_utils import CalibConfig, QuantAlgo, QuantConfig from ..test_llm import get_model_path diff --git a/tests/unittest/llmapi/run_llm.py b/tests/unittest/llmapi/run_llm.py index 64efd052d46..be0fa122e60 100644 --- a/tests/unittest/llmapi/run_llm.py +++ b/tests/unittest/llmapi/run_llm.py @@ -4,7 +4,8 @@ import click -from tensorrt_llm.llmapi import LLM, KvCacheConfig, SamplingParams +from tensorrt_llm._tensorrt_engine import LLM +from tensorrt_llm.llmapi import KvCacheConfig, SamplingParams @click.command() diff --git a/tests/unittest/llmapi/run_llm_with_postproc.py b/tests/unittest/llmapi/run_llm_with_postproc.py index 0f0cdeebac8..6ee365c952b 100644 --- a/tests/unittest/llmapi/run_llm_with_postproc.py +++ b/tests/unittest/llmapi/run_llm_with_postproc.py @@ -6,9 +6,10 @@ import click +from tensorrt_llm._tensorrt_engine import LLM from tensorrt_llm.executor import GenerationResultBase from tensorrt_llm.executor.postproc_worker import PostprocArgs, PostprocParams -from tensorrt_llm.llmapi import LLM, KvCacheConfig, SamplingParams +from tensorrt_llm.llmapi import KvCacheConfig, SamplingParams from tensorrt_llm.llmapi.utils import print_colored from tensorrt_llm.serve.openai_protocol import ( ChatCompletionResponseStreamChoice, ChatCompletionStreamResponse, diff --git a/tests/unittest/llmapi/test_executor.py b/tests/unittest/llmapi/test_executor.py index 
9dc204ed95b..ecdb6d9ad25 100644 --- a/tests/unittest/llmapi/test_executor.py +++ b/tests/unittest/llmapi/test_executor.py @@ -10,6 +10,7 @@ import torch import zmq +from tensorrt_llm._tensorrt_engine import LLM from tensorrt_llm._utils import mpi_world_size from tensorrt_llm.bindings import executor as tllm from tensorrt_llm.executor import (DetokenizedGenerationResultBase, @@ -17,7 +18,7 @@ GenerationResult, GenerationResultBase, PostprocWorker) from tensorrt_llm.executor.ipc import FusedIpcQueue, ZeroMqQueue -from tensorrt_llm.llmapi import LLM, BuildConfig +from tensorrt_llm.llmapi import BuildConfig from tensorrt_llm.llmapi.tokenizer import TransformersTokenizer from tensorrt_llm.llmapi.utils import AsyncQueue from tensorrt_llm.sampling_params import SamplingParams diff --git a/tests/unittest/llmapi/test_llm.py b/tests/unittest/llmapi/test_llm.py index 003b69223a5..44aeee9f97d 100644 --- a/tests/unittest/llmapi/test_llm.py +++ b/tests/unittest/llmapi/test_llm.py @@ -25,10 +25,11 @@ import transformers from utils.util import skip_single_gpu +from tensorrt_llm._tensorrt_engine import LLM from tensorrt_llm.bindings import executor as tllm from tensorrt_llm.executor import (GenerationExecutorWorker, LoRARequest, PromptAdapterRequest, RequestError) -from tensorrt_llm.llmapi import (LLM, BuildCacheConfig, EagleDecodingConfig, +from tensorrt_llm.llmapi import (BuildCacheConfig, EagleDecodingConfig, KvCacheConfig, KvCacheRetentionConfig, LookaheadDecodingConfig, MedusaDecodingConfig, RequestOutput) diff --git a/tests/unittest/llmapi/test_llm_args.py b/tests/unittest/llmapi/test_llm_args.py index 83e3b73809c..84da13cdc4b 100644 --- a/tests/unittest/llmapi/test_llm_args.py +++ b/tests/unittest/llmapi/test_llm_args.py @@ -5,8 +5,8 @@ import yaml import tensorrt_llm.bindings.executor as tle +from tensorrt_llm._tensorrt_engine import LLM from tensorrt_llm._torch.llm import LLM as TorchLLM -from tensorrt_llm.llmapi.llm import LLM from tensorrt_llm.llmapi.llm_args import * from tensorrt_llm.llmapi.utils import print_traceback_on_error diff --git a/tests/unittest/llmapi/test_llm_download.py b/tests/unittest/llmapi/test_llm_download.py index a1701758ec4..2157919256e 100644 --- a/tests/unittest/llmapi/test_llm_download.py +++ b/tests/unittest/llmapi/test_llm_download.py @@ -1,4 +1,4 @@ -from tensorrt_llm.llmapi import LLM +from tensorrt_llm._tensorrt_engine import LLM from tensorrt_llm.llmapi.utils import (download_hf_model, download_hf_pretrained_config) diff --git a/tests/unittest/llmapi/test_llm_kv_cache_events.py b/tests/unittest/llmapi/test_llm_kv_cache_events.py index b445bd1990b..bdc09323ae3 100644 --- a/tests/unittest/llmapi/test_llm_kv_cache_events.py +++ b/tests/unittest/llmapi/test_llm_kv_cache_events.py @@ -2,10 +2,11 @@ import time import tensorrt_llm +from tensorrt_llm._tensorrt_engine import LLM from tensorrt_llm._torch.pyexecutor.llm_request import LlmRequest from tensorrt_llm._torch.pyexecutor.resource_manager import KVCacheManager from tensorrt_llm._utils import KVCacheEventSerializer -from tensorrt_llm.llmapi import LLM, KvCacheConfig +from tensorrt_llm.llmapi import KvCacheConfig from tensorrt_llm.mapping import Mapping from tensorrt_llm.sampling_params import SamplingParams diff --git a/tests/unittest/llmapi/test_llm_multi_gpu.py b/tests/unittest/llmapi/test_llm_multi_gpu.py index b6c70cb5c7b..b0a6c4bc697 100644 --- a/tests/unittest/llmapi/test_llm_multi_gpu.py +++ b/tests/unittest/llmapi/test_llm_multi_gpu.py @@ -8,8 +8,9 @@ import pytest from parameterized import parameterized 
+from tensorrt_llm._tensorrt_engine import LLM from tensorrt_llm.executor import GenerationExecutorProxy -from tensorrt_llm.llmapi import LLM, BuildConfig, KvCacheConfig, SamplingParams +from tensorrt_llm.llmapi import BuildConfig, KvCacheConfig, SamplingParams from tensorrt_llm.llmapi.tokenizer import TransformersTokenizer from tensorrt_llm.mapping import Mapping from tensorrt_llm.models import PretrainedConfig diff --git a/tests/unittest/llmapi/test_llm_quant.py b/tests/unittest/llmapi/test_llm_quant.py index 403bfb7479e..57894da10d1 100644 --- a/tests/unittest/llmapi/test_llm_quant.py +++ b/tests/unittest/llmapi/test_llm_quant.py @@ -1,6 +1,7 @@ import pytest -from tensorrt_llm.llmapi import LLM, KvCacheConfig, SamplingParams +from tensorrt_llm._tensorrt_engine import LLM +from tensorrt_llm.llmapi import KvCacheConfig, SamplingParams from tensorrt_llm.llmapi.llm_utils import CalibConfig, QuantAlgo, QuantConfig # isort: off diff --git a/triton_backend/all_models/llmapi/tensorrt_llm/1/model.py b/triton_backend/all_models/llmapi/tensorrt_llm/1/model.py index f8ad09e8b91..b5109c8310c 100755 --- a/triton_backend/all_models/llmapi/tensorrt_llm/1/model.py +++ b/triton_backend/all_models/llmapi/tensorrt_llm/1/model.py @@ -41,10 +41,10 @@ from mpi4py.futures import MPICommExecutor from mpi4py.MPI import COMM_WORLD +from tensorrt_llm import LLM as PyTorchLLM from tensorrt_llm import SamplingParams -from tensorrt_llm._torch.llm import LLM as PyTorchLLM +from tensorrt_llm._tensorrt_engine import LLM from tensorrt_llm._utils import global_mpi_rank, global_mpi_size -from tensorrt_llm.llmapi import LLM from tensorrt_llm.llmapi.llm_utils import update_llm_args_with_extra_dict From 302d4b1d784c73be49bcd3a8a8263f99e39344bf Mon Sep 17 00:00:00 2001 From: Superjomn <328693+Superjomn@users.noreply.github.com> Date: Wed, 18 Jun 2025 14:41:42 +0000 Subject: [PATCH 2/2] fix Signed-off-by: Superjomn <328693+Superjomn@users.noreply.github.com> --- docs/source/torch.md | 6 +- docs/source/torch/adding_new_model.md | 2 +- docs/source/torch/arch_overview.md | 4 +- examples/pytorch/out_of_tree_example/main.py | 2 +- examples/pytorch/quickstart.py | 3 +- examples/pytorch/quickstart_advanced.py | 3 +- examples/pytorch/star_attention.py | 3 +- tensorrt_llm/_torch/__init__.py | 3 +- tensorrt_llm/bench/build/build.py | 2 +- tensorrt_llm/llmapi/llm.py | 24 +++ tensorrt_llm/llmapi/llm_args.py | 10 +- .../accuracy/test_disaggregated_serving.py | 1 - .../defs/accuracy/test_llm_api_pytorch.py | 2 +- .../test_disaggregated_single_gpu.py | 3 +- tests/integration/defs/test_e2e.py | 3 +- .../unit/singlegpu/shim/test_llm_config.py | 3 +- .../_torch/modeling/test_modeling_deepseek.py | 3 +- .../modeling/test_modeling_nemotron_h.py | 2 +- .../modeling/test_modeling_out_of_tree.py | 2 +- .../_torch/multi_gpu/test_star_attention.py | 3 +- .../multi_gpu_modeling/test_deepseek.py | 3 +- .../_torch/multi_gpu_modeling/test_llama4.py | 3 +- .../_torch/speculative/test_draft_target.py | 3 +- .../_torch/speculative/test_eagle3.py | 3 +- .../unittest/_torch/speculative/test_ngram.py | 3 +- .../unittest/_torch/test_overlap_scheduler.py | 3 +- tests/unittest/_torch/test_return_logits.py | 3 +- tests/unittest/_torch/test_trtllm_sampler.py | 3 +- .../api_stability/api_stability_core.py | 10 +- .../api_stability/references/llm.yaml | 159 +++++++++--------- .../references_committed/llm.yaml | 11 +- tests/unittest/api_stability/test_llm_api.py | 20 ++- .../apps/_test_openai_consistent_chat.py | 2 +- tests/unittest/llmapi/run_llm.py | 2 +- 
tests/unittest/llmapi/test_llm.py | 12 +- tests/unittest/llmapi/test_llm_args.py | 10 +- tests/unittest/llmapi/test_llm_pytorch.py | 24 +-- tests/unittest/llmapi/test_llm_utils.py | 11 +- 38 files changed, 194 insertions(+), 175 deletions(-) diff --git a/docs/source/torch.md b/docs/source/torch.md index 99305f638d4..da59e90d88c 100644 --- a/docs/source/torch.md +++ b/docs/source/torch.md @@ -11,7 +11,7 @@ The PyTorch backend of TensorRT-LLM is available in version 0.17 and later. You ## Quick Start -Here is a simple example to show how to use `tensorrt_llm._torch.LLM` API with Llama model. +Here is a simple example to show how to use `tensorrt_llm.LLM` API with Llama model. ```{literalinclude} ../../examples/pytorch/quickstart.py :language: python @@ -24,7 +24,7 @@ The PyTorch backend supports FP8 and NVFP4 quantization. You can pass quantized which are generated by [TensorRT Model Optimizer](https://github.com/NVIDIA/TensorRT-Model-Optimizer). ```python -from tensorrt_llm._torch import LLM +from tensorrt_llm import LLM llm = LLM(model='nvidia/Llama-3.1-8B-Instruct-FP8') llm.generate("Hello, my name is") ``` @@ -44,7 +44,7 @@ The PyTorch backend supports most of the sampling features that are supported on In order to use this feature, it is necessary to enable option `enable_trtllm_sampler` in the `LLM` class, and pass a `SamplingParams` object with the desired options as well. The following example prepares two identical prompts which will give different results due to the sampling parameters chosen: ```python -from tensorrt_llm._torch import LLM +from tensorrt_llm import LLM llm = LLM(model='nvidia/Llama-3.1-8B-Instruct-FP8', enable_trtllm_sampler=True) sampling_params = SamplingParams( diff --git a/docs/source/torch/adding_new_model.md b/docs/source/torch/adding_new_model.md index 4ce5988c99c..63217241e73 100644 --- a/docs/source/torch/adding_new_model.md +++ b/docs/source/torch/adding_new_model.md @@ -186,7 +186,7 @@ __all__ = [ Alternatively, you can register the new model as an out-of-tree model, so that you can use the new model without touching the TensorRT-LLM codebase. To do so, place `modeling_mymodel.py` (and potentially `configuration_mymodel.py`) in your working directory, and import the modeling code in your script: ```python -from tensorrt_llm._torch import LLM +from tensorrt_llm import LLM import modeling_mymodel def main(): diff --git a/docs/source/torch/arch_overview.md b/docs/source/torch/arch_overview.md index f48403d2d85..11b12781cea 100644 --- a/docs/source/torch/arch_overview.md +++ b/docs/source/torch/arch_overview.md @@ -5,10 +5,10 @@ Besides TensorRT, PyTorch can also serve as the backend for TensorRT-LLM. This d ## Top Level API -The interface for PyTorch backend is `tensorrt._torch.LLM`. +The interface for PyTorch backend is `tensorrt_llm.LLM`. 
```python -from tensorrt_llm._torch import LLM +from tensorrt_llm import LLM llm = LLM(model=) ``` diff --git a/examples/pytorch/out_of_tree_example/main.py b/examples/pytorch/out_of_tree_example/main.py index 430bed126f3..afa943c3422 100644 --- a/examples/pytorch/out_of_tree_example/main.py +++ b/examples/pytorch/out_of_tree_example/main.py @@ -1,6 +1,6 @@ import modeling_opt # noqa -from tensorrt_llm._torch import LLM +from tensorrt_llm import LLM def main(): diff --git a/examples/pytorch/quickstart.py b/examples/pytorch/quickstart.py index 9c81a965e5a..b4f313ff192 100644 --- a/examples/pytorch/quickstart.py +++ b/examples/pytorch/quickstart.py @@ -1,5 +1,4 @@ -from tensorrt_llm import SamplingParams -from tensorrt_llm._torch import LLM +from tensorrt_llm import LLM, SamplingParams def main(): diff --git a/examples/pytorch/quickstart_advanced.py b/examples/pytorch/quickstart_advanced.py index 6e755ba8a3b..29c9bb10186 100644 --- a/examples/pytorch/quickstart_advanced.py +++ b/examples/pytorch/quickstart_advanced.py @@ -1,7 +1,6 @@ import argparse -from tensorrt_llm import SamplingParams -from tensorrt_llm._torch import LLM +from tensorrt_llm import LLM, SamplingParams from tensorrt_llm.llmapi import (DraftTargetDecodingConfig, EagleDecodingConfig, KvCacheConfig, MTPDecodingConfig, NGramDecodingConfig, TorchCompileConfig) diff --git a/examples/pytorch/star_attention.py b/examples/pytorch/star_attention.py index fe861ad4f46..e6071054fe4 100644 --- a/examples/pytorch/star_attention.py +++ b/examples/pytorch/star_attention.py @@ -6,8 +6,7 @@ import torch -from tensorrt_llm import SamplingParams -from tensorrt_llm._torch import LLM +from tensorrt_llm import LLM, SamplingParams from tensorrt_llm.models.modeling_utils import QuantAlgo, QuantConfig diff --git a/tensorrt_llm/_torch/__init__.py b/tensorrt_llm/_torch/__init__.py index 7c2d021b1c4..7d2de6d643c 100644 --- a/tensorrt_llm/_torch/__init__.py +++ b/tensorrt_llm/_torch/__init__.py @@ -1,3 +1,4 @@ from .llm import LLM +from .model_config import MoeLoadBalancerConfig -__all__ = ["LLM"] +__all__ = ["LLM", "MoeLoadBalancerConfig"] diff --git a/tensorrt_llm/bench/build/build.py b/tensorrt_llm/bench/build/build.py index e3bd6cbef56..1ea0add7ab1 100644 --- a/tensorrt_llm/bench/build/build.py +++ b/tensorrt_llm/bench/build/build.py @@ -9,7 +9,7 @@ from tensorrt_llm.bench.utils.data import create_dataset_from_stream, initialize_tokenizer from tensorrt_llm.bench.utils import VALID_QUANT_ALGOS from tensorrt_llm.builder import BuildConfig -from tensorrt_llm.llmapi import LLM +from tensorrt_llm._tensorrt_engine import LLM from tensorrt_llm.llmapi.llm_utils import QuantConfig from tensorrt_llm.logger import logger from tensorrt_llm.quantization.mode import QuantAlgo diff --git a/tensorrt_llm/llmapi/llm.py b/tensorrt_llm/llmapi/llm.py index a1f1634f1b5..5635d4016f6 100644 --- a/tensorrt_llm/llmapi/llm.py +++ b/tensorrt_llm/llmapi/llm.py @@ -97,6 +97,7 @@ def _repr_fields(self): Attributes: tokenizer (tensorrt_llm.llmapi.tokenizer.TokenizerBase, optional): The tokenizer loaded by LLM instance, if any. + llm_id (str): The unique ID of the LLM instance. 
""" @@ -883,6 +884,9 @@ def __init__(self, # TODO: deprecate backend in LLM kwargs kwargs.pop("backend", None) + # Validate that users don't pass TrtLlmArgs-specific arguments + self._validate_args_for_torch_backend(kwargs) + super().__init__(model, tokenizer, tokenizer_mode, @@ -895,6 +899,26 @@ def __init__(self, backend='pytorch', **kwargs) + def _validate_args_for_torch_backend(self, kwargs: dict) -> None: + """Validate that users don't pass TrtLlmArgs-specific arguments when using PyTorch backend. + """ + trtllm_fields = set(TrtLlmArgs.model_fields.keys()) + torchllm_fields = set(TorchLlmArgs.model_fields.keys()) + + trtllm_specific_fields = trtllm_fields - torchllm_fields + + # Check if any TrtLlmArgs-specific arguments are passed + trtllm_specific_args = [] + for key in kwargs: + if key in trtllm_specific_fields: + trtllm_specific_args.append(key) + + if trtllm_specific_args: + raise ValueError( + f"The following arguments are specific to TensorRT backend and cannot be used with PyTorch backend: {trtllm_specific_args}.\n" + f"Please use 'from tensorrt_llm._tensorrt_engine import LLM' instead to use the TensorRT backend." + ) + class LLM(_TorchLLM): diff --git a/tensorrt_llm/llmapi/llm_args.py b/tensorrt_llm/llmapi/llm_args.py index dc74ad95ea3..228be0aed67 100644 --- a/tensorrt_llm/llmapi/llm_args.py +++ b/tensorrt_llm/llmapi/llm_args.py @@ -1591,9 +1591,6 @@ def validate_enable_build_cache(self): return self -LlmArgs = TrtLlmArgs - - class LoadFormat(Enum): AUTO = 0 # Initialize all weights randomly. @@ -1663,7 +1660,10 @@ class TorchLlmArgs(BaseLlmArgs): moe_load_balancer: Optional[Union[object, str]] = Field( default=None, description="Configuration for MoE load balancing.", - json_schema_extra={"type": "Union[MoeLoadBalancerConfig, str]"}) + json_schema_extra={ + "type": + "Union[tensorrt_llm._torch.model_config.MoeLoadBalancerConfig, str, None]" + }) attn_backend: str = Field(default='TRTLLM', description="Attention backend to use.") @@ -2081,6 +2081,8 @@ def get_model_format(model_dir: str) -> _ModelFormatKind: return model_format +LlmArgs = TorchLlmArgs + TRT_LLMARGS_EXPLICIT_DOCSTRING = generate_api_docs_as_docstring(TrtLlmArgs, indent=' ' * 4) TORCH_LLMARGS_EXPLICIT_DOCSTRING = generate_api_docs_as_docstring(TorchLlmArgs, diff --git a/tests/integration/defs/accuracy/test_disaggregated_serving.py b/tests/integration/defs/accuracy/test_disaggregated_serving.py index 30d5d55b325..ab5481913f4 100644 --- a/tests/integration/defs/accuracy/test_disaggregated_serving.py +++ b/tests/integration/defs/accuracy/test_disaggregated_serving.py @@ -83,7 +83,6 @@ def launch_disaggregated_llm(disaggregated_server_config: Dict[str, Any], yaml.dump(gen_server_config, f) args = LlmArgs.from_kwargs(model=model_name, - backend="pytorch", tensor_parallel_size=tensor_parallel_size) trtllm_serve_path = "trtllm-serve" diff --git a/tests/integration/defs/accuracy/test_llm_api_pytorch.py b/tests/integration/defs/accuracy/test_llm_api_pytorch.py index 628ca46a140..e9fa70cb338 100644 --- a/tests/integration/defs/accuracy/test_llm_api_pytorch.py +++ b/tests/integration/defs/accuracy/test_llm_api_pytorch.py @@ -14,7 +14,7 @@ # limitations under the License. 
import pytest -from tensorrt_llm._torch import LLM +from tensorrt_llm import LLM from tensorrt_llm._torch.pyexecutor.config import MoeLoadBalancerConfig from tensorrt_llm.llmapi import (EagleDecodingConfig, KvCacheConfig, MTPDecodingConfig, NGramDecodingConfig, diff --git a/tests/integration/defs/disaggregated/test_disaggregated_single_gpu.py b/tests/integration/defs/disaggregated/test_disaggregated_single_gpu.py index 6d8651cfae7..71e5744fdf4 100644 --- a/tests/integration/defs/disaggregated/test_disaggregated_single_gpu.py +++ b/tests/integration/defs/disaggregated/test_disaggregated_single_gpu.py @@ -9,8 +9,7 @@ from mpi4py import MPI from mpi4py.futures import MPIPoolExecutor -from tensorrt_llm import DisaggregatedParams, SamplingParams -from tensorrt_llm._torch import LLM +from tensorrt_llm import LLM, DisaggregatedParams, SamplingParams from tensorrt_llm._utils import set_mpi_comm from tensorrt_llm.llmapi import KvCacheConfig, MpiCommSession diff --git a/tests/integration/defs/test_e2e.py b/tests/integration/defs/test_e2e.py index fe314ee4beb..22f6035d956 100644 --- a/tests/integration/defs/test_e2e.py +++ b/tests/integration/defs/test_e2e.py @@ -2107,8 +2107,7 @@ def test_ptp_quickstart_bert(llm_root, llm_venv, model_name, model_path, import torch from transformers import AutoModelForSequenceClassification, AutoTokenizer - from tensorrt_llm import SamplingParams - from tensorrt_llm._torch import LLM + from tensorrt_llm import LLM, SamplingParams from tensorrt_llm.sampling_params import SamplingParams prompts = [ "Hello, my name is", diff --git a/tests/unittest/_torch/auto_deploy/unit/singlegpu/shim/test_llm_config.py b/tests/unittest/_torch/auto_deploy/unit/singlegpu/shim/test_llm_config.py index a256d1f57a5..0833f145dc1 100644 --- a/tests/unittest/_torch/auto_deploy/unit/singlegpu/shim/test_llm_config.py +++ b/tests/unittest/_torch/auto_deploy/unit/singlegpu/shim/test_llm_config.py @@ -128,7 +128,8 @@ def test_config_flow( # Create instance with appropriate mocking with patch.object(api_class, "_try_load_tokenizer", return_value=MagicMock()): - instance = api_class(**config_params) + with patch.object(api_class, "_build_model", return_value=MagicMock()): + instance = api_class(**config_params) # Verify args were created correctly assert hasattr(instance, "args") diff --git a/tests/unittest/_torch/modeling/test_modeling_deepseek.py b/tests/unittest/_torch/modeling/test_modeling_deepseek.py index d18ca41de0f..ae6907f30ca 100644 --- a/tests/unittest/_torch/modeling/test_modeling_deepseek.py +++ b/tests/unittest/_torch/modeling/test_modeling_deepseek.py @@ -7,8 +7,7 @@ from utils.llm_data import llm_models_root from utils.util import getSMVersion -from tensorrt_llm import SamplingParams -from tensorrt_llm._torch import LLM +from tensorrt_llm import LLM, SamplingParams from tensorrt_llm.llmapi import KvCacheConfig, MTPDecodingConfig from tensorrt_llm.llmapi.utils import get_total_gpu_memory diff --git a/tests/unittest/_torch/modeling/test_modeling_nemotron_h.py b/tests/unittest/_torch/modeling/test_modeling_nemotron_h.py index 5f3ccce3c7b..dd89a7dd0e9 100644 --- a/tests/unittest/_torch/modeling/test_modeling_nemotron_h.py +++ b/tests/unittest/_torch/modeling/test_modeling_nemotron_h.py @@ -2,7 +2,7 @@ from utils.llm_data import llm_models_root from utils.util import skip_gpu_memory_less_than -from tensorrt_llm._torch import LLM +from tensorrt_llm import LLM from tensorrt_llm.llmapi import KvCacheConfig from tensorrt_llm.llmapi.llm import RequestOutput from tensorrt_llm.sampling_params 
import SamplingParams diff --git a/tests/unittest/_torch/modeling/test_modeling_out_of_tree.py b/tests/unittest/_torch/modeling/test_modeling_out_of_tree.py index eb1bc220f12..f6dcfdf39b3 100644 --- a/tests/unittest/_torch/modeling/test_modeling_out_of_tree.py +++ b/tests/unittest/_torch/modeling/test_modeling_out_of_tree.py @@ -2,7 +2,7 @@ from parameterized import parameterized -from tensorrt_llm._torch import LLM +from tensorrt_llm import LLM from tensorrt_llm.llmapi import KvCacheConfig from tensorrt_llm.sampling_params import SamplingParams diff --git a/tests/unittest/_torch/multi_gpu/test_star_attention.py b/tests/unittest/_torch/multi_gpu/test_star_attention.py index 7c387cd5ccf..9cf12a2c281 100644 --- a/tests/unittest/_torch/multi_gpu/test_star_attention.py +++ b/tests/unittest/_torch/multi_gpu/test_star_attention.py @@ -5,8 +5,7 @@ import torch from utils.llm_data import llm_models_root -from tensorrt_llm import SamplingParams -from tensorrt_llm._torch import LLM +from tensorrt_llm import LLM, SamplingParams from tensorrt_llm.llmapi import KvCacheConfig from tensorrt_llm.llmapi.utils import get_total_gpu_memory from tensorrt_llm.models.modeling_utils import QuantAlgo, QuantConfig diff --git a/tests/unittest/_torch/multi_gpu_modeling/test_deepseek.py b/tests/unittest/_torch/multi_gpu_modeling/test_deepseek.py index b73655719fa..678f91880b7 100644 --- a/tests/unittest/_torch/multi_gpu_modeling/test_deepseek.py +++ b/tests/unittest/_torch/multi_gpu_modeling/test_deepseek.py @@ -7,8 +7,7 @@ from utils.llm_data import llm_models_root from utils.util import getSMVersion -from tensorrt_llm import SamplingParams -from tensorrt_llm._torch import LLM +from tensorrt_llm import LLM, SamplingParams from tensorrt_llm.llmapi import KvCacheConfig from tensorrt_llm.llmapi.utils import get_total_gpu_memory diff --git a/tests/unittest/_torch/multi_gpu_modeling/test_llama4.py b/tests/unittest/_torch/multi_gpu_modeling/test_llama4.py index 707e1936ac8..017452a7d63 100644 --- a/tests/unittest/_torch/multi_gpu_modeling/test_llama4.py +++ b/tests/unittest/_torch/multi_gpu_modeling/test_llama4.py @@ -4,8 +4,7 @@ import torch from utils.llm_data import llm_models_root -from tensorrt_llm import SamplingParams -from tensorrt_llm._torch import LLM +from tensorrt_llm import LLM, SamplingParams from tensorrt_llm.llmapi import KvCacheConfig diff --git a/tests/unittest/_torch/speculative/test_draft_target.py b/tests/unittest/_torch/speculative/test_draft_target.py index 6802457f589..4c230a431c9 100644 --- a/tests/unittest/_torch/speculative/test_draft_target.py +++ b/tests/unittest/_torch/speculative/test_draft_target.py @@ -5,8 +5,7 @@ import pytest import torch -from tensorrt_llm import SamplingParams -from tensorrt_llm._torch import LLM +from tensorrt_llm import LLM, SamplingParams from tensorrt_llm.llmapi import DraftTargetDecodingConfig, KvCacheConfig sys.path.append(os.path.join(os.path.dirname(__file__), '..')) diff --git a/tests/unittest/_torch/speculative/test_eagle3.py b/tests/unittest/_torch/speculative/test_eagle3.py index 48540c708f7..d0d4c424ed1 100644 --- a/tests/unittest/_torch/speculative/test_eagle3.py +++ b/tests/unittest/_torch/speculative/test_eagle3.py @@ -5,8 +5,7 @@ import pytest import torch -from tensorrt_llm import SamplingParams -from tensorrt_llm._torch import LLM +from tensorrt_llm import LLM, SamplingParams from tensorrt_llm.llmapi import EagleDecodingConfig, KvCacheConfig sys.path.append(os.path.join(os.path.dirname(__file__), '..')) diff --git 
a/tests/unittest/_torch/speculative/test_ngram.py b/tests/unittest/_torch/speculative/test_ngram.py index 6db927454b8..0e3e227d7fd 100644 --- a/tests/unittest/_torch/speculative/test_ngram.py +++ b/tests/unittest/_torch/speculative/test_ngram.py @@ -5,8 +5,7 @@ import pytest import torch -from tensorrt_llm import SamplingParams -from tensorrt_llm._torch import LLM +from tensorrt_llm import LLM, SamplingParams from tensorrt_llm.llmapi import KvCacheConfig, NGramDecodingConfig sys.path.append(os.path.join(os.path.dirname(__file__), '..')) diff --git a/tests/unittest/_torch/test_overlap_scheduler.py b/tests/unittest/_torch/test_overlap_scheduler.py index 18622f94cbd..be105e96e94 100644 --- a/tests/unittest/_torch/test_overlap_scheduler.py +++ b/tests/unittest/_torch/test_overlap_scheduler.py @@ -4,8 +4,7 @@ import pytest from utils.llm_data import llm_models_root -from tensorrt_llm import SamplingParams -from tensorrt_llm._torch import LLM +from tensorrt_llm import LLM, SamplingParams from tensorrt_llm.llmapi import KvCacheConfig as TRT_KvCacheConfig diff --git a/tests/unittest/_torch/test_return_logits.py b/tests/unittest/_torch/test_return_logits.py index 2fa21ad4179..a9e0b1a430f 100644 --- a/tests/unittest/_torch/test_return_logits.py +++ b/tests/unittest/_torch/test_return_logits.py @@ -5,8 +5,7 @@ from utils.llm_data import llm_models_root from utils.util import force_ampere -from tensorrt_llm import SamplingParams -from tensorrt_llm._torch import LLM +from tensorrt_llm import LLM, SamplingParams from tensorrt_llm.llmapi.llm_utils import BuildConfig, KvCacheConfig prompts = ["A B C"] diff --git a/tests/unittest/_torch/test_trtllm_sampler.py b/tests/unittest/_torch/test_trtllm_sampler.py index bee47efddaf..c4493c266c8 100644 --- a/tests/unittest/_torch/test_trtllm_sampler.py +++ b/tests/unittest/_torch/test_trtllm_sampler.py @@ -5,8 +5,7 @@ from utils.llm_data import llm_models_root from utils.util import similar -from tensorrt_llm import SamplingParams -from tensorrt_llm._torch import LLM +from tensorrt_llm import LLM, SamplingParams from tensorrt_llm.llmapi import KvCacheConfig as TRT_KvCacheConfig diff --git a/tests/unittest/api_stability/api_stability_core.py b/tests/unittest/api_stability/api_stability_core.py index e0cde0e4af4..1014f1a22fa 100644 --- a/tests/unittest/api_stability/api_stability_core.py +++ b/tests/unittest/api_stability/api_stability_core.py @@ -17,7 +17,7 @@ from pydantic import BaseModel import tensorrt_llm -from tensorrt_llm._tensorrt_engine import LLM +from tensorrt_llm import LLM from tensorrt_llm.executor import GenerationResult from tensorrt_llm.executor.result import TokenLogprobs from tensorrt_llm.llmapi import (CalibConfig, CompletionOutput, @@ -367,8 +367,14 @@ def assert_equal(self, other: 'ClassSnapshot'): if self.properties.keys() != other.properties.keys(): diff_keys = set(self.properties.keys()) ^ set( other.properties.keys()) + this_diff_keys = set(self.properties.keys()) - set( + other.properties.keys()) + other_diff_keys = set(other.properties.keys()) - set( + self.properties.keys()) raise AssertionError( - f"{qual_name} has different properties: {diff_keys}") + f"{qual_name} has different properties: {diff_keys}\n" + f"This class has extra properties: {this_diff_keys}\n" + f"The reference has extra properties: {other_diff_keys}") for name, prop in self.properties.items(): with StackTrace().push(name): diff --git a/tests/unittest/api_stability/references/llm.yaml b/tests/unittest/api_stability/references/llm.yaml index a388a141532..801f624c1ce 
100644 --- a/tests/unittest/api_stability/references/llm.yaml +++ b/tests/unittest/api_stability/references/llm.yaml @@ -1,90 +1,45 @@ methods: __init__: parameters: - # Parallelism + # Parallelism + gpus_per_node: + annotation: Optional[int] + default: null + moe_cluster_parallel_size: + annotation: Optional[int] + default: null + enable_attention_dp: + annotation: bool + default: False cp_config: annotation: Optional[dict] default: null - auto_parallel: - annotation: bool - default: false - auto_parallel_world_size: + # Stats + iter_stats_max_iterations: annotation: Optional[int] default: null - embedding_parallel_mode: - annotation: str - default: SHARDING_ALONG_VOCAB - moe_cluster_parallel_size: + request_stats_max_iterations: annotation: Optional[int] default: null - # Engine building - build_config: - annotation: Optional[tensorrt_llm.builder.BuildConfig] - default: null - enable_build_cache: - annotation: Union[tensorrt_llm.llmapi.build_cache.BuildCacheConfig, bool] - default: false - fast_build: - annotation: bool - default: false # Bindings and mirrored configs - batching_type: - annotation: Optional[tensorrt_llm.llmapi.llm_args.BatchingType] - default: null peft_cache_config: annotation: Optional[tensorrt_llm.llmapi.llm_args.PeftCacheConfig] default: null scheduler_config: annotation: tensorrt_llm.llmapi.llm_args.SchedulerConfig default: null - extended_runtime_perf_knob_config: - annotation: Optional[tensorrt_llm.llmapi.llm_args.ExtendedRuntimePerfKnobConfig] - default: null - decoding_config: - annotation: Optional[tensorrt_llm.llmapi.llm_args.DecodingConfig] - default: null cache_transceiver_config: annotation: Optional[tensorrt_llm.llmapi.llm_args.CacheTransceiverConfig] default: null - # Misc - backend: - annotation: Optional[str] + batching_type: + annotation: Optional[tensorrt_llm.llmapi.llm_args.BatchingType] default: null - enable_attention_dp: - annotation: bool - default: false normalize_log_probs: annotation: bool - default: false + default: False gather_generation_logits: annotation: bool - default: false - gpus_per_node: - annotation: Optional[int] - default: null - iter_stats_max_iterations: - annotation: Optional[int] - default: null - request_stats_max_iterations: - annotation: Optional[int] - default: null - workspace: - annotation: Optional[str] - default: null - # LoRA - max_lora_rank: - annotation: Optional[int] - default: null - max_loras: - annotation: int - default: 4 - max_cpu_loras: - annotation: int - default: 4 - allreduce_strategy: - annotation: Optional[Literal['AUTO', 'NCCL', 'UB', 'MINLATENCY', 'ONESHOT', 'TWOSHOT', 'LOWPRECISION', 'MNNVL']] - default: AUTO - # postproc worker + default: False num_postprocess_workers: annotation: int default: 0 @@ -98,10 +53,73 @@ methods: reasoning_parser: annotation: Optional[str] default: null - # kwargs - kwargs: - annotation: Any - default: inspect._empty + garbage_collection_gen0_threshold: + annotation: int + default: 20000 + # Misc + backend: + annotation: Optional[str] + default: null + build_config: + annotation: Optional[tensorrt_llm.llmapi.llm_args.BuildConfig] + default: null + use_cuda_graph: + annotation: bool + default: False + cuda_graph_batch_sizes: + annotation: Optional[List[int]] + default: null + cuda_graph_max_batch_size: + annotation: int + default: 0 + cuda_graph_padding_enabled: + annotation: bool + default: False + disable_overlap_scheduler: + annotation: bool + default: False + moe_max_num_tokens: + annotation: Optional[int] + default: null + moe_load_balancer: + annotation: 
Union[tensorrt_llm._torch.MoeLoadBalancerConfig, str, None] + default: null + attn_backend: + annotation: str + default: TRTLLM + moe_backend: + annotation: str + default: CUTLASS + mixed_sampler: + annotation: bool + default: False + enable_trtllm_sampler: + annotation: bool + default: False + kv_cache_dtype: + annotation: str + default: auto + enable_iter_perf_stats: + annotation: bool + default: False + enable_iter_req_stats: + annotation: bool + default: False + print_iter_log: + annotation: bool + default: False + torch_compile_config: + annotation: Optional[tensorrt_llm.llmapi.llm_args.TorchCompileConfig] + default: null + autotuner_enabled: + annotation: bool + default: True + enable_layerwise_nvtx_marker: + annotation: bool + default: False + enable_min_latency: + annotation: bool + default: False return_annotation: None generate: parameters: @@ -145,19 +163,10 @@ methods: annotation: Optional[float] default: 2 return_annotation: tensorrt_llm.executor.result.IterationResult - save: - parameters: - engine_dir: - annotation: str - default: inspect._empty - return_annotation: None shutdown: parameters: {} return_annotation: None properties: - workspace: - annotation: pathlib.Path - default: inspect._empty llm_id: annotation: str default: inspect._empty diff --git a/tests/unittest/api_stability/references_committed/llm.yaml b/tests/unittest/api_stability/references_committed/llm.yaml index e74e6b8d840..a30e62645fe 100644 --- a/tests/unittest/api_stability/references_committed/llm.yaml +++ b/tests/unittest/api_stability/references_committed/llm.yaml @@ -95,8 +95,8 @@ methods: default: null # Misc load_format: - annotation: Literal['auto', 'dummy'] - default: auto + annotation: Union[str, tensorrt_llm.llmapi.llm_args.LoadFormat] + default: 0 enable_tqdm: annotation: bool default: false @@ -106,9 +106,10 @@ methods: kv_cache_config: annotation: tensorrt_llm.llmapi.llm_args.KvCacheConfig default: null - garbage_collection_gen0_threshold: - annotation: int - default: 20000 + + kwargs: + annotation: Any + default: inspect._empty return_annotation: None generate: parameters: diff --git a/tests/unittest/api_stability/test_llm_api.py b/tests/unittest/api_stability/test_llm_api.py index 1b8fc1dddf0..6960f993286 100644 --- a/tests/unittest/api_stability/test_llm_api.py +++ b/tests/unittest/api_stability/test_llm_api.py @@ -5,8 +5,9 @@ from api_stability_core import (ApiStabilityTestHarness, ClassSnapshot, MethodSnapshot) -from tensorrt_llm._tensorrt_engine import LLM +from tensorrt_llm import LLM from tensorrt_llm.bindings import executor as tllme +from tensorrt_llm.executor.result import IterationResult from tensorrt_llm.llmapi import (CalibConfig, CompletionOutput, GuidedDecodingParams, QuantConfig, RequestOutput) @@ -131,21 +132,28 @@ def test_new_method(self, mocker): def test_modified_method_with_same_signature(self, mocker): - def new_save(self, engine_dir: str) -> None: + def new_get_stats_async(self, + timeout: Optional[float] = 2 + ) -> IterationResult: pass - new_save.__doc__ = self.TEST_CLASS.save.__doc__ + new_get_stats_async.__doc__ = self.TEST_CLASS.get_stats_async.__doc__ - mocker.patch.object(self.TEST_CLASS, "save", new=new_save) + mocker.patch.object(self.TEST_CLASS, + "get_stats_async", + new=new_get_stats_async) self.test_signature() self.test_docstring() def test_modified_method_with_modified_signature(self, mocker): - def new_save(self, engine_dir: Optional[str]) -> None: + def new_get_stats_async(self, + timeout: Optional[int] = 2) -> IterationResult: pass - 
mocker.patch.object(self.TEST_CLASS, "save", new=new_save) + mocker.patch.object(self.TEST_CLASS, + "get_stats_async", + new=new_get_stats_async) with pytest.raises(AssertionError): self.test_signature() with pytest.raises(AssertionError): diff --git a/tests/unittest/llmapi/apps/_test_openai_consistent_chat.py b/tests/unittest/llmapi/apps/_test_openai_consistent_chat.py index 6c4ed3f1efd..a3e716cd40e 100644 --- a/tests/unittest/llmapi/apps/_test_openai_consistent_chat.py +++ b/tests/unittest/llmapi/apps/_test_openai_consistent_chat.py @@ -9,8 +9,8 @@ from utils.util import (skip_gpu_memory_less_than_40gb, skip_num_gpus_less_than, skip_nvlink_inactive) +from tensorrt_llm import LLM from tensorrt_llm.llmapi import BuildConfig -from tensorrt_llm.llmapi.llm import LLM from ..test_llm import get_model_path from .openai_server import RemoteOpenAIServer diff --git a/tests/unittest/llmapi/run_llm.py b/tests/unittest/llmapi/run_llm.py index be0fa122e60..5b864360b3f 100644 --- a/tests/unittest/llmapi/run_llm.py +++ b/tests/unittest/llmapi/run_llm.py @@ -4,6 +4,7 @@ import click +from tensorrt_llm import LLM as TorchLLM from tensorrt_llm._tensorrt_engine import LLM from tensorrt_llm.llmapi import KvCacheConfig, SamplingParams @@ -21,7 +22,6 @@ def main(model_dir: str, tp_size: int, engine_dir: Optional[str], n: int, best_of: Optional[int], top_k: int, use_beam_search: bool, use_pytorch: bool): if use_pytorch: - from tensorrt_llm._torch.llm import LLM as TorchLLM llm = TorchLLM( model_dir, tensor_parallel_size=tp_size, diff --git a/tests/unittest/llmapi/test_llm.py b/tests/unittest/llmapi/test_llm.py index 44aeee9f97d..33a458a1234 100644 --- a/tests/unittest/llmapi/test_llm.py +++ b/tests/unittest/llmapi/test_llm.py @@ -25,6 +25,7 @@ import transformers from utils.util import skip_single_gpu +from tensorrt_llm import LLM as LLM_torch from tensorrt_llm._tensorrt_engine import LLM from tensorrt_llm.bindings import executor as tllm from tensorrt_llm.executor import (GenerationExecutorWorker, LoRARequest, @@ -33,9 +34,10 @@ KvCacheConfig, KvCacheRetentionConfig, LookaheadDecodingConfig, MedusaDecodingConfig, RequestOutput) +from tensorrt_llm.llmapi import TrtLlmArgs as LlmArgs from tensorrt_llm.llmapi.llm_args import DynamicBatchConfig, SchedulerConfig -from tensorrt_llm.llmapi.llm_utils import (BuildConfig, LlmArgs, QuantAlgo, - QuantConfig, _ParallelConfig) +from tensorrt_llm.llmapi.llm_utils import (BuildConfig, QuantAlgo, QuantConfig, + _ParallelConfig) from tensorrt_llm.llmapi.tokenizer import TokenizerBase, TransformersTokenizer from tensorrt_llm.llmapi.utils import get_total_gpu_memory from tensorrt_llm.lora_manager import LoraConfig @@ -119,7 +121,6 @@ def llm_test_harness(model_dir: str, tokenizer = model_dir if backend == "pytorch": - from tensorrt_llm._torch import LLM as LLM_torch llm = LLM_torch(model_dir, tokenizer=tokenizer, **llm_kwargs) else: llm = LLM(model_dir, tokenizer=tokenizer, **llm_kwargs) @@ -1597,7 +1598,6 @@ def llm_return_logprobs_test_harness(prompt_logprobs: Optional[int], LLM_CLASS = LLM llm_args_extra = {} if backend in ["pytorch", "autodeploy"]: - from tensorrt_llm._torch import LLM as LLM_torch LLM_CLASS = LLM_torch else: llm_args_extra["fast_build"] = True @@ -1840,7 +1840,6 @@ def llm_get_stats_test_harness(tp_size: int = 1, sampling_args_extra["return_context_logits"] = True if pytorch_backend: - from tensorrt_llm._torch import LLM as LLM_torch llm_args_extra.update( dict(enable_iter_perf_stats=True, enable_iter_req_stats=enable_iter_req_stats, @@ -1895,8 +1894,6 @@ 
def test_llm_get_queued_stats(): llm_args_extra = {} sampling_args_extra = {} - from tensorrt_llm._torch import LLM as LLM_torch - llm_args_extra.update( dict(enable_iter_perf_stats=True, enable_iter_req_stats=enable_iter_req_stats, @@ -1968,7 +1965,6 @@ def llm_get_stats_async_test_harness(tp_size: int = 1, sampling_args_extra["return_context_logits"] = True if pytorch_backend: - from tensorrt_llm._torch import LLM as LLM_torch llm_args_extra.update( dict(enable_iter_perf_stats=True, enable_iter_req_stats=enable_iter_req_stats, diff --git a/tests/unittest/llmapi/test_llm_args.py b/tests/unittest/llmapi/test_llm_args.py index 84da13cdc4b..8406dfc8f31 100644 --- a/tests/unittest/llmapi/test_llm_args.py +++ b/tests/unittest/llmapi/test_llm_args.py @@ -5,8 +5,8 @@ import yaml import tensorrt_llm.bindings.executor as tle +from tensorrt_llm import LLM as TorchLLM from tensorrt_llm._tensorrt_engine import LLM -from tensorrt_llm._torch.llm import LLM as TorchLLM from tensorrt_llm.llmapi.llm_args import * from tensorrt_llm.llmapi.utils import print_traceback_on_error @@ -54,10 +54,10 @@ def test_update_llm_args_with_extra_dict_with_speculative_config(): f.seek(0) dict_content = yaml.safe_load(f) - llm_args = LlmArgs(model=llama_model_path) + llm_args = TrtLlmArgs(model=llama_model_path) llm_args_dict = update_llm_args_with_extra_dict(llm_args.to_dict(), dict_content) - llm_args = LlmArgs(**llm_args_dict) + llm_args = TrtLlmArgs(**llm_args_dict) assert llm_args.speculative_config.max_window_size == 4 assert llm_args.speculative_config.max_ngram_size == 3 assert llm_args.speculative_config.max_verification_set_size == 4 @@ -226,10 +226,10 @@ class TestTrtLlmArgs: def test_dynamic_setattr(self): with pytest.raises(pydantic_core._pydantic_core.ValidationError): - args = LlmArgs(model=llama_model_path, invalid_arg=1) + args = TrtLlmArgs(model=llama_model_path, invalid_arg=1) with pytest.raises(ValueError): - args = LlmArgs(model=llama_model_path) + args = TrtLlmArgs(model=llama_model_path) args.invalid_arg = 1 diff --git a/tests/unittest/llmapi/test_llm_pytorch.py b/tests/unittest/llmapi/test_llm_pytorch.py index 65f3d16ac69..b5f32547ce1 100644 --- a/tests/unittest/llmapi/test_llm_pytorch.py +++ b/tests/unittest/llmapi/test_llm_pytorch.py @@ -1,5 +1,6 @@ import pytest +from tensorrt_llm import LLM from tensorrt_llm.llmapi.tokenizer import TransformersTokenizer from tensorrt_llm.sampling_params import SamplingParams @@ -71,9 +72,7 @@ def test_llm_get_stats_async(return_context_logits, use_overlap, SamplingParams() # pytorch only supports n=1 ]) def test_llm_abort_request(sampling_params): - from tensorrt_llm._torch import LLM as LLM_torch - llm = LLM_torch(model=llama_model_path, - kv_cache_config=global_kvcache_config) + llm = LLM(model=llama_model_path, kv_cache_config=global_kvcache_config) run_llm_abort_request(llm=llm, sampling_params=sampling_params) @@ -82,10 +81,9 @@ def test_llm_reward_model(): tokenizer = TransformersTokenizer.from_pretrained(rm_model_path) tokenized_input = tokenizer(prompts, return_tensors="pt")["input_ids"] - from tensorrt_llm._torch import LLM as LLM_torch - llm = LLM_torch(model=rm_model_path, - attn_backend="VANILLA", - disable_overlap_scheduler=True) + llm = LLM(model=rm_model_path, + attn_backend="VANILLA", + disable_overlap_scheduler=True) sampling_params = SamplingParams(return_context_logits=True) @@ -106,8 +104,6 @@ def test_llm_with_postprocess_parallel_and_result_handler(streaming): def llama_v2_13b_lora_test_harness(**llm_kwargs) -> None: - from 
tensorrt_llm._torch.llm import LLM - lora_config = LoraConfig(lora_dir=[ f"{llm_models_root()}/llama-models-v2/chinese-llama-2-lora-13b" ], @@ -134,8 +130,6 @@ def llama_v2_13b_lora_test_harness(**llm_kwargs) -> None: def llama_7b_multi_lora_test_harness(**llm_kwargs) -> None: - from tensorrt_llm._torch.llm import LLM - hf_model_dir = f"{llm_models_root()}/llama-models/llama-7b-hf" hf_lora_dir1 = f"{llm_models_root()}/llama-models/luotuo-lora-7b-0.1" hf_lora_dir2 = f"{llm_models_root()}/llama-models/Japanese-Alpaca-LoRA-7b-v0" @@ -181,8 +175,6 @@ def test_llama_v2_13b_lora(): @skip_gpu_memory_less_than_40gb def test_llama_7b_lora_default_modules() -> None: - from tensorrt_llm._torch.llm import LLM - lora_config = LoraConfig(max_lora_rank=64) hf_model_dir = f"{llm_models_root()}/llama-models/llama-7b-hf" @@ -214,8 +206,6 @@ def test_llama_7b_multi_lora(): # https://jirasw.nvidia.com/browse/TRTLLM-5045 @skip_gpu_memory_less_than_138gb def test_nemotron_nas_lora() -> None: - from tensorrt_llm._torch.llm import LLM - lora_config = LoraConfig(lora_dir=[ f"{llm_models_root()}/nemotron-nas/Llama-3_3-Nemotron-Super-49B-v1-lora-adapter_r64" ], @@ -248,8 +238,6 @@ def test_nemotron_nas_lora() -> None: @skip_gpu_memory_less_than_80gb def test_codellama_fp8_with_bf16_lora() -> None: - from tensorrt_llm._torch.llm import LLM - model_dir = f"{llm_models_root()}/codellama/CodeLlama-7b-Instruct-hf/" quant_config = QuantConfig(quant_algo=QuantAlgo.FP8, kv_cache_quant_algo=QuantAlgo.FP8) @@ -308,8 +296,6 @@ def test_codellama_fp8_with_bf16_lora() -> None: @skip_gpu_memory_less_than_80gb def test_bielik_11b_v2_2_instruct_multi_lora() -> None: - from tensorrt_llm._torch.llm import LLM - model_dir = f"{llm_models_root()}/Bielik-11B-v2.2-Instruct" target_modules = ['attn_q', 'attn_k', 'attn_v'] diff --git a/tests/unittest/llmapi/test_llm_utils.py b/tests/unittest/llmapi/test_llm_utils.py index ac50a8fbf03..7caa16d7001 100644 --- a/tests/unittest/llmapi/test_llm_utils.py +++ b/tests/unittest/llmapi/test_llm_utils.py @@ -37,9 +37,10 @@ def build_engine(): def test_CachedModelLoader(): # CachedModelLoader enables engine caching and multi-gpu building - args = LlmArgs(model=llama_model_path, - kv_cache_config=KvCacheConfig(free_gpu_memory_fraction=0.4), - enable_build_cache=True) + args = TrtLlmArgs( + model=llama_model_path, + kv_cache_config=KvCacheConfig(free_gpu_memory_fraction=0.4), + enable_build_cache=True) stats = LlmBuildStats() model_loader = CachedModelLoader(args, llm_build_stats=stats) engine_dir, _ = model_loader() @@ -51,9 +52,9 @@ def test_CachedModelLoader(): def test_LlmArgs_default_gpus_per_node(): # default - llm_args = LlmArgs(model=llama_model_path) + llm_args = TrtLlmArgs(model=llama_model_path) assert llm_args.gpus_per_node == torch.cuda.device_count() # set explicitly - llm_args = LlmArgs(model=llama_model_path, gpus_per_node=6) + llm_args = TrtLlmArgs(model=llama_model_path, gpus_per_node=6) assert llm_args.gpus_per_node == 6
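
Taken together, the test updates above assume the following import split, with the PyTorch backend as the default; below is a minimal sketch of that convention (the checkpoint path, prompt, and sampling values are placeholders, not taken from this patch):

    # After this change, tensorrt_llm.LLM is the PyTorch-backend LLM by default.
    from tensorrt_llm import LLM, SamplingParams
    # The TensorRT-engine LLM remains importable from the private module:
    # from tensorrt_llm._tensorrt_engine import LLM as TrtLLM

    llm = LLM(model="/path/to/hf-model")  # placeholder checkpoint path
    outputs = llm.generate(["Hello, my name is"],
                           SamplingParams(max_tokens=32))
    print(outputs[0].outputs[0].text)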