chore: partition LLM class into TorchLLM and TrtLLM (#4900)

Superjomn · web-flow · commit 724e49525411 · 2025-06-18T14:01:25.000+08:00
Signed-off-by: Superjomn <328693+Superjomn@users.noreply.github.com>
diff --git a/docs/source/helper.py b/docs/source/helper.py
@@ -149,11 +149,18 @@ def generate_llmapi():
     content = underline("API Reference", "-") + "\n\n"
     for cls_name in public_classes_names:
         cls_name = cls_name.strip()
-        content += (f".. autoclass:: tensorrt_llm.llmapi.{cls_name}\n"
-                    "    :members:\n"
-                    "    :undoc-members:\n"
-                    "    :special-members: __init__\n"
-                    "    :show-inheritance:\n")
+        options = [
+            "    :members:", "    :undoc-members:", "    :show-inheritance:"
+        ]
+
+        if cls_name != 'LLM':  # Conditionally add :special-members: __init__
+            options.append("    :special-members: __init__")
+
+        if cls_name in ['TrtLLM', 'TorchLLM', 'LLM']:
+            options.append("    :inherited-members:")
+
+        content += f".. autoclass:: tensorrt_llm.llmapi.{cls_name}\n"
+        content += "\n".join(options) + "\n\n"
 
     with open(doc_path, "w+") as f:
         f.write(content)
diff --git a/tensorrt_llm/_torch/llm.py b/tensorrt_llm/_torch/llm.py
@@ -1,29 +1,3 @@
-from pathlib import Path
-from typing import Any, Literal, Optional, Union
+from tensorrt_llm.llmapi.llm import _TorchLLM as LLM
 
-from transformers import PreTrainedTokenizerBase
-
-from ..llmapi.llm import LLM as BaseLLM
-from ..llmapi.llm import TokenizerBase
-
-
-class LLM(BaseLLM):
-
-    def __init__(self,
-                 model: str,
-                 tokenizer: Optional[Union[str, Path, TokenizerBase,
-                                           PreTrainedTokenizerBase]] = None,
-                 tokenizer_mode: Literal['auto', 'slow'] = 'auto',
-                 skip_tokenizer_init: bool = False,
-                 trust_remote_code: bool = False,
-                 tensor_parallel_size: int = 1,
-                 dtype: str = "auto",
-                 revision: Optional[str] = None,
-                 tokenizer_revision: Optional[str] = None,
-                 **kwargs: Any):
-
-        kwargs_dict = dict(kwargs)
-        kwargs_dict['backend'] = 'pytorch'
-        super().__init__(model, tokenizer, tokenizer_mode, skip_tokenizer_init,
-                         trust_remote_code, tensor_parallel_size, dtype,
-                         revision, tokenizer_revision, **kwargs_dict)
+__all__ = ['LLM']
diff --git a/tensorrt_llm/llmapi/__init__.py b/tensorrt_llm/llmapi/__init__.py
@@ -2,7 +2,7 @@
 from ..executor import CompletionOutput, RequestError
 from ..sampling_params import GuidedDecodingParams, SamplingParams
 from .build_cache import BuildCacheConfig
-from .llm import LLM, RequestOutput
+from .llm import LLM, RequestOutput, _TorchLLM, _TrtLLM
 # yapf: disable
 from .llm_args import (BatchingType, CacheTransceiverConfig, CalibConfig,
                        CapacitySchedulerPolicy, ContextChunkingPolicy,
@@ -50,4 +50,6 @@
     'LlmArgs',
     'TorchLlmArgs',
     'TrtLlmArgs',
+    '_TrtLLM',
+    '_TorchLLM',
 ]
diff --git a/tensorrt_llm/llmapi/llm.py b/tensorrt_llm/llmapi/llm.py
@@ -31,8 +31,9 @@
                       create_input_processor_with_hash, prompt_inputs)
 from ..logger import logger
 from ..sampling_params import SamplingParams
-from .llm_args import (LLMARGS_EXPLICIT_DOCSTRING, PybindMirror, TorchLlmArgs,
-                       TrtLlmArgs, _AutoDeployLlmArgs)
+from .llm_args import (TORCH_LLMARGS_EXPLICIT_DOCSTRING,
+                       TRT_LLMARGS_EXPLICIT_DOCSTRING, PybindMirror,
+                       TorchLlmArgs, TrtLlmArgs, _AutoDeployLlmArgs)
 from .llm_utils import (CachedModelLoader, KvCacheRetentionConfig,
                         LlmBuildStats, ModelLoader, _ModelRuntimeContext)
 from .mpi_session import MpiPoolSession, external_mpi_comm_available
@@ -83,23 +84,26 @@ def _repr_fields(self):
         ]
 
 
-LLM_DOCSTRING = LLMARGS_EXPLICIT_DOCSTRING + """
-        kwargs (Any): Advanced arguments passed to `LlmArgs`.
+TRT_LLM_DOCSTRING = TRT_LLMARGS_EXPLICIT_DOCSTRING + """
 
     Attributes:
         tokenizer (tensorrt_llm.llmapi.tokenizer.TokenizerBase, optional): The tokenizer loaded by LLM instance, if any.
         workspace (pathlib.Path): The directory to store intermediate files.
         llm_id (str): The unique ID of the LLM instance.
 """
 
+TORCH_LLM_DOCSTRING = TORCH_LLMARGS_EXPLICIT_DOCSTRING + """
 
-@append_docstring(LLM_DOCSTRING)
-class LLM:
-    """LLM class is the main class for running a LLM model.
-
-    Parameters:
+    Attributes:
+        tokenizer (tensorrt_llm.llmapi.tokenizer.TokenizerBase, optional): The tokenizer loaded by LLM instance, if any.
 """
 
+
+class BaseLLM:
+    """
+    The base class for all LLM classes.
+    """
+
     def __init__(self,
                  model: Union[str, Path],
                  tokenizer: Optional[Union[str, Path, TokenizerBase,
@@ -186,6 +190,8 @@ def __init__(self,
             if self._on_trt_backend:
                 self._workspace = tempfile.TemporaryDirectory(
                     suffix="-llm-workspace", dir=self.args.workspace)
+            else:
+                self._workspace = None
 
             self._hf_model_dir: Optional[Path] = None
 
@@ -202,10 +208,6 @@ def __init__(self,
         exception_handler.register(self, 'shutdown')
         atexit.register(LLM._shutdown_wrapper, weakref.ref(self))
 
-    @property
-    def workspace(self) -> Path:
-        return Path(self._workspace.name) if self._on_trt_backend else None
-
     @property
     def llm_id(self) -> str:
         if self._llm_id is None:
@@ -584,7 +586,7 @@ def _check_arguments(self, prompt_len: int, query_len: int,
     def _build_model(self):
         model_loader = CachedModelLoader(self.args,
                                          mpi_session=self.mpi_session,
-                                         workspace=self.workspace,
+                                         workspace=self._workspace,
                                          llm_build_stats=weakref.proxy(
                                              self.llm_build_stats))
         self._engine_dir, self._hf_model_dir = model_loader()
@@ -766,6 +768,66 @@ def tokenizer(self) -> Optional[TokenizerBase]:
     def tokenizer(self, tokenizer: TokenizerBase):
         self._tokenizer = tokenizer
 
+    def shutdown(self) -> None:
+        if hasattr(self, "_executor") and self._executor is not None:
+            self._executor.shutdown()
+            self._executor = None
+
+        if hasattr(self, 'mpi_session') and self.mpi_session is not None:
+            self.mpi_session.shutdown()
+            self.mpi_session = None
+
+    @staticmethod
+    def _shutdown_wrapper(self_ref):
+        # Retrieve the instance if it still exists
+        instance = self_ref()
+        if instance is not None:
+            instance.shutdown()
+
+    def __enter__(self):
+        return self
+
+    def __exit__(self, exc_type, exc_value, traceback) -> bool:
+        del exc_value, traceback
+        self.shutdown()
+        return False  # propagate exceptions
+
+    def __getstate__(self):
+        raise RuntimeError("LLM object can not be pickled.")
+
+    def __del__(self):
+        self.shutdown()
+
+
+@append_docstring(TRT_LLM_DOCSTRING)
+class _TrtLLM(BaseLLM):
+    """LLM class is the main class for running a LLM model using TensorRT-LLM backend.
+
+    Parameters:
+"""
+
+    def __init__(self,
+                 model: Union[str, Path],
+                 tokenizer: Optional[Union[str, Path, TokenizerBase,
+                                           PreTrainedTokenizerBase]] = None,
+                 tokenizer_mode: Literal['auto', 'slow'] = 'auto',
+                 skip_tokenizer_init: bool = False,
+                 trust_remote_code: bool = False,
+                 tensor_parallel_size: int = 1,
+                 dtype: str = "auto",
+                 revision: Optional[str] = None,
+                 tokenizer_revision: Optional[str] = None,
+                 **kwargs: Any) -> None:
+        # TODO: deprecate backend in LLM kwargs
+
+        super().__init__(model, tokenizer, tokenizer_mode, skip_tokenizer_init,
+                         trust_remote_code, tensor_parallel_size, dtype,
+                         revision, tokenizer_revision, **kwargs)
+
+    @property
+    def workspace(self) -> Path:
+        return Path(self._workspace.name) if self._on_trt_backend else None
+
     def save(self, engine_dir: str) -> None:
         """Save the built engine to the given path.
 
@@ -791,32 +853,71 @@ def save(self, engine_dir: str) -> None:
                     f"Copying {file} to {target_engine_dir / file.name}\n")
                 shutil.copy(file, target_engine_dir / file.name)
 
-    def shutdown(self) -> None:
-        if hasattr(self, "_executor") and self._executor is not None:
-            self._executor.shutdown()
-            self._executor = None
 
-        if hasattr(self, 'mpi_session') and self.mpi_session is not None:
-            self.mpi_session.shutdown()
-            self.mpi_session = None
+@append_docstring(TORCH_LLM_DOCSTRING)
+class _TorchLLM(BaseLLM):
+    """LLM class is the main class for running a LLM model using PyTorch backend.
 
-    @staticmethod
-    def _shutdown_wrapper(self_ref):
-        # Retrieve the instance if it still exists
-        instance = self_ref()
-        if instance is not None:
-            instance.shutdown()
+    Parameters:
+"""
 
-    def __enter__(self):
-        return self
+    def __init__(self,
+                 model: Union[str, Path],
+                 tokenizer: Optional[Union[str, Path, TokenizerBase,
+                                           PreTrainedTokenizerBase]] = None,
+                 tokenizer_mode: Literal['auto', 'slow'] = 'auto',
+                 skip_tokenizer_init: bool = False,
+                 trust_remote_code: bool = False,
+                 tensor_parallel_size: int = 1,
+                 dtype: str = "auto",
+                 revision: Optional[str] = None,
+                 tokenizer_revision: Optional[str] = None,
+                 **kwargs: Any) -> None:
 
-    def __exit__(self, exc_type, exc_value, traceback) -> bool:
-        del exc_value, traceback
-        self.shutdown()
-        return False  # propagate exceptions
+        # TODO: deprecate backend in LLM kwargs
+        kwargs.pop("backend", None)
 
-    def __getstate__(self):
-        raise RuntimeError("LLM object can not be pickled.")
+        super().__init__(model,
+                         tokenizer,
+                         tokenizer_mode,
+                         skip_tokenizer_init,
+                         trust_remote_code,
+                         tensor_parallel_size,
+                         dtype,
+                         revision,
+                         tokenizer_revision,
+                         backend='pytorch',
+                         **kwargs)
 
-    def __del__(self):
-        self.shutdown()
+
+class LLM(_TrtLLM):
+
+    def __init__(self,
+                 model: Union[str, Path],
+                 tokenizer: Optional[Union[str, Path, TokenizerBase,
+                                           PreTrainedTokenizerBase]] = None,
+                 tokenizer_mode: Literal['auto', 'slow'] = 'auto',
+                 skip_tokenizer_init: bool = False,
+                 trust_remote_code: bool = False,
+                 tensor_parallel_size: int = 1,
+                 dtype: str = "auto",
+                 revision: Optional[str] = None,
+                 tokenizer_revision: Optional[str] = None,
+                 **kwargs: Any) -> None:
+        super().__init__(model, tokenizer, tokenizer_mode, skip_tokenizer_init,
+                         trust_remote_code, tensor_parallel_size, dtype,
+                         revision, tokenizer_revision, **kwargs)
+
+
+_LLM_REPR = "TrtLLM"
+
+# sphinx will ignore the LLM's docstring if it is not explicitly set
+LLM.__doc__ = \
+    f"""LLM class is the main class for running a LLM model.
+
+    This class is an alias of {_LLM_REPR}. You can switch between the TensorRT backend
+    and the PyTorch backend by setting the TLLM_USE_TRT_ENGINE environment to 1 or 0.
+    The default backend is the TensorRT backend.
+
+    Parameters:
+""" + TRT_LLM_DOCSTRING
diff --git a/tensorrt_llm/llmapi/llm_args.py b/tensorrt_llm/llmapi/llm_args.py
@@ -1593,9 +1593,6 @@ def validate_enable_build_cache(self):
 
 LlmArgs = TrtLlmArgs
 
-LLMARGS_EXPLICIT_DOCSTRING = generate_api_docs_as_docstring(LlmArgs,
-                                                            indent=' ' * 4)
-
 
 class LoadFormat(Enum):
     AUTO = 0
@@ -2068,3 +2065,10 @@ def get_model_format(model_dir: str) -> _ModelFormatKind:
         )
     else:
         return model_format
+
+
+TRT_LLMARGS_EXPLICIT_DOCSTRING = generate_api_docs_as_docstring(TrtLlmArgs,
+                                                                indent=' ' * 4)
+TORCH_LLMARGS_EXPLICIT_DOCSTRING = generate_api_docs_as_docstring(TorchLlmArgs,
+                                                                  indent=' ' *
+                                                                  4)