6 changes: 3 additions & 3 deletions docs/source/torch.md
@@ -11,7 +11,7 @@ The PyTorch backend of TensorRT-LLM is available in version 0.17 and later. You

## Quick Start

-Here is a simple example to show how to use `tensorrt_llm._torch.LLM` API with Llama model.
+Here is a simple example to show how to use `tensorrt_llm.LLM` API with Llama model.

```{literalinclude} ../../examples/pytorch/quickstart.py
:language: python
@@ -24,7 +24,7 @@ The PyTorch backend supports FP8 and NVFP4 quantization. You can pass quantized
which are generated by [TensorRT Model Optimizer](https://github.com/NVIDIA/TensorRT-Model-Optimizer).

```python
-from tensorrt_llm._torch import LLM
+from tensorrt_llm import LLM
llm = LLM(model='nvidia/Llama-3.1-8B-Instruct-FP8')
llm.generate("Hello, my name is")
```
@@ -44,7 +44,7 @@ The PyTorch backend supports most of the sampling features that are supported on
In order to use this feature, it is necessary to enable option `enable_trtllm_sampler` in the `LLM` class, and pass a `SamplingParams` object with the desired options as well. The following example prepares two identical prompts which will give different results due to the sampling parameters chosen:

```python
-from tensorrt_llm._torch import LLM
+from tensorrt_llm import LLM
llm = LLM(model='nvidia/Llama-3.1-8B-Instruct-FP8',
enable_trtllm_sampler=True)
sampling_params = SamplingParams(
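For reference, a minimal sketch of the updated snippet in full, since the hunk above truncates at `SamplingParams(`; the sampling values shown are illustrative assumptions, not part of this change:

```python
from tensorrt_llm import LLM, SamplingParams

llm = LLM(model='nvidia/Llama-3.1-8B-Instruct-FP8',
          enable_trtllm_sampler=True)

# Two identical prompts; non-greedy sampling makes their outputs diverge.
sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
outputs = llm.generate(["Hello, my name is", "Hello, my name is"],
                       sampling_params)
for output in outputs:
    print(output.outputs[0].text)
```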
2 changes: 1 addition & 1 deletion docs/source/torch/adding_new_model.md
@@ -186,7 +186,7 @@ __all__ = [
Alternatively, you can register the new model as an out-of-tree model, so that you can use the new model without touching the TensorRT-LLM codebase. To do so, place `modeling_mymodel.py` (and potentially `configuration_mymodel.py`) in your working directory, and import the modeling code in your script:

```python
-from tensorrt_llm._torch import LLM
+from tensorrt_llm import LLM
import modeling_mymodel

def main():
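The hunk cuts off at `def main():`; a sketch of how such an out-of-tree script typically continues, with the checkpoint path and generation call as assumptions:

```python
from tensorrt_llm import LLM

import modeling_mymodel  # importing this module registers the model class


def main():
    # Placeholder path; point this at your own model checkpoint.
    llm = LLM(model='path/to/mymodel_checkpoint')
    for output in llm.generate(["Hello, my name is"]):
        print(output.outputs[0].text)


if __name__ == '__main__':
    main()
```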
4 changes: 2 additions & 2 deletions docs/source/torch/arch_overview.md
@@ -5,10 +5,10 @@ Besides TensorRT, PyTorch can also serve as the backend for TensorRT-LLM. This d

## Top Level API

-The interface for PyTorch backend is `tensorrt._torch.LLM`.
+The interface for PyTorch backend is `tensorrt_llm.LLM`.

```python
-from tensorrt_llm._torch import LLM
+from tensorrt_llm import LLM
llm = LLM(model=<path_to_llama_from_hf>)
```

3 changes: 2 additions & 1 deletion examples/apps/chat.py
@@ -5,7 +5,8 @@
import colorama
from transformers import AutoTokenizer, PreTrainedTokenizer

-from tensorrt_llm.llmapi import LLM, BuildConfig, KvCacheConfig, SamplingParams
+from tensorrt_llm._tensorrt_engine import LLM
+from tensorrt_llm.llmapi import BuildConfig, KvCacheConfig, SamplingParams


class LlmConsole(code.InteractiveConsole):
3 changes: 2 additions & 1 deletion examples/apps/fastapi_server.py
@@ -18,8 +18,9 @@
from fastapi import FastAPI, Request
from fastapi.responses import JSONResponse, Response, StreamingResponse

+from tensorrt_llm._tensorrt_engine import LLM
from tensorrt_llm.executor import CppExecutorError, RequestError
-from tensorrt_llm.llmapi import LLM, BuildConfig, KvCacheConfig, SamplingParams
+from tensorrt_llm.llmapi import BuildConfig, KvCacheConfig, SamplingParams

TIMEOUT_KEEP_ALIVE = 5 # seconds.

3 changes: 2 additions & 1 deletion examples/auto_deploy/build_and_run_ad.py
@@ -7,11 +7,12 @@
import torch
from simple_config import SimpleConfig

+from tensorrt_llm._tensorrt_engine import LLM
from tensorrt_llm._torch.auto_deploy.models import ModelFactoryRegistry
from tensorrt_llm._torch.auto_deploy.shim import DemoLLM
from tensorrt_llm._torch.auto_deploy.utils.benchmark import benchmark, store_benchmark_results
from tensorrt_llm._torch.auto_deploy.utils.logger import ad_logger
-from tensorrt_llm.llmapi.llm import LLM, RequestOutput
+from tensorrt_llm.llmapi.llm import RequestOutput
from tensorrt_llm.llmapi.llm_args import TorchCompileConfig
from tensorrt_llm.sampling_params import SamplingParams

3 changes: 2 additions & 1 deletion examples/llm-api/llm_auto_parallel.py
@@ -1,5 +1,6 @@
### Automatic Parallelism with LLM
-from tensorrt_llm import LLM, SamplingParams
+from tensorrt_llm import SamplingParams
+from tensorrt_llm._tensorrt_engine import LLM


def main():
4 changes: 2 additions & 2 deletions examples/llm-api/llm_eagle2_decoding.py
@@ -1,7 +1,7 @@
### Generate Text Using Eagle2 Decoding

-from tensorrt_llm import LLM, SamplingParams
-from tensorrt_llm.llmapi import (LLM, EagleDecodingConfig, KvCacheConfig,
+from tensorrt_llm._tensorrt_engine import LLM
+from tensorrt_llm.llmapi import (EagleDecodingConfig, KvCacheConfig,
SamplingParams)


6 changes: 3 additions & 3 deletions examples/llm-api/llm_eagle_decoding.py
@@ -1,8 +1,8 @@
### Generate Text Using Eagle Decoding

-from tensorrt_llm import LLM, SamplingParams
-from tensorrt_llm.llmapi import (LLM, EagleDecodingConfig, KvCacheConfig,
-                                 SamplingParams)
+from tensorrt_llm import SamplingParams
+from tensorrt_llm._tensorrt_engine import LLM
+from tensorrt_llm.llmapi import EagleDecodingConfig, KvCacheConfig


def main():
3 changes: 2 additions & 1 deletion examples/llm-api/llm_guided_decoding.py
@@ -1,5 +1,6 @@
### Generate text with guided decoding
-from tensorrt_llm import LLM, SamplingParams
+from tensorrt_llm import SamplingParams
+from tensorrt_llm._tensorrt_engine import LLM
from tensorrt_llm.llmapi import GuidedDecodingParams


3 changes: 2 additions & 1 deletion examples/llm-api/llm_inference.py
@@ -1,7 +1,8 @@
### Generate text
import tempfile

-from tensorrt_llm import LLM, SamplingParams
+from tensorrt_llm import SamplingParams
+from tensorrt_llm._tensorrt_engine import LLM


def main():
3 changes: 2 additions & 1 deletion examples/llm-api/llm_inference_async.py
@@ -1,7 +1,8 @@
### Generate Text Asynchronously
import asyncio

-from tensorrt_llm import LLM, SamplingParams
+from tensorrt_llm import SamplingParams
+from tensorrt_llm._tensorrt_engine import LLM


def main():
3 changes: 2 additions & 1 deletion examples/llm-api/llm_inference_async_streaming.py
@@ -1,7 +1,8 @@
### Generate Text in Streaming
import asyncio

-from tensorrt_llm import LLM, SamplingParams
+from tensorrt_llm import SamplingParams
+from tensorrt_llm._tensorrt_engine import LLM


def main():
3 changes: 2 additions & 1 deletion examples/llm-api/llm_inference_customize.py
@@ -1,7 +1,8 @@
### Generate text with customization
import tempfile

-from tensorrt_llm.llmapi import LLM, BuildConfig, KvCacheConfig, SamplingParams
+from tensorrt_llm._tensorrt_engine import LLM
+from tensorrt_llm.llmapi import BuildConfig, KvCacheConfig, SamplingParams


def main():
3 changes: 2 additions & 1 deletion examples/llm-api/llm_inference_distributed.py
@@ -1,5 +1,6 @@
### Distributed LLM Generation
-from tensorrt_llm import LLM, SamplingParams
+from tensorrt_llm import SamplingParams
+from tensorrt_llm._tensorrt_engine import LLM


def main():
3 changes: 2 additions & 1 deletion examples/llm-api/llm_inference_kv_events.py
@@ -1,6 +1,7 @@
### Get KV Cache Events

-from tensorrt_llm import LLM, SamplingParams
+from tensorrt_llm import SamplingParams
+from tensorrt_llm._tensorrt_engine import LLM
from tensorrt_llm.llmapi import KvCacheConfig


2 changes: 1 addition & 1 deletion examples/llm-api/llm_logits_processor.py
@@ -3,7 +3,7 @@

import torch

-from tensorrt_llm import LLM
+from tensorrt_llm._tensorrt_engine import LLM
from tensorrt_llm.sampling_params import (BatchedLogitsProcessor,
LogitsProcessor, SamplingParams)

4 changes: 2 additions & 2 deletions examples/llm-api/llm_lookahead_decoding.py
@@ -1,6 +1,6 @@
### Generate Text Using Lookahead Decoding
-from tensorrt_llm import LLM, SamplingParams
-from tensorrt_llm.llmapi import (LLM, BuildConfig, KvCacheConfig,
+from tensorrt_llm._tensorrt_engine import LLM
+from tensorrt_llm.llmapi import (BuildConfig, KvCacheConfig,
LookaheadDecodingConfig, SamplingParams)


4 changes: 2 additions & 2 deletions examples/llm-api/llm_medusa_decoding.py
@@ -2,8 +2,8 @@
import argparse
from pathlib import Path

-from tensorrt_llm import LLM, SamplingParams
-from tensorrt_llm.llmapi import (LLM, BuildConfig, KvCacheConfig,
+from tensorrt_llm._tensorrt_engine import LLM
+from tensorrt_llm.llmapi import (BuildConfig, KvCacheConfig,
MedusaDecodingConfig, SamplingParams)


3 changes: 2 additions & 1 deletion examples/llm-api/llm_multilora.py
@@ -1,8 +1,9 @@
### Generate text with multiple LoRA adapters
from huggingface_hub import snapshot_download

-from tensorrt_llm import LLM, BuildConfig
+from tensorrt_llm._tensorrt_engine import LLM
from tensorrt_llm.executor import LoRARequest
+from tensorrt_llm.llmapi import BuildConfig
from tensorrt_llm.lora_manager import LoraConfig


3 changes: 2 additions & 1 deletion examples/llm-api/llm_quantization.py
@@ -3,7 +3,8 @@

import torch

-from tensorrt_llm import LLM, SamplingParams
+from tensorrt_llm import SamplingParams
+from tensorrt_llm._tensorrt_engine import LLM
from tensorrt_llm.llmapi import CalibConfig, QuantAlgo, QuantConfig

major, minor = torch.cuda.get_device_capability()
3 changes: 2 additions & 1 deletion examples/llm-api/quickstart_example.py
@@ -1,4 +1,5 @@
-from tensorrt_llm import LLM, SamplingParams
+from tensorrt_llm import SamplingParams
+from tensorrt_llm._tensorrt_engine import LLM


def main():
4 changes: 2 additions & 2 deletions examples/llm-eval/lm-eval-harness/lm_eval_tensorrt_llm.py
@@ -32,12 +32,12 @@
from tqdm import tqdm

import tensorrt_llm
-from tensorrt_llm._torch import LLM as TORCH_LLM
+from tensorrt_llm import LLM as TORCH_LLM
+from tensorrt_llm._tensorrt_engine import LLM as TRT_LLM
from tensorrt_llm._torch.pyexecutor.config import PyTorchConfig
from tensorrt_llm.bindings.executor import DecodingConfig
from tensorrt_llm.llmapi import KvCacheConfig as TRT_KvCacheConfig
from tensorrt_llm.llmapi import RequestOutput, SamplingParams
-from tensorrt_llm.llmapi.llm import LLM as TRT_LLM

logger = logging.getLogger(__name__)

2 changes: 1 addition & 1 deletion examples/pytorch/out_of_tree_example/main.py
@@ -1,6 +1,6 @@
import modeling_opt # noqa

-from tensorrt_llm._torch import LLM
+from tensorrt_llm import LLM


def main():
3 changes: 1 addition & 2 deletions examples/pytorch/quickstart.py
@@ -1,5 +1,4 @@
-from tensorrt_llm import SamplingParams
-from tensorrt_llm._torch import LLM
+from tensorrt_llm import LLM, SamplingParams


def main():
3 changes: 1 addition & 2 deletions examples/pytorch/quickstart_advanced.py
@@ -1,7 +1,6 @@
import argparse

-from tensorrt_llm import SamplingParams
-from tensorrt_llm._torch import LLM
+from tensorrt_llm import LLM, SamplingParams
from tensorrt_llm.llmapi import (DraftTargetDecodingConfig, EagleDecodingConfig,
KvCacheConfig, MTPDecodingConfig,
NGramDecodingConfig, TorchCompileConfig)
3 changes: 1 addition & 2 deletions examples/pytorch/star_attention.py
@@ -6,8 +6,7 @@

import torch

-from tensorrt_llm import SamplingParams
-from tensorrt_llm._torch import LLM
+from tensorrt_llm import LLM, SamplingParams
from tensorrt_llm.models.modeling_utils import QuantAlgo, QuantConfig


3 changes: 3 additions & 0 deletions tensorrt_llm/__init__.py
@@ -46,6 +46,7 @@ def _add_trt_llm_dll_directory():
from .disaggregated_params import DisaggregatedParams
from .functional import Tensor, constant
from .llmapi import LLM, LlmArgs
+from .llmapi.llm_args import LlmArgs, TorchLlmArgs, TrtLlmArgs
from .logger import logger
from .mapping import Mapping
from .models.automodel import AutoConfig, AutoModelForCausalLM
@@ -98,6 +99,8 @@ def _add_trt_llm_dll_directory():
'tools',
'LLM',
'LlmArgs',
+'TorchLlmArgs',
+'TrtLlmArgs',
'SamplingParams',
'DisaggregatedParams',
'KvCacheConfig',
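The net effect of this hunk is that the backend-specific argument classes join the package's public API; a tiny sketch, assuming nothing beyond the exports added above:

```python
from tensorrt_llm import LLM, LlmArgs, TorchLlmArgs, TrtLlmArgs

# TorchLlmArgs and TrtLlmArgs are now importable from the package root,
# alongside the existing LlmArgs and the (now PyTorch-backed) LLM class.
print(TorchLlmArgs.__module__)  # tensorrt_llm.llmapi.llm_args
```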
3 changes: 3 additions & 0 deletions tensorrt_llm/_tensorrt_engine/__init__.py
@@ -0,0 +1,3 @@
+from tensorrt_llm.llmapi.llm import _TrtLLM as LLM
+
+__all__ = ['LLM']
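A short sketch of the split this shim creates: the top-level `LLM` is the PyTorch-backend class, while the TensorRT-engine class stays reachable under an explicit import; the model id is a placeholder:

```python
from tensorrt_llm import LLM  # PyTorch backend (the new default)
from tensorrt_llm._tensorrt_engine import LLM as TrtLLM  # TensorRT engine, opt-in

# Placeholder model id, for illustration only.
pytorch_llm = LLM(model='meta-llama/Llama-3.1-8B-Instruct')
trt_llm = TrtLLM(model='meta-llama/Llama-3.1-8B-Instruct')
```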
3 changes: 2 additions & 1 deletion tensorrt_llm/_torch/__init__.py
@@ -1,3 +1,4 @@
from .llm import LLM
+from .model_config import MoeLoadBalancerConfig

__all__ = ["LLM"]
__all__ = ["LLM", "MoeLoadBalancerConfig"]
3 changes: 2 additions & 1 deletion tensorrt_llm/_torch/auto_deploy/shim/demollm.py
@@ -10,11 +10,12 @@
import torch.multiprocessing as mp
from transformers import PreTrainedTokenizerBase

+from ...._tensorrt_engine import LLM
from ....executor import GenerationExecutor
from ....executor.request import GenerationRequest
from ....executor.result import CompletionOutput, GenerationResult
from ....inputs.registry import create_input_processor
-from ....llmapi.llm import LLM, RequestOutput
+from ....llmapi.llm import RequestOutput
from ....llmapi.llm_args import _AutoDeployLlmArgs
from ....llmapi.tokenizer import TokenizerBase
from ....sampling_params import SamplingParams
12 changes: 11 additions & 1 deletion tensorrt_llm/_torch/llm.py
@@ -1,3 +1,13 @@
-from tensorrt_llm.llmapi.llm import _TorchLLM as LLM
+from tensorrt_llm.llmapi.llm import _TorchLLM

+
+class LLM(_TorchLLM):
+
+    def __init__(self, *args, **kwargs):
+        raise ImportError(
+            "_torch.llm is deprecated, please use `from tensorrt_llm import LLM` directly"
+        )
+
+
+# Keep the LLM class to guide the users to use the default LLM class
__all__ = ['LLM']
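In effect, the legacy module still imports, but constructing its `LLM` fails fast with a migration hint; a sketch of what a caller now sees:

```python
# Old path: the import itself succeeds, but instantiation raises.
try:
    from tensorrt_llm._torch.llm import LLM as DeprecatedLLM
    DeprecatedLLM()
except ImportError as err:
    print(err)  # suggests `from tensorrt_llm import LLM` instead

# New path: the top-level export is the PyTorch-backend LLM.
from tensorrt_llm import LLM
```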
3 changes: 2 additions & 1 deletion tensorrt_llm/bench/benchmark/low_latency.py
@@ -10,13 +10,14 @@
from click_option_group import (MutuallyExclusiveOptionGroup, OptionGroup,
optgroup)

+from tensorrt_llm._tensorrt_engine import LLM
from tensorrt_llm.bench.benchmark.utils.asynchronous import async_benchmark
from tensorrt_llm.bench.benchmark.utils.general import generate_warmup_dataset
from tensorrt_llm.bench.benchmark.utils.processes import IterationWriter
from tensorrt_llm.bench.dataclasses.configuration import RuntimeConfig
from tensorrt_llm.bench.dataclasses.general import BenchmarkEnvironment
from tensorrt_llm.bench.dataclasses.reporting import ReportUtility
-from tensorrt_llm.llmapi import LLM, CapacitySchedulerPolicy
+from tensorrt_llm.llmapi import CapacitySchedulerPolicy
from tensorrt_llm.models.modeling_utils import SpeculativeDecodingMode

# isort: off
5 changes: 3 additions & 2 deletions tensorrt_llm/bench/benchmark/throughput.py
@@ -17,15 +17,16 @@
from tensorrt_llm.bench.benchmark.utils.general import (
get_settings_from_engine, get_settings)
# isort: on
-from tensorrt_llm._torch.llm import LLM as PyTorchLLM
+from tensorrt_llm import LLM as PyTorchLLM
+from tensorrt_llm._tensorrt_engine import LLM
from tensorrt_llm.bench.benchmark.utils.general import generate_warmup_dataset
from tensorrt_llm.bench.dataclasses.configuration import RuntimeConfig
from tensorrt_llm.bench.dataclasses.general import BenchmarkEnvironment
from tensorrt_llm.bench.dataclasses.reporting import ReportUtility
from tensorrt_llm.bench.utils.data import (create_dataset_from_stream,
initialize_tokenizer,
update_metadata_for_multimodal)
-from tensorrt_llm.llmapi import LLM, CapacitySchedulerPolicy
+from tensorrt_llm.llmapi import CapacitySchedulerPolicy
from tensorrt_llm.logger import logger
from tensorrt_llm.sampling_params import SamplingParams

3 changes: 2 additions & 1 deletion tensorrt_llm/bench/benchmark/utils/asynchronous.py
@@ -9,7 +9,8 @@
from zmq import PUSH
from zmq.asyncio import Context

-from tensorrt_llm import LLM, SamplingParams
+from tensorrt_llm import SamplingParams
+from tensorrt_llm._tensorrt_engine import LLM
from tensorrt_llm.bench.dataclasses.general import InferenceRequest
from tensorrt_llm.bench.dataclasses.reporting import PerfItemTuple, StatsKeeper
from tensorrt_llm.executor.postproc_worker import PostprocParams
2 changes: 1 addition & 1 deletion tensorrt_llm/bench/build/build.py
@@ -9,7 +9,7 @@
from tensorrt_llm.bench.utils.data import create_dataset_from_stream, initialize_tokenizer
from tensorrt_llm.bench.utils import VALID_QUANT_ALGOS
from tensorrt_llm.builder import BuildConfig
-from tensorrt_llm.llmapi import LLM
+from tensorrt_llm._tensorrt_engine import LLM
from tensorrt_llm.llmapi.llm_utils import QuantConfig
from tensorrt_llm.logger import logger
from tensorrt_llm.quantization.mode import QuantAlgo