From 952e012cd12d00bbeff4d8ef101bdefc07a30f43 Mon Sep 17 00:00:00 2001 From: Superjomn <328693+Superjomn@users.noreply.github.com> Date: Wed, 18 Jun 2025 07:49:09 +0000 Subject: [PATCH 1/2] make PyT default Signed-off-by: Superjomn <328693+Superjomn@users.noreply.github.com> --- examples/apps/chat.py | 3 ++- examples/apps/fastapi_server.py | 3 ++- examples/auto_deploy/build_and_run_ad.py | 3 ++- examples/llm-api/llm_auto_parallel.py | 3 ++- examples/llm-api/llm_eagle2_decoding.py | 4 ++-- examples/llm-api/llm_eagle_decoding.py | 6 +++--- examples/llm-api/llm_guided_decoding.py | 3 ++- examples/llm-api/llm_inference.py | 3 ++- examples/llm-api/llm_inference_async.py | 3 ++- examples/llm-api/llm_inference_async_streaming.py | 3 ++- examples/llm-api/llm_inference_customize.py | 3 ++- examples/llm-api/llm_inference_distributed.py | 3 ++- examples/llm-api/llm_inference_kv_events.py | 3 ++- examples/llm-api/llm_logits_processor.py | 2 +- examples/llm-api/llm_lookahead_decoding.py | 4 ++-- examples/llm-api/llm_medusa_decoding.py | 4 ++-- examples/llm-api/llm_multilora.py | 3 ++- examples/llm-api/llm_quantization.py | 3 ++- examples/llm-api/quickstart_example.py | 3 ++- .../llm-eval/lm-eval-harness/lm_eval_tensorrt_llm.py | 4 ++-- tensorrt_llm/__init__.py | 3 +++ tensorrt_llm/_tensorrt_engine/__init__.py | 3 +++ tensorrt_llm/_torch/auto_deploy/shim/demollm.py | 3 ++- tensorrt_llm/_torch/llm.py | 12 +++++++++++- tensorrt_llm/bench/benchmark/low_latency.py | 3 ++- tensorrt_llm/bench/benchmark/throughput.py | 5 +++-- tensorrt_llm/bench/benchmark/utils/asynchronous.py | 3 ++- tensorrt_llm/commands/eval.py | 5 +++-- tensorrt_llm/commands/serve.py | 5 +++-- tensorrt_llm/evaluate/cnn_dailymail.py | 5 +++-- tensorrt_llm/evaluate/json_mode_eval.py | 5 +++-- tensorrt_llm/evaluate/lm_eval.py | 5 +++-- tensorrt_llm/evaluate/mmlu.py | 5 +++-- tensorrt_llm/llmapi/__init__.py | 4 +--- tensorrt_llm/llmapi/llm.py | 10 ++++------ tensorrt_llm/scaffolding/worker.py | 2 +- tensorrt_llm/serve/openai_server.py | 2 +- tests/integration/defs/accuracy/accuracy_core.py | 5 +++-- tests/integration/defs/accuracy/test_llm_api.py | 3 ++- .../defs/examples/run_llm_fp8_quant_llama_70b.py | 3 ++- .../defs/examples/run_llm_quickstart_atexit.py | 3 ++- tests/integration/defs/llmapi/_run_llmapi_llm.py | 3 ++- tests/integration/defs/llmapi/test_llm_e2e.py | 2 +- .../unit/singlegpu/shim/test_llm_config.py | 2 +- tests/unittest/api_stability/api_stability_core.py | 3 ++- tests/unittest/api_stability/test_llm_api.py | 3 ++- tests/unittest/llmapi/apps/_test_openai_metrics.py | 2 +- .../unittest/llmapi/apps/_test_openai_multi_chat.py | 2 +- tests/unittest/llmapi/run_llm.py | 3 ++- tests/unittest/llmapi/run_llm_with_postproc.py | 3 ++- tests/unittest/llmapi/test_executor.py | 3 ++- tests/unittest/llmapi/test_llm.py | 3 ++- tests/unittest/llmapi/test_llm_args.py | 2 +- tests/unittest/llmapi/test_llm_download.py | 2 +- tests/unittest/llmapi/test_llm_kv_cache_events.py | 3 ++- tests/unittest/llmapi/test_llm_multi_gpu.py | 3 ++- tests/unittest/llmapi/test_llm_quant.py | 3 ++- .../all_models/llmapi/tensorrt_llm/1/model.py | 4 ++-- 58 files changed, 128 insertions(+), 78 deletions(-) create mode 100644 tensorrt_llm/_tensorrt_engine/__init__.py diff --git a/examples/apps/chat.py b/examples/apps/chat.py index 855443f6f7e..620a3e95b77 100755 --- a/examples/apps/chat.py +++ b/examples/apps/chat.py @@ -5,7 +5,8 @@ import colorama from transformers import AutoTokenizer, PreTrainedTokenizer -from tensorrt_llm.llmapi import LLM, BuildConfig, 
KvCacheConfig, SamplingParams +from tensorrt_llm._tensorrt_engine import LLM +from tensorrt_llm.llmapi import BuildConfig, KvCacheConfig, SamplingParams class LlmConsole(code.InteractiveConsole): diff --git a/examples/apps/fastapi_server.py b/examples/apps/fastapi_server.py index 7f448d4685c..b2aa0baf2ab 100755 --- a/examples/apps/fastapi_server.py +++ b/examples/apps/fastapi_server.py @@ -18,8 +18,9 @@ from fastapi import FastAPI, Request from fastapi.responses import JSONResponse, Response, StreamingResponse +from tensorrt_llm._tensorrt_engine import LLM from tensorrt_llm.executor import CppExecutorError, RequestError -from tensorrt_llm.llmapi import LLM, BuildConfig, KvCacheConfig, SamplingParams +from tensorrt_llm.llmapi import BuildConfig, KvCacheConfig, SamplingParams TIMEOUT_KEEP_ALIVE = 5 # seconds. diff --git a/examples/auto_deploy/build_and_run_ad.py b/examples/auto_deploy/build_and_run_ad.py index 882681d2bf9..e257aec5015 100644 --- a/examples/auto_deploy/build_and_run_ad.py +++ b/examples/auto_deploy/build_and_run_ad.py @@ -7,11 +7,12 @@ import torch from simple_config import SimpleConfig +from tensorrt_llm._tensorrt_engine import LLM from tensorrt_llm._torch.auto_deploy.models import ModelFactoryRegistry from tensorrt_llm._torch.auto_deploy.shim import DemoLLM from tensorrt_llm._torch.auto_deploy.utils.benchmark import benchmark, store_benchmark_results from tensorrt_llm._torch.auto_deploy.utils.logger import ad_logger -from tensorrt_llm.llmapi.llm import LLM, RequestOutput +from tensorrt_llm.llmapi.llm import RequestOutput from tensorrt_llm.llmapi.llm_args import TorchCompileConfig from tensorrt_llm.sampling_params import SamplingParams diff --git a/examples/llm-api/llm_auto_parallel.py b/examples/llm-api/llm_auto_parallel.py index 5d1ce835127..be496b9bbe9 100644 --- a/examples/llm-api/llm_auto_parallel.py +++ b/examples/llm-api/llm_auto_parallel.py @@ -1,5 +1,6 @@ ### Automatic Parallelism with LLM -from tensorrt_llm import LLM, SamplingParams +from tensorrt_llm import SamplingParams +from tensorrt_llm._tensorrt_engine import LLM def main(): diff --git a/examples/llm-api/llm_eagle2_decoding.py b/examples/llm-api/llm_eagle2_decoding.py index 2e53a9b88fc..6a84d927092 100755 --- a/examples/llm-api/llm_eagle2_decoding.py +++ b/examples/llm-api/llm_eagle2_decoding.py @@ -1,7 +1,7 @@ ### Generate Text Using Eagle2 Decoding -from tensorrt_llm import LLM, SamplingParams -from tensorrt_llm.llmapi import (LLM, EagleDecodingConfig, KvCacheConfig, +from tensorrt_llm._tensorrt_engine import LLM +from tensorrt_llm.llmapi import (EagleDecodingConfig, KvCacheConfig, SamplingParams) diff --git a/examples/llm-api/llm_eagle_decoding.py b/examples/llm-api/llm_eagle_decoding.py index 80e3d8f7203..87a113411a0 100644 --- a/examples/llm-api/llm_eagle_decoding.py +++ b/examples/llm-api/llm_eagle_decoding.py @@ -1,8 +1,8 @@ ### Generate Text Using Eagle Decoding -from tensorrt_llm import LLM, SamplingParams -from tensorrt_llm.llmapi import (LLM, EagleDecodingConfig, KvCacheConfig, - SamplingParams) +from tensorrt_llm import SamplingParams +from tensorrt_llm._tensorrt_engine import LLM +from tensorrt_llm.llmapi import EagleDecodingConfig, KvCacheConfig def main(): diff --git a/examples/llm-api/llm_guided_decoding.py b/examples/llm-api/llm_guided_decoding.py index 6719a3d1e4d..1138c63f969 100644 --- a/examples/llm-api/llm_guided_decoding.py +++ b/examples/llm-api/llm_guided_decoding.py @@ -1,5 +1,6 @@ ### Generate text with guided decoding -from tensorrt_llm import LLM, SamplingParams +from 
tensorrt_llm import SamplingParams +from tensorrt_llm._tensorrt_engine import LLM from tensorrt_llm.llmapi import GuidedDecodingParams diff --git a/examples/llm-api/llm_inference.py b/examples/llm-api/llm_inference.py index 7eb71256406..a5335e7a375 100644 --- a/examples/llm-api/llm_inference.py +++ b/examples/llm-api/llm_inference.py @@ -1,7 +1,8 @@ ### Generate text import tempfile -from tensorrt_llm import LLM, SamplingParams +from tensorrt_llm import SamplingParams +from tensorrt_llm._tensorrt_engine import LLM def main(): diff --git a/examples/llm-api/llm_inference_async.py b/examples/llm-api/llm_inference_async.py index f025d3a1553..f7c85189ada 100644 --- a/examples/llm-api/llm_inference_async.py +++ b/examples/llm-api/llm_inference_async.py @@ -1,7 +1,8 @@ ### Generate Text Asynchronously import asyncio -from tensorrt_llm import LLM, SamplingParams +from tensorrt_llm import SamplingParams +from tensorrt_llm._tensorrt_engine import LLM def main(): diff --git a/examples/llm-api/llm_inference_async_streaming.py b/examples/llm-api/llm_inference_async_streaming.py index 00de565dbbd..c05a231b26d 100644 --- a/examples/llm-api/llm_inference_async_streaming.py +++ b/examples/llm-api/llm_inference_async_streaming.py @@ -1,7 +1,8 @@ ### Generate Text in Streaming import asyncio -from tensorrt_llm import LLM, SamplingParams +from tensorrt_llm import SamplingParams +from tensorrt_llm._tensorrt_engine import LLM def main(): diff --git a/examples/llm-api/llm_inference_customize.py b/examples/llm-api/llm_inference_customize.py index d6b17e51ea1..c18dcc4de14 100644 --- a/examples/llm-api/llm_inference_customize.py +++ b/examples/llm-api/llm_inference_customize.py @@ -1,7 +1,8 @@ ### Generate text with customization import tempfile -from tensorrt_llm.llmapi import LLM, BuildConfig, KvCacheConfig, SamplingParams +from tensorrt_llm._tensorrt_engine import LLM +from tensorrt_llm.llmapi import BuildConfig, KvCacheConfig, SamplingParams def main(): diff --git a/examples/llm-api/llm_inference_distributed.py b/examples/llm-api/llm_inference_distributed.py index 5ad9c98117c..cfdccf774ec 100644 --- a/examples/llm-api/llm_inference_distributed.py +++ b/examples/llm-api/llm_inference_distributed.py @@ -1,5 +1,6 @@ ### Distributed LLM Generation -from tensorrt_llm import LLM, SamplingParams +from tensorrt_llm import SamplingParams +from tensorrt_llm._tensorrt_engine import LLM def main(): diff --git a/examples/llm-api/llm_inference_kv_events.py b/examples/llm-api/llm_inference_kv_events.py index 009b7e0dee7..ff8d36e18cc 100644 --- a/examples/llm-api/llm_inference_kv_events.py +++ b/examples/llm-api/llm_inference_kv_events.py @@ -1,6 +1,7 @@ ### Get KV Cache Events -from tensorrt_llm import LLM, SamplingParams +from tensorrt_llm import SamplingParams +from tensorrt_llm._tensorrt_engine import LLM from tensorrt_llm.llmapi import KvCacheConfig diff --git a/examples/llm-api/llm_logits_processor.py b/examples/llm-api/llm_logits_processor.py index 516cd507107..96c81802b19 100644 --- a/examples/llm-api/llm_logits_processor.py +++ b/examples/llm-api/llm_logits_processor.py @@ -3,7 +3,7 @@ import torch -from tensorrt_llm import LLM +from tensorrt_llm._tensorrt_engine import LLM from tensorrt_llm.sampling_params import (BatchedLogitsProcessor, LogitsProcessor, SamplingParams) diff --git a/examples/llm-api/llm_lookahead_decoding.py b/examples/llm-api/llm_lookahead_decoding.py index cb5c7bb75de..ed2c94450dd 100644 --- a/examples/llm-api/llm_lookahead_decoding.py +++ b/examples/llm-api/llm_lookahead_decoding.py @@ -1,6 +1,6 
@@ ### Generate Text Using Lookahead Decoding -from tensorrt_llm import LLM, SamplingParams -from tensorrt_llm.llmapi import (LLM, BuildConfig, KvCacheConfig, +from tensorrt_llm._tensorrt_engine import LLM +from tensorrt_llm.llmapi import (BuildConfig, KvCacheConfig, LookaheadDecodingConfig, SamplingParams) diff --git a/examples/llm-api/llm_medusa_decoding.py b/examples/llm-api/llm_medusa_decoding.py index 6b4ba46434b..e7594613ec4 100644 --- a/examples/llm-api/llm_medusa_decoding.py +++ b/examples/llm-api/llm_medusa_decoding.py @@ -2,8 +2,8 @@ import argparse from pathlib import Path -from tensorrt_llm import LLM, SamplingParams -from tensorrt_llm.llmapi import (LLM, BuildConfig, KvCacheConfig, +from tensorrt_llm._tensorrt_engine import LLM +from tensorrt_llm.llmapi import (BuildConfig, KvCacheConfig, MedusaDecodingConfig, SamplingParams) diff --git a/examples/llm-api/llm_multilora.py b/examples/llm-api/llm_multilora.py index e3cfe0fe142..00eed1fe028 100644 --- a/examples/llm-api/llm_multilora.py +++ b/examples/llm-api/llm_multilora.py @@ -1,8 +1,9 @@ ### Generate text with multiple LoRA adapters from huggingface_hub import snapshot_download -from tensorrt_llm import LLM, BuildConfig +from tensorrt_llm._tensorrt_engine import LLM from tensorrt_llm.executor import LoRARequest +from tensorrt_llm.llmapi import BuildConfig from tensorrt_llm.lora_manager import LoraConfig diff --git a/examples/llm-api/llm_quantization.py b/examples/llm-api/llm_quantization.py index 99dbe8cd60a..24bcfa60e07 100644 --- a/examples/llm-api/llm_quantization.py +++ b/examples/llm-api/llm_quantization.py @@ -3,7 +3,8 @@ import torch -from tensorrt_llm import LLM, SamplingParams +from tensorrt_llm import SamplingParams +from tensorrt_llm._tensorrt_engine import LLM from tensorrt_llm.llmapi import CalibConfig, QuantAlgo, QuantConfig major, minor = torch.cuda.get_device_capability() diff --git a/examples/llm-api/quickstart_example.py b/examples/llm-api/quickstart_example.py index 8dff6a47049..4c45eec83ed 100644 --- a/examples/llm-api/quickstart_example.py +++ b/examples/llm-api/quickstart_example.py @@ -1,4 +1,5 @@ -from tensorrt_llm import LLM, SamplingParams +from tensorrt_llm import SamplingParams +from tensorrt_llm._tensorrt_engine import LLM def main(): diff --git a/examples/llm-eval/lm-eval-harness/lm_eval_tensorrt_llm.py b/examples/llm-eval/lm-eval-harness/lm_eval_tensorrt_llm.py index 715bdd116f0..d593f3380f2 100644 --- a/examples/llm-eval/lm-eval-harness/lm_eval_tensorrt_llm.py +++ b/examples/llm-eval/lm-eval-harness/lm_eval_tensorrt_llm.py @@ -32,12 +32,12 @@ from tqdm import tqdm import tensorrt_llm -from tensorrt_llm._torch import LLM as TORCH_LLM +from tensorrt_llm import LLM as TORCH_LLM +from tensorrt_llm._tensorrt_engine import LLM as TRT_LLM from tensorrt_llm._torch.pyexecutor.config import PyTorchConfig from tensorrt_llm.bindings.executor import DecodingConfig from tensorrt_llm.llmapi import KvCacheConfig as TRT_KvCacheConfig from tensorrt_llm.llmapi import RequestOutput, SamplingParams -from tensorrt_llm.llmapi.llm import LLM as TRT_LLM logger = logging.getLogger(__name__) diff --git a/tensorrt_llm/__init__.py b/tensorrt_llm/__init__.py index 9c59d5bee25..01589ebc7e3 100644 --- a/tensorrt_llm/__init__.py +++ b/tensorrt_llm/__init__.py @@ -46,6 +46,7 @@ def _add_trt_llm_dll_directory(): from .disaggregated_params import DisaggregatedParams from .functional import Tensor, constant from .llmapi import LLM, LlmArgs +from .llmapi.llm_args import LlmArgs, TorchLlmArgs, TrtLlmArgs from .logger import logger 
from .mapping import Mapping from .models.automodel import AutoConfig, AutoModelForCausalLM @@ -98,6 +99,8 @@ def _add_trt_llm_dll_directory(): 'tools', 'LLM', 'LlmArgs', + 'TorchLlmArgs', + 'TrtLlmArgs', 'SamplingParams', 'DisaggregatedParams', 'KvCacheConfig', diff --git a/tensorrt_llm/_tensorrt_engine/__init__.py b/tensorrt_llm/_tensorrt_engine/__init__.py new file mode 100644 index 00000000000..39669a168fd --- /dev/null +++ b/tensorrt_llm/_tensorrt_engine/__init__.py @@ -0,0 +1,3 @@ +from tensorrt_llm.llmapi.llm import _TrtLLM as LLM + +__all__ = ['LLM'] diff --git a/tensorrt_llm/_torch/auto_deploy/shim/demollm.py b/tensorrt_llm/_torch/auto_deploy/shim/demollm.py index 791b06761b7..13c9a7374b0 100644 --- a/tensorrt_llm/_torch/auto_deploy/shim/demollm.py +++ b/tensorrt_llm/_torch/auto_deploy/shim/demollm.py @@ -10,11 +10,12 @@ import torch.multiprocessing as mp from transformers import PreTrainedTokenizerBase +from ...._tensorrt_engine import LLM from ....executor import GenerationExecutor from ....executor.request import GenerationRequest from ....executor.result import CompletionOutput, GenerationResult from ....inputs.registry import create_input_processor -from ....llmapi.llm import LLM, RequestOutput +from ....llmapi.llm import RequestOutput from ....llmapi.llm_args import _AutoDeployLlmArgs from ....llmapi.tokenizer import TokenizerBase from ....sampling_params import SamplingParams diff --git a/tensorrt_llm/_torch/llm.py b/tensorrt_llm/_torch/llm.py index 61b4d55b0b2..2fb41570b34 100644 --- a/tensorrt_llm/_torch/llm.py +++ b/tensorrt_llm/_torch/llm.py @@ -1,3 +1,13 @@ -from tensorrt_llm.llmapi.llm import _TorchLLM as LLM +from tensorrt_llm.llmapi.llm import _TorchLLM + +class LLM(_TorchLLM): + + def __init__(self, *args, **kwargs): + raise ImportError( + "_torch.llm is deprecated, please use `from tensorrt_llm import LLM` directly" + ) + + +# Keep the LLM class to guide the users to use the default LLM class __all__ = ['LLM'] diff --git a/tensorrt_llm/bench/benchmark/low_latency.py b/tensorrt_llm/bench/benchmark/low_latency.py index f9299a7de36..490ac62f4f5 100644 --- a/tensorrt_llm/bench/benchmark/low_latency.py +++ b/tensorrt_llm/bench/benchmark/low_latency.py @@ -10,13 +10,14 @@ from click_option_group import (MutuallyExclusiveOptionGroup, OptionGroup, optgroup) +from tensorrt_llm._tensorrt_engine import LLM from tensorrt_llm.bench.benchmark.utils.asynchronous import async_benchmark from tensorrt_llm.bench.benchmark.utils.general import generate_warmup_dataset from tensorrt_llm.bench.benchmark.utils.processes import IterationWriter from tensorrt_llm.bench.dataclasses.configuration import RuntimeConfig from tensorrt_llm.bench.dataclasses.general import BenchmarkEnvironment from tensorrt_llm.bench.dataclasses.reporting import ReportUtility -from tensorrt_llm.llmapi import LLM, CapacitySchedulerPolicy +from tensorrt_llm.llmapi import CapacitySchedulerPolicy from tensorrt_llm.models.modeling_utils import SpeculativeDecodingMode # isort: off diff --git a/tensorrt_llm/bench/benchmark/throughput.py b/tensorrt_llm/bench/benchmark/throughput.py index e63aa25662a..fd9ad5016e0 100755 --- a/tensorrt_llm/bench/benchmark/throughput.py +++ b/tensorrt_llm/bench/benchmark/throughput.py @@ -17,7 +17,8 @@ from tensorrt_llm.bench.benchmark.utils.general import ( get_settings_from_engine, get_settings) # isort: on -from tensorrt_llm._torch.llm import LLM as PyTorchLLM +from tensorrt_llm import LLM as PyTorchLLM +from tensorrt_llm._tensorrt_engine import LLM from 
tensorrt_llm.bench.benchmark.utils.general import generate_warmup_dataset from tensorrt_llm.bench.dataclasses.configuration import RuntimeConfig from tensorrt_llm.bench.dataclasses.general import BenchmarkEnvironment @@ -25,7 +26,7 @@ from tensorrt_llm.bench.utils.data import (create_dataset_from_stream, initialize_tokenizer, update_metadata_for_multimodal) -from tensorrt_llm.llmapi import LLM, CapacitySchedulerPolicy +from tensorrt_llm.llmapi import CapacitySchedulerPolicy from tensorrt_llm.logger import logger from tensorrt_llm.sampling_params import SamplingParams diff --git a/tensorrt_llm/bench/benchmark/utils/asynchronous.py b/tensorrt_llm/bench/benchmark/utils/asynchronous.py index af6ae18ab5a..99fd06e4f39 100644 --- a/tensorrt_llm/bench/benchmark/utils/asynchronous.py +++ b/tensorrt_llm/bench/benchmark/utils/asynchronous.py @@ -9,7 +9,8 @@ from zmq import PUSH from zmq.asyncio import Context -from tensorrt_llm import LLM, SamplingParams +from tensorrt_llm import SamplingParams +from tensorrt_llm._tensorrt_engine import LLM from tensorrt_llm.bench.dataclasses.general import InferenceRequest from tensorrt_llm.bench.dataclasses.reporting import PerfItemTuple, StatsKeeper from tensorrt_llm.executor.postproc_worker import PostprocParams diff --git a/tensorrt_llm/commands/eval.py b/tensorrt_llm/commands/eval.py index 0c7e6956c78..eff80d1a69d 100644 --- a/tensorrt_llm/commands/eval.py +++ b/tensorrt_llm/commands/eval.py @@ -18,10 +18,11 @@ import tensorrt_llm.profiler as profiler -from .._torch.llm import LLM as PyTorchLLM +from .. import LLM as PyTorchLLM +from .._tensorrt_engine import LLM from ..evaluate import (GSM8K, MMLU, CnnDailymail, GPQADiamond, GPQAExtended, GPQAMain, JsonModeEval) -from ..llmapi import LLM, BuildConfig, KvCacheConfig +from ..llmapi import BuildConfig, KvCacheConfig from ..llmapi.llm_utils import update_llm_args_with_extra_options from ..logger import logger, severity_map diff --git a/tensorrt_llm/commands/serve.py b/tensorrt_llm/commands/serve.py index 6a970f93941..ddbcba2a115 100644 --- a/tensorrt_llm/commands/serve.py +++ b/tensorrt_llm/commands/serve.py @@ -11,10 +11,11 @@ from strenum import StrEnum from torch.cuda import device_count -from tensorrt_llm._torch.llm import LLM as PyTorchLLM +from tensorrt_llm import LLM as PyTorchLLM +from tensorrt_llm._tensorrt_engine import LLM from tensorrt_llm._utils import mpi_rank from tensorrt_llm.executor.utils import LlmLauncherEnvs -from tensorrt_llm.llmapi import (LLM, BuildConfig, CapacitySchedulerPolicy, +from tensorrt_llm.llmapi import (BuildConfig, CapacitySchedulerPolicy, DynamicBatchConfig, KvCacheConfig, SchedulerConfig) from tensorrt_llm.llmapi.disagg_utils import (CtxGenServerConfig, diff --git a/tensorrt_llm/evaluate/cnn_dailymail.py b/tensorrt_llm/evaluate/cnn_dailymail.py index e2dfe3056f1..a5bb14eadaa 100644 --- a/tensorrt_llm/evaluate/cnn_dailymail.py +++ b/tensorrt_llm/evaluate/cnn_dailymail.py @@ -18,8 +18,9 @@ import datasets import evaluate -from .._torch import LLM as PyTorchLLM -from ..llmapi import LLM, RequestOutput +from .. 
import LLM as PyTorchLLM +from .._tensorrt_engine import LLM +from ..llmapi import RequestOutput from ..logger import logger from ..sampling_params import SamplingParams from .interface import Evaluator diff --git a/tensorrt_llm/evaluate/json_mode_eval.py b/tensorrt_llm/evaluate/json_mode_eval.py index cd3b8a586ec..69c41699cd1 100644 --- a/tensorrt_llm/evaluate/json_mode_eval.py +++ b/tensorrt_llm/evaluate/json_mode_eval.py @@ -19,8 +19,9 @@ import datasets import numpy as np -from .._torch import LLM as PyTorchLLM -from ..llmapi import LLM, RequestOutput +from .. import LLM as PyTorchLLM +from .._tensorrt_engine import LLM +from ..llmapi import RequestOutput from ..logger import logger from ..sampling_params import GuidedDecodingParams, SamplingParams from .interface import Evaluator diff --git a/tensorrt_llm/evaluate/lm_eval.py b/tensorrt_llm/evaluate/lm_eval.py index 3ec1be52399..bdddbcbb736 100644 --- a/tensorrt_llm/evaluate/lm_eval.py +++ b/tensorrt_llm/evaluate/lm_eval.py @@ -28,8 +28,9 @@ except ImportError: TemplateLM = object -from .._torch import LLM as PyTorchLLM -from ..llmapi import LLM, RequestOutput +from .. import LLM as PyTorchLLM +from .._tensorrt_engine import LLM +from ..llmapi import RequestOutput from ..logger import logger from ..sampling_params import SamplingParams from .interface import Evaluator diff --git a/tensorrt_llm/evaluate/mmlu.py b/tensorrt_llm/evaluate/mmlu.py index 0e6df91df09..92d7ae1171a 100644 --- a/tensorrt_llm/evaluate/mmlu.py +++ b/tensorrt_llm/evaluate/mmlu.py @@ -40,8 +40,9 @@ import numpy as np import pandas as pd -from .._torch import LLM as PyTorchLLM -from ..llmapi import LLM, RequestOutput +from .. import LLM as PyTorchLLM +from .._tensorrt_engine import LLM +from ..llmapi import RequestOutput from ..logger import logger from ..sampling_params import SamplingParams from .interface import Evaluator diff --git a/tensorrt_llm/llmapi/__init__.py b/tensorrt_llm/llmapi/__init__.py index eb906e97668..2fe491db0ab 100644 --- a/tensorrt_llm/llmapi/__init__.py +++ b/tensorrt_llm/llmapi/__init__.py @@ -2,7 +2,7 @@ from ..executor import CompletionOutput, RequestError from ..sampling_params import GuidedDecodingParams, SamplingParams from .build_cache import BuildCacheConfig -from .llm import LLM, RequestOutput, _TorchLLM, _TrtLLM +from .llm import LLM, RequestOutput # yapf: disable from .llm_args import (BatchingType, CacheTransceiverConfig, CalibConfig, CapacitySchedulerPolicy, ContextChunkingPolicy, @@ -50,6 +50,4 @@ 'LlmArgs', 'TorchLlmArgs', 'TrtLlmArgs', - '_TrtLLM', - '_TorchLLM', ] diff --git a/tensorrt_llm/llmapi/llm.py b/tensorrt_llm/llmapi/llm.py index 5a8c68643ee..a1f1634f1b5 100644 --- a/tensorrt_llm/llmapi/llm.py +++ b/tensorrt_llm/llmapi/llm.py @@ -896,7 +896,7 @@ def __init__(self, **kwargs) -class LLM(_TrtLLM): +class LLM(_TorchLLM): def __init__(self, model: Union[str, Path], @@ -915,15 +915,13 @@ def __init__(self, revision, tokenizer_revision, **kwargs) -_LLM_REPR = "TrtLLM" +_LLM_REPR = "TorchLLM" # sphinx will ignore the LLM's docstring if it is not explicitly set LLM.__doc__ = \ f"""LLM class is the main class for running a LLM model. - This class is an alias of {_LLM_REPR}. You can switch between the TensorRT backend - and the PyTorch backend by setting the TLLM_USE_TRT_ENGINE environment to 1 or 0. - The default backend is the TensorRT backend. + This class is an alias of {_LLM_REPR}. 
Parameters: -""" + TRT_LLM_DOCSTRING +""" + TORCH_LLM_DOCSTRING diff --git a/tensorrt_llm/scaffolding/worker.py b/tensorrt_llm/scaffolding/worker.py index d9a2cd2086d..69086392648 100644 --- a/tensorrt_llm/scaffolding/worker.py +++ b/tensorrt_llm/scaffolding/worker.py @@ -4,8 +4,8 @@ import openai from transformers import AutoTokenizer +from tensorrt_llm._tensorrt_engine import LLM from tensorrt_llm.executor import GenerationExecutor -from tensorrt_llm.llmapi.llm import LLM from tensorrt_llm.llmapi.llm_args import KvCacheConfig from tensorrt_llm.sampling_params import SamplingParams diff --git a/tensorrt_llm/serve/openai_server.py b/tensorrt_llm/serve/openai_server.py index 9223c9ddd7b..edc5b5f6f62 100644 --- a/tensorrt_llm/serve/openai_server.py +++ b/tensorrt_llm/serve/openai_server.py @@ -14,12 +14,12 @@ from fastapi.responses import JSONResponse, Response, StreamingResponse from transformers import AutoConfig, AutoProcessor +from tensorrt_llm._tensorrt_engine import LLM # yapf: disable from tensorrt_llm.executor import CppExecutorError from tensorrt_llm.executor.postproc_worker import PostprocParams from tensorrt_llm.inputs import prompt_inputs from tensorrt_llm.inputs.utils import ConversationMessage, apply_chat_template -from tensorrt_llm.llmapi import LLM from tensorrt_llm.llmapi import DisaggregatedParams as LlmDisaggregatedParams from tensorrt_llm.llmapi.disagg_utils import MetadataServerConfig, ServerRole from tensorrt_llm.llmapi.llm import RequestOutput diff --git a/tests/integration/defs/accuracy/accuracy_core.py b/tests/integration/defs/accuracy/accuracy_core.py index 93f5b2198df..811bb80a109 100644 --- a/tests/integration/defs/accuracy/accuracy_core.py +++ b/tests/integration/defs/accuracy/accuracy_core.py @@ -23,10 +23,11 @@ import yaml import tensorrt_llm.evaluate -from tensorrt_llm._torch import LLM as PyTorchLLM +from tensorrt_llm import LLM as PyTorchLLM +from tensorrt_llm._tensorrt_engine import LLM from tensorrt_llm._torch.speculative import SpecConfig from tensorrt_llm.builder import BuildConfig -from tensorrt_llm.llmapi import LLM, SamplingParams +from tensorrt_llm.llmapi import SamplingParams from tensorrt_llm.llmapi.llm_args import DecodingBaseConfig from tensorrt_llm.logger import logger from tensorrt_llm.models.modeling_utils import QuantConfig diff --git a/tests/integration/defs/accuracy/test_llm_api.py b/tests/integration/defs/accuracy/test_llm_api.py index 7b2978d8096..8cdb49cf561 100644 --- a/tests/integration/defs/accuracy/test_llm_api.py +++ b/tests/integration/defs/accuracy/test_llm_api.py @@ -14,7 +14,8 @@ # limitations under the License. 
import pytest -from tensorrt_llm.llmapi import LLM, EagleDecodingConfig, KvCacheConfig +from tensorrt_llm._tensorrt_engine import LLM +from tensorrt_llm.llmapi import EagleDecodingConfig, KvCacheConfig from tensorrt_llm.models.modeling_utils import QuantConfig from tensorrt_llm.quantization import QuantAlgo diff --git a/tests/integration/defs/examples/run_llm_fp8_quant_llama_70b.py b/tests/integration/defs/examples/run_llm_fp8_quant_llama_70b.py index 5ad05ae5547..fffe95a4a46 100644 --- a/tests/integration/defs/examples/run_llm_fp8_quant_llama_70b.py +++ b/tests/integration/defs/examples/run_llm_fp8_quant_llama_70b.py @@ -1,7 +1,8 @@ import os from pathlib import Path -from tensorrt_llm import LLM, SamplingParams +from tensorrt_llm import SamplingParams +from tensorrt_llm._tensorrt_engine import LLM from tensorrt_llm.llmapi import QuantAlgo, QuantConfig prompts = [ diff --git a/tests/integration/defs/examples/run_llm_quickstart_atexit.py b/tests/integration/defs/examples/run_llm_quickstart_atexit.py index 6738d9396ee..7f2bc2f9225 100644 --- a/tests/integration/defs/examples/run_llm_quickstart_atexit.py +++ b/tests/integration/defs/examples/run_llm_quickstart_atexit.py @@ -1,7 +1,8 @@ import os from pathlib import Path -from tensorrt_llm import LLM, SamplingParams +from tensorrt_llm import SamplingParams +from tensorrt_llm._tensorrt_engine import LLM if __name__ == '__main__': prompts = [ diff --git a/tests/integration/defs/llmapi/_run_llmapi_llm.py b/tests/integration/defs/llmapi/_run_llmapi_llm.py index f8d4ae5c8aa..854af24efa7 100644 --- a/tests/integration/defs/llmapi/_run_llmapi_llm.py +++ b/tests/integration/defs/llmapi/_run_llmapi_llm.py @@ -3,7 +3,8 @@ import click -from tensorrt_llm.llmapi import LLM, BuildConfig, SamplingParams +from tensorrt_llm._tensorrt_engine import LLM +from tensorrt_llm.llmapi import BuildConfig, SamplingParams @click.command() diff --git a/tests/integration/defs/llmapi/test_llm_e2e.py b/tests/integration/defs/llmapi/test_llm_e2e.py index 30d5b71fab4..778b870f4f6 100644 --- a/tests/integration/defs/llmapi/test_llm_e2e.py +++ b/tests/integration/defs/llmapi/test_llm_e2e.py @@ -23,7 +23,7 @@ from defs.conftest import llm_models_root, unittest_path from defs.trt_test_alternative import check_call -from tensorrt_llm.llmapi import LLM +from tensorrt_llm._tensorrt_engine import LLM from tensorrt_llm.llmapi.llm_utils import BuildConfig diff --git a/tests/unittest/_torch/auto_deploy/unit/singlegpu/shim/test_llm_config.py b/tests/unittest/_torch/auto_deploy/unit/singlegpu/shim/test_llm_config.py index 49bd2e23308..a256d1f57a5 100644 --- a/tests/unittest/_torch/auto_deploy/unit/singlegpu/shim/test_llm_config.py +++ b/tests/unittest/_torch/auto_deploy/unit/singlegpu/shim/test_llm_config.py @@ -2,9 +2,9 @@ import pytest +from tensorrt_llm._tensorrt_engine import LLM from tensorrt_llm._torch.auto_deploy.shim.demollm import DemoLLM from tensorrt_llm._torch.auto_deploy.transformations.transform import InferenceOptimizer -from tensorrt_llm.llmapi.llm import LLM from tensorrt_llm.llmapi.llm_args import TorchCompileConfig, _AutoDeployLlmArgs # ================================ diff --git a/tests/unittest/api_stability/api_stability_core.py b/tests/unittest/api_stability/api_stability_core.py index 62b0d06400f..e0cde0e4af4 100644 --- a/tests/unittest/api_stability/api_stability_core.py +++ b/tests/unittest/api_stability/api_stability_core.py @@ -17,9 +17,10 @@ from pydantic import BaseModel import tensorrt_llm +from tensorrt_llm._tensorrt_engine import LLM from 
tensorrt_llm.executor import GenerationResult from tensorrt_llm.executor.result import TokenLogprobs -from tensorrt_llm.llmapi import (LLM, CalibConfig, CompletionOutput, +from tensorrt_llm.llmapi import (CalibConfig, CompletionOutput, GuidedDecodingParams, QuantConfig, RequestOutput, SamplingParams) from tensorrt_llm.llmapi.llm_utils import LlmArgs diff --git a/tests/unittest/api_stability/test_llm_api.py b/tests/unittest/api_stability/test_llm_api.py index 014bd8f13ee..1b8fc1dddf0 100644 --- a/tests/unittest/api_stability/test_llm_api.py +++ b/tests/unittest/api_stability/test_llm_api.py @@ -5,8 +5,9 @@ from api_stability_core import (ApiStabilityTestHarness, ClassSnapshot, MethodSnapshot) +from tensorrt_llm._tensorrt_engine import LLM from tensorrt_llm.bindings import executor as tllme -from tensorrt_llm.llmapi import (LLM, CalibConfig, CompletionOutput, +from tensorrt_llm.llmapi import (CalibConfig, CompletionOutput, GuidedDecodingParams, QuantConfig, RequestOutput) from tensorrt_llm.sampling_params import (BatchedLogitsProcessor, diff --git a/tests/unittest/llmapi/apps/_test_openai_metrics.py b/tests/unittest/llmapi/apps/_test_openai_metrics.py index 1b075b67565..9d207ae4e9a 100755 --- a/tests/unittest/llmapi/apps/_test_openai_metrics.py +++ b/tests/unittest/llmapi/apps/_test_openai_metrics.py @@ -4,7 +4,7 @@ from fastapi.testclient import TestClient from transformers import AutoTokenizer -from tensorrt_llm._torch.llm import LLM as PyTorchLLM +from tensorrt_llm import LLM as PyTorchLLM from tensorrt_llm.llmapi import BuildConfig, KvCacheConfig from tensorrt_llm.serve.openai_server import OpenAIServer diff --git a/tests/unittest/llmapi/apps/_test_openai_multi_chat.py b/tests/unittest/llmapi/apps/_test_openai_multi_chat.py index c5a755687f9..9ed9a654c52 100644 --- a/tests/unittest/llmapi/apps/_test_openai_multi_chat.py +++ b/tests/unittest/llmapi/apps/_test_openai_multi_chat.py @@ -10,8 +10,8 @@ from utils.util import (skip_gpu_memory_less_than_40gb, skip_pre_ada, skip_single_gpu) +from tensorrt_llm._tensorrt_engine import LLM from tensorrt_llm.llmapi import BuildConfig -from tensorrt_llm.llmapi.llm import LLM from tensorrt_llm.llmapi.llm_utils import CalibConfig, QuantAlgo, QuantConfig from ..test_llm import get_model_path diff --git a/tests/unittest/llmapi/run_llm.py b/tests/unittest/llmapi/run_llm.py index 64efd052d46..be0fa122e60 100644 --- a/tests/unittest/llmapi/run_llm.py +++ b/tests/unittest/llmapi/run_llm.py @@ -4,7 +4,8 @@ import click -from tensorrt_llm.llmapi import LLM, KvCacheConfig, SamplingParams +from tensorrt_llm._tensorrt_engine import LLM +from tensorrt_llm.llmapi import KvCacheConfig, SamplingParams @click.command() diff --git a/tests/unittest/llmapi/run_llm_with_postproc.py b/tests/unittest/llmapi/run_llm_with_postproc.py index 0f0cdeebac8..6ee365c952b 100644 --- a/tests/unittest/llmapi/run_llm_with_postproc.py +++ b/tests/unittest/llmapi/run_llm_with_postproc.py @@ -6,9 +6,10 @@ import click +from tensorrt_llm._tensorrt_engine import LLM from tensorrt_llm.executor import GenerationResultBase from tensorrt_llm.executor.postproc_worker import PostprocArgs, PostprocParams -from tensorrt_llm.llmapi import LLM, KvCacheConfig, SamplingParams +from tensorrt_llm.llmapi import KvCacheConfig, SamplingParams from tensorrt_llm.llmapi.utils import print_colored from tensorrt_llm.serve.openai_protocol import ( ChatCompletionResponseStreamChoice, ChatCompletionStreamResponse, diff --git a/tests/unittest/llmapi/test_executor.py b/tests/unittest/llmapi/test_executor.py index 
9dc204ed95b..ecdb6d9ad25 100644 --- a/tests/unittest/llmapi/test_executor.py +++ b/tests/unittest/llmapi/test_executor.py @@ -10,6 +10,7 @@ import torch import zmq +from tensorrt_llm._tensorrt_engine import LLM from tensorrt_llm._utils import mpi_world_size from tensorrt_llm.bindings import executor as tllm from tensorrt_llm.executor import (DetokenizedGenerationResultBase, @@ -17,7 +18,7 @@ GenerationResult, GenerationResultBase, PostprocWorker) from tensorrt_llm.executor.ipc import FusedIpcQueue, ZeroMqQueue -from tensorrt_llm.llmapi import LLM, BuildConfig +from tensorrt_llm.llmapi import BuildConfig from tensorrt_llm.llmapi.tokenizer import TransformersTokenizer from tensorrt_llm.llmapi.utils import AsyncQueue from tensorrt_llm.sampling_params import SamplingParams diff --git a/tests/unittest/llmapi/test_llm.py b/tests/unittest/llmapi/test_llm.py index 003b69223a5..44aeee9f97d 100644 --- a/tests/unittest/llmapi/test_llm.py +++ b/tests/unittest/llmapi/test_llm.py @@ -25,10 +25,11 @@ import transformers from utils.util import skip_single_gpu +from tensorrt_llm._tensorrt_engine import LLM from tensorrt_llm.bindings import executor as tllm from tensorrt_llm.executor import (GenerationExecutorWorker, LoRARequest, PromptAdapterRequest, RequestError) -from tensorrt_llm.llmapi import (LLM, BuildCacheConfig, EagleDecodingConfig, +from tensorrt_llm.llmapi import (BuildCacheConfig, EagleDecodingConfig, KvCacheConfig, KvCacheRetentionConfig, LookaheadDecodingConfig, MedusaDecodingConfig, RequestOutput) diff --git a/tests/unittest/llmapi/test_llm_args.py b/tests/unittest/llmapi/test_llm_args.py index 83e3b73809c..84da13cdc4b 100644 --- a/tests/unittest/llmapi/test_llm_args.py +++ b/tests/unittest/llmapi/test_llm_args.py @@ -5,8 +5,8 @@ import yaml import tensorrt_llm.bindings.executor as tle +from tensorrt_llm._tensorrt_engine import LLM from tensorrt_llm._torch.llm import LLM as TorchLLM -from tensorrt_llm.llmapi.llm import LLM from tensorrt_llm.llmapi.llm_args import * from tensorrt_llm.llmapi.utils import print_traceback_on_error diff --git a/tests/unittest/llmapi/test_llm_download.py b/tests/unittest/llmapi/test_llm_download.py index a1701758ec4..2157919256e 100644 --- a/tests/unittest/llmapi/test_llm_download.py +++ b/tests/unittest/llmapi/test_llm_download.py @@ -1,4 +1,4 @@ -from tensorrt_llm.llmapi import LLM +from tensorrt_llm._tensorrt_engine import LLM from tensorrt_llm.llmapi.utils import (download_hf_model, download_hf_pretrained_config) diff --git a/tests/unittest/llmapi/test_llm_kv_cache_events.py b/tests/unittest/llmapi/test_llm_kv_cache_events.py index b445bd1990b..bdc09323ae3 100644 --- a/tests/unittest/llmapi/test_llm_kv_cache_events.py +++ b/tests/unittest/llmapi/test_llm_kv_cache_events.py @@ -2,10 +2,11 @@ import time import tensorrt_llm +from tensorrt_llm._tensorrt_engine import LLM from tensorrt_llm._torch.pyexecutor.llm_request import LlmRequest from tensorrt_llm._torch.pyexecutor.resource_manager import KVCacheManager from tensorrt_llm._utils import KVCacheEventSerializer -from tensorrt_llm.llmapi import LLM, KvCacheConfig +from tensorrt_llm.llmapi import KvCacheConfig from tensorrt_llm.mapping import Mapping from tensorrt_llm.sampling_params import SamplingParams diff --git a/tests/unittest/llmapi/test_llm_multi_gpu.py b/tests/unittest/llmapi/test_llm_multi_gpu.py index b6c70cb5c7b..b0a6c4bc697 100644 --- a/tests/unittest/llmapi/test_llm_multi_gpu.py +++ b/tests/unittest/llmapi/test_llm_multi_gpu.py @@ -8,8 +8,9 @@ import pytest from parameterized import parameterized 
+from tensorrt_llm._tensorrt_engine import LLM from tensorrt_llm.executor import GenerationExecutorProxy -from tensorrt_llm.llmapi import LLM, BuildConfig, KvCacheConfig, SamplingParams +from tensorrt_llm.llmapi import BuildConfig, KvCacheConfig, SamplingParams from tensorrt_llm.llmapi.tokenizer import TransformersTokenizer from tensorrt_llm.mapping import Mapping from tensorrt_llm.models import PretrainedConfig diff --git a/tests/unittest/llmapi/test_llm_quant.py b/tests/unittest/llmapi/test_llm_quant.py index 403bfb7479e..57894da10d1 100644 --- a/tests/unittest/llmapi/test_llm_quant.py +++ b/tests/unittest/llmapi/test_llm_quant.py @@ -1,6 +1,7 @@ import pytest -from tensorrt_llm.llmapi import LLM, KvCacheConfig, SamplingParams +from tensorrt_llm._tensorrt_engine import LLM +from tensorrt_llm.llmapi import KvCacheConfig, SamplingParams from tensorrt_llm.llmapi.llm_utils import CalibConfig, QuantAlgo, QuantConfig # isort: off diff --git a/triton_backend/all_models/llmapi/tensorrt_llm/1/model.py b/triton_backend/all_models/llmapi/tensorrt_llm/1/model.py index f8ad09e8b91..b5109c8310c 100755 --- a/triton_backend/all_models/llmapi/tensorrt_llm/1/model.py +++ b/triton_backend/all_models/llmapi/tensorrt_llm/1/model.py @@ -41,10 +41,10 @@ from mpi4py.futures import MPICommExecutor from mpi4py.MPI import COMM_WORLD +from tensorrt_llm import LLM as PyTorchLLM from tensorrt_llm import SamplingParams -from tensorrt_llm._torch.llm import LLM as PyTorchLLM +from tensorrt_llm._tensorrt_engine import LLM from tensorrt_llm._utils import global_mpi_rank, global_mpi_size -from tensorrt_llm.llmapi import LLM from tensorrt_llm.llmapi.llm_utils import update_llm_args_with_extra_dict From 302d4b1d784c73be49bcd3a8a8263f99e39344bf Mon Sep 17 00:00:00 2001 From: Superjomn <328693+Superjomn@users.noreply.github.com> Date: Wed, 18 Jun 2025 14:41:42 +0000 Subject: [PATCH 2/2] fix Signed-off-by: Superjomn <328693+Superjomn@users.noreply.github.com> --- docs/source/torch.md | 6 +- docs/source/torch/adding_new_model.md | 2 +- docs/source/torch/arch_overview.md | 4 +- examples/pytorch/out_of_tree_example/main.py | 2 +- examples/pytorch/quickstart.py | 3 +- examples/pytorch/quickstart_advanced.py | 3 +- examples/pytorch/star_attention.py | 3 +- tensorrt_llm/_torch/__init__.py | 3 +- tensorrt_llm/bench/build/build.py | 2 +- tensorrt_llm/llmapi/llm.py | 24 +++ tensorrt_llm/llmapi/llm_args.py | 10 +- .../accuracy/test_disaggregated_serving.py | 1 - .../defs/accuracy/test_llm_api_pytorch.py | 2 +- .../test_disaggregated_single_gpu.py | 3 +- tests/integration/defs/test_e2e.py | 3 +- .../unit/singlegpu/shim/test_llm_config.py | 3 +- .../_torch/modeling/test_modeling_deepseek.py | 3 +- .../modeling/test_modeling_nemotron_h.py | 2 +- .../modeling/test_modeling_out_of_tree.py | 2 +- .../_torch/multi_gpu/test_star_attention.py | 3 +- .../multi_gpu_modeling/test_deepseek.py | 3 +- .../_torch/multi_gpu_modeling/test_llama4.py | 3 +- .../_torch/speculative/test_draft_target.py | 3 +- .../_torch/speculative/test_eagle3.py | 3 +- .../unittest/_torch/speculative/test_ngram.py | 3 +- .../unittest/_torch/test_overlap_scheduler.py | 3 +- tests/unittest/_torch/test_return_logits.py | 3 +- tests/unittest/_torch/test_trtllm_sampler.py | 3 +- .../api_stability/api_stability_core.py | 10 +- .../api_stability/references/llm.yaml | 159 +++++++++--------- .../references_committed/llm.yaml | 11 +- tests/unittest/api_stability/test_llm_api.py | 20 ++- .../apps/_test_openai_consistent_chat.py | 2 +- tests/unittest/llmapi/run_llm.py | 2 +- 
tests/unittest/llmapi/test_llm.py | 12 +- tests/unittest/llmapi/test_llm_args.py | 10 +- tests/unittest/llmapi/test_llm_pytorch.py | 24 +-- tests/unittest/llmapi/test_llm_utils.py | 11 +- 38 files changed, 194 insertions(+), 175 deletions(-) diff --git a/docs/source/torch.md b/docs/source/torch.md index 99305f638d4..da59e90d88c 100644 --- a/docs/source/torch.md +++ b/docs/source/torch.md @@ -11,7 +11,7 @@ The PyTorch backend of TensorRT-LLM is available in version 0.17 and later. You ## Quick Start -Here is a simple example to show how to use `tensorrt_llm._torch.LLM` API with Llama model. +Here is a simple example to show how to use `tensorrt_llm.LLM` API with Llama model. ```{literalinclude} ../../examples/pytorch/quickstart.py :language: python @@ -24,7 +24,7 @@ The PyTorch backend supports FP8 and NVFP4 quantization. You can pass quantized which are generated by [TensorRT Model Optimizer](https://github.com/NVIDIA/TensorRT-Model-Optimizer). ```python -from tensorrt_llm._torch import LLM +from tensorrt_llm import LLM llm = LLM(model='nvidia/Llama-3.1-8B-Instruct-FP8') llm.generate("Hello, my name is") ``` @@ -44,7 +44,7 @@ The PyTorch backend supports most of the sampling features that are supported on In order to use this feature, it is necessary to enable option `enable_trtllm_sampler` in the `LLM` class, and pass a `SamplingParams` object with the desired options as well. The following example prepares two identical prompts which will give different results due to the sampling parameters chosen: ```python -from tensorrt_llm._torch import LLM +from tensorrt_llm import LLM llm = LLM(model='nvidia/Llama-3.1-8B-Instruct-FP8', enable_trtllm_sampler=True) sampling_params = SamplingParams( diff --git a/docs/source/torch/adding_new_model.md b/docs/source/torch/adding_new_model.md index 4ce5988c99c..63217241e73 100644 --- a/docs/source/torch/adding_new_model.md +++ b/docs/source/torch/adding_new_model.md @@ -186,7 +186,7 @@ __all__ = [ Alternatively, you can register the new model as an out-of-tree model, so that you can use the new model without touching the TensorRT-LLM codebase. To do so, place `modeling_mymodel.py` (and potentially `configuration_mymodel.py`) in your working directory, and import the modeling code in your script: ```python -from tensorrt_llm._torch import LLM +from tensorrt_llm import LLM import modeling_mymodel def main(): diff --git a/docs/source/torch/arch_overview.md b/docs/source/torch/arch_overview.md index f48403d2d85..11b12781cea 100644 --- a/docs/source/torch/arch_overview.md +++ b/docs/source/torch/arch_overview.md @@ -5,10 +5,10 @@ Besides TensorRT, PyTorch can also serve as the backend for TensorRT-LLM. This d ## Top Level API -The interface for PyTorch backend is `tensorrt._torch.LLM`. +The interface for PyTorch backend is `tensorrt_llm.LLM`. 
```python -from tensorrt_llm._torch import LLM +from tensorrt_llm import LLM llm = LLM(model=) ``` diff --git a/examples/pytorch/out_of_tree_example/main.py b/examples/pytorch/out_of_tree_example/main.py index 430bed126f3..afa943c3422 100644 --- a/examples/pytorch/out_of_tree_example/main.py +++ b/examples/pytorch/out_of_tree_example/main.py @@ -1,6 +1,6 @@ import modeling_opt # noqa -from tensorrt_llm._torch import LLM +from tensorrt_llm import LLM def main(): diff --git a/examples/pytorch/quickstart.py b/examples/pytorch/quickstart.py index 9c81a965e5a..b4f313ff192 100644 --- a/examples/pytorch/quickstart.py +++ b/examples/pytorch/quickstart.py @@ -1,5 +1,4 @@ -from tensorrt_llm import SamplingParams -from tensorrt_llm._torch import LLM +from tensorrt_llm import LLM, SamplingParams def main(): diff --git a/examples/pytorch/quickstart_advanced.py b/examples/pytorch/quickstart_advanced.py index 6e755ba8a3b..29c9bb10186 100644 --- a/examples/pytorch/quickstart_advanced.py +++ b/examples/pytorch/quickstart_advanced.py @@ -1,7 +1,6 @@ import argparse -from tensorrt_llm import SamplingParams -from tensorrt_llm._torch import LLM +from tensorrt_llm import LLM, SamplingParams from tensorrt_llm.llmapi import (DraftTargetDecodingConfig, EagleDecodingConfig, KvCacheConfig, MTPDecodingConfig, NGramDecodingConfig, TorchCompileConfig) diff --git a/examples/pytorch/star_attention.py b/examples/pytorch/star_attention.py index fe861ad4f46..e6071054fe4 100644 --- a/examples/pytorch/star_attention.py +++ b/examples/pytorch/star_attention.py @@ -6,8 +6,7 @@ import torch -from tensorrt_llm import SamplingParams -from tensorrt_llm._torch import LLM +from tensorrt_llm import LLM, SamplingParams from tensorrt_llm.models.modeling_utils import QuantAlgo, QuantConfig diff --git a/tensorrt_llm/_torch/__init__.py b/tensorrt_llm/_torch/__init__.py index 7c2d021b1c4..7d2de6d643c 100644 --- a/tensorrt_llm/_torch/__init__.py +++ b/tensorrt_llm/_torch/__init__.py @@ -1,3 +1,4 @@ from .llm import LLM +from .model_config import MoeLoadBalancerConfig -__all__ = ["LLM"] +__all__ = ["LLM", "MoeLoadBalancerConfig"] diff --git a/tensorrt_llm/bench/build/build.py b/tensorrt_llm/bench/build/build.py index e3bd6cbef56..1ea0add7ab1 100644 --- a/tensorrt_llm/bench/build/build.py +++ b/tensorrt_llm/bench/build/build.py @@ -9,7 +9,7 @@ from tensorrt_llm.bench.utils.data import create_dataset_from_stream, initialize_tokenizer from tensorrt_llm.bench.utils import VALID_QUANT_ALGOS from tensorrt_llm.builder import BuildConfig -from tensorrt_llm.llmapi import LLM +from tensorrt_llm._tensorrt_engine import LLM from tensorrt_llm.llmapi.llm_utils import QuantConfig from tensorrt_llm.logger import logger from tensorrt_llm.quantization.mode import QuantAlgo diff --git a/tensorrt_llm/llmapi/llm.py b/tensorrt_llm/llmapi/llm.py index a1f1634f1b5..5635d4016f6 100644 --- a/tensorrt_llm/llmapi/llm.py +++ b/tensorrt_llm/llmapi/llm.py @@ -97,6 +97,7 @@ def _repr_fields(self): Attributes: tokenizer (tensorrt_llm.llmapi.tokenizer.TokenizerBase, optional): The tokenizer loaded by LLM instance, if any. + llm_id (str): The unique ID of the LLM instance. 
""" @@ -883,6 +884,9 @@ def __init__(self, # TODO: deprecate backend in LLM kwargs kwargs.pop("backend", None) + # Validate that users don't pass TrtLlmArgs-specific arguments + self._validate_args_for_torch_backend(kwargs) + super().__init__(model, tokenizer, tokenizer_mode, @@ -895,6 +899,26 @@ def __init__(self, backend='pytorch', **kwargs) + def _validate_args_for_torch_backend(self, kwargs: dict) -> None: + """Validate that users don't pass TrtLlmArgs-specific arguments when using PyTorch backend. + """ + trtllm_fields = set(TrtLlmArgs.model_fields.keys()) + torchllm_fields = set(TorchLlmArgs.model_fields.keys()) + + trtllm_specific_fields = trtllm_fields - torchllm_fields + + # Check if any TrtLlmArgs-specific arguments are passed + trtllm_specific_args = [] + for key in kwargs: + if key in trtllm_specific_fields: + trtllm_specific_args.append(key) + + if trtllm_specific_args: + raise ValueError( + f"The following arguments are specific to TensorRT backend and cannot be used with PyTorch backend: {trtllm_specific_args}.\n" + f"Please use 'from tensorrt_llm._tensorrt_engine import LLM' instead to use the TensorRT backend." + ) + class LLM(_TorchLLM): diff --git a/tensorrt_llm/llmapi/llm_args.py b/tensorrt_llm/llmapi/llm_args.py index dc74ad95ea3..228be0aed67 100644 --- a/tensorrt_llm/llmapi/llm_args.py +++ b/tensorrt_llm/llmapi/llm_args.py @@ -1591,9 +1591,6 @@ def validate_enable_build_cache(self): return self -LlmArgs = TrtLlmArgs - - class LoadFormat(Enum): AUTO = 0 # Initialize all weights randomly. @@ -1663,7 +1660,10 @@ class TorchLlmArgs(BaseLlmArgs): moe_load_balancer: Optional[Union[object, str]] = Field( default=None, description="Configuration for MoE load balancing.", - json_schema_extra={"type": "Union[MoeLoadBalancerConfig, str]"}) + json_schema_extra={ + "type": + "Union[tensorrt_llm._torch.model_config.MoeLoadBalancerConfig, str, None]" + }) attn_backend: str = Field(default='TRTLLM', description="Attention backend to use.") @@ -2081,6 +2081,8 @@ def get_model_format(model_dir: str) -> _ModelFormatKind: return model_format +LlmArgs = TorchLlmArgs + TRT_LLMARGS_EXPLICIT_DOCSTRING = generate_api_docs_as_docstring(TrtLlmArgs, indent=' ' * 4) TORCH_LLMARGS_EXPLICIT_DOCSTRING = generate_api_docs_as_docstring(TorchLlmArgs, diff --git a/tests/integration/defs/accuracy/test_disaggregated_serving.py b/tests/integration/defs/accuracy/test_disaggregated_serving.py index 30d5d55b325..ab5481913f4 100644 --- a/tests/integration/defs/accuracy/test_disaggregated_serving.py +++ b/tests/integration/defs/accuracy/test_disaggregated_serving.py @@ -83,7 +83,6 @@ def launch_disaggregated_llm(disaggregated_server_config: Dict[str, Any], yaml.dump(gen_server_config, f) args = LlmArgs.from_kwargs(model=model_name, - backend="pytorch", tensor_parallel_size=tensor_parallel_size) trtllm_serve_path = "trtllm-serve" diff --git a/tests/integration/defs/accuracy/test_llm_api_pytorch.py b/tests/integration/defs/accuracy/test_llm_api_pytorch.py index 628ca46a140..e9fa70cb338 100644 --- a/tests/integration/defs/accuracy/test_llm_api_pytorch.py +++ b/tests/integration/defs/accuracy/test_llm_api_pytorch.py @@ -14,7 +14,7 @@ # limitations under the License. 
import pytest -from tensorrt_llm._torch import LLM +from tensorrt_llm import LLM from tensorrt_llm._torch.pyexecutor.config import MoeLoadBalancerConfig from tensorrt_llm.llmapi import (EagleDecodingConfig, KvCacheConfig, MTPDecodingConfig, NGramDecodingConfig, diff --git a/tests/integration/defs/disaggregated/test_disaggregated_single_gpu.py b/tests/integration/defs/disaggregated/test_disaggregated_single_gpu.py index 6d8651cfae7..71e5744fdf4 100644 --- a/tests/integration/defs/disaggregated/test_disaggregated_single_gpu.py +++ b/tests/integration/defs/disaggregated/test_disaggregated_single_gpu.py @@ -9,8 +9,7 @@ from mpi4py import MPI from mpi4py.futures import MPIPoolExecutor -from tensorrt_llm import DisaggregatedParams, SamplingParams -from tensorrt_llm._torch import LLM +from tensorrt_llm import LLM, DisaggregatedParams, SamplingParams from tensorrt_llm._utils import set_mpi_comm from tensorrt_llm.llmapi import KvCacheConfig, MpiCommSession diff --git a/tests/integration/defs/test_e2e.py b/tests/integration/defs/test_e2e.py index fe314ee4beb..22f6035d956 100644 --- a/tests/integration/defs/test_e2e.py +++ b/tests/integration/defs/test_e2e.py @@ -2107,8 +2107,7 @@ def test_ptp_quickstart_bert(llm_root, llm_venv, model_name, model_path, import torch from transformers import AutoModelForSequenceClassification, AutoTokenizer - from tensorrt_llm import SamplingParams - from tensorrt_llm._torch import LLM + from tensorrt_llm import LLM, SamplingParams from tensorrt_llm.sampling_params import SamplingParams prompts = [ "Hello, my name is", diff --git a/tests/unittest/_torch/auto_deploy/unit/singlegpu/shim/test_llm_config.py b/tests/unittest/_torch/auto_deploy/unit/singlegpu/shim/test_llm_config.py index a256d1f57a5..0833f145dc1 100644 --- a/tests/unittest/_torch/auto_deploy/unit/singlegpu/shim/test_llm_config.py +++ b/tests/unittest/_torch/auto_deploy/unit/singlegpu/shim/test_llm_config.py @@ -128,7 +128,8 @@ def test_config_flow( # Create instance with appropriate mocking with patch.object(api_class, "_try_load_tokenizer", return_value=MagicMock()): - instance = api_class(**config_params) + with patch.object(api_class, "_build_model", return_value=MagicMock()): + instance = api_class(**config_params) # Verify args were created correctly assert hasattr(instance, "args") diff --git a/tests/unittest/_torch/modeling/test_modeling_deepseek.py b/tests/unittest/_torch/modeling/test_modeling_deepseek.py index d18ca41de0f..ae6907f30ca 100644 --- a/tests/unittest/_torch/modeling/test_modeling_deepseek.py +++ b/tests/unittest/_torch/modeling/test_modeling_deepseek.py @@ -7,8 +7,7 @@ from utils.llm_data import llm_models_root from utils.util import getSMVersion -from tensorrt_llm import SamplingParams -from tensorrt_llm._torch import LLM +from tensorrt_llm import LLM, SamplingParams from tensorrt_llm.llmapi import KvCacheConfig, MTPDecodingConfig from tensorrt_llm.llmapi.utils import get_total_gpu_memory diff --git a/tests/unittest/_torch/modeling/test_modeling_nemotron_h.py b/tests/unittest/_torch/modeling/test_modeling_nemotron_h.py index 5f3ccce3c7b..dd89a7dd0e9 100644 --- a/tests/unittest/_torch/modeling/test_modeling_nemotron_h.py +++ b/tests/unittest/_torch/modeling/test_modeling_nemotron_h.py @@ -2,7 +2,7 @@ from utils.llm_data import llm_models_root from utils.util import skip_gpu_memory_less_than -from tensorrt_llm._torch import LLM +from tensorrt_llm import LLM from tensorrt_llm.llmapi import KvCacheConfig from tensorrt_llm.llmapi.llm import RequestOutput from tensorrt_llm.sampling_params 
import SamplingParams diff --git a/tests/unittest/_torch/modeling/test_modeling_out_of_tree.py b/tests/unittest/_torch/modeling/test_modeling_out_of_tree.py index eb1bc220f12..f6dcfdf39b3 100644 --- a/tests/unittest/_torch/modeling/test_modeling_out_of_tree.py +++ b/tests/unittest/_torch/modeling/test_modeling_out_of_tree.py @@ -2,7 +2,7 @@ from parameterized import parameterized -from tensorrt_llm._torch import LLM +from tensorrt_llm import LLM from tensorrt_llm.llmapi import KvCacheConfig from tensorrt_llm.sampling_params import SamplingParams diff --git a/tests/unittest/_torch/multi_gpu/test_star_attention.py b/tests/unittest/_torch/multi_gpu/test_star_attention.py index 7c387cd5ccf..9cf12a2c281 100644 --- a/tests/unittest/_torch/multi_gpu/test_star_attention.py +++ b/tests/unittest/_torch/multi_gpu/test_star_attention.py @@ -5,8 +5,7 @@ import torch from utils.llm_data import llm_models_root -from tensorrt_llm import SamplingParams -from tensorrt_llm._torch import LLM +from tensorrt_llm import LLM, SamplingParams from tensorrt_llm.llmapi import KvCacheConfig from tensorrt_llm.llmapi.utils import get_total_gpu_memory from tensorrt_llm.models.modeling_utils import QuantAlgo, QuantConfig diff --git a/tests/unittest/_torch/multi_gpu_modeling/test_deepseek.py b/tests/unittest/_torch/multi_gpu_modeling/test_deepseek.py index b73655719fa..678f91880b7 100644 --- a/tests/unittest/_torch/multi_gpu_modeling/test_deepseek.py +++ b/tests/unittest/_torch/multi_gpu_modeling/test_deepseek.py @@ -7,8 +7,7 @@ from utils.llm_data import llm_models_root from utils.util import getSMVersion -from tensorrt_llm import SamplingParams -from tensorrt_llm._torch import LLM +from tensorrt_llm import LLM, SamplingParams from tensorrt_llm.llmapi import KvCacheConfig from tensorrt_llm.llmapi.utils import get_total_gpu_memory diff --git a/tests/unittest/_torch/multi_gpu_modeling/test_llama4.py b/tests/unittest/_torch/multi_gpu_modeling/test_llama4.py index 707e1936ac8..017452a7d63 100644 --- a/tests/unittest/_torch/multi_gpu_modeling/test_llama4.py +++ b/tests/unittest/_torch/multi_gpu_modeling/test_llama4.py @@ -4,8 +4,7 @@ import torch from utils.llm_data import llm_models_root -from tensorrt_llm import SamplingParams -from tensorrt_llm._torch import LLM +from tensorrt_llm import LLM, SamplingParams from tensorrt_llm.llmapi import KvCacheConfig diff --git a/tests/unittest/_torch/speculative/test_draft_target.py b/tests/unittest/_torch/speculative/test_draft_target.py index 6802457f589..4c230a431c9 100644 --- a/tests/unittest/_torch/speculative/test_draft_target.py +++ b/tests/unittest/_torch/speculative/test_draft_target.py @@ -5,8 +5,7 @@ import pytest import torch -from tensorrt_llm import SamplingParams -from tensorrt_llm._torch import LLM +from tensorrt_llm import LLM, SamplingParams from tensorrt_llm.llmapi import DraftTargetDecodingConfig, KvCacheConfig sys.path.append(os.path.join(os.path.dirname(__file__), '..')) diff --git a/tests/unittest/_torch/speculative/test_eagle3.py b/tests/unittest/_torch/speculative/test_eagle3.py index 48540c708f7..d0d4c424ed1 100644 --- a/tests/unittest/_torch/speculative/test_eagle3.py +++ b/tests/unittest/_torch/speculative/test_eagle3.py @@ -5,8 +5,7 @@ import pytest import torch -from tensorrt_llm import SamplingParams -from tensorrt_llm._torch import LLM +from tensorrt_llm import LLM, SamplingParams from tensorrt_llm.llmapi import EagleDecodingConfig, KvCacheConfig sys.path.append(os.path.join(os.path.dirname(__file__), '..')) diff --git 
a/tests/unittest/_torch/speculative/test_ngram.py b/tests/unittest/_torch/speculative/test_ngram.py index 6db927454b8..0e3e227d7fd 100644 --- a/tests/unittest/_torch/speculative/test_ngram.py +++ b/tests/unittest/_torch/speculative/test_ngram.py @@ -5,8 +5,7 @@ import pytest import torch -from tensorrt_llm import SamplingParams -from tensorrt_llm._torch import LLM +from tensorrt_llm import LLM, SamplingParams from tensorrt_llm.llmapi import KvCacheConfig, NGramDecodingConfig sys.path.append(os.path.join(os.path.dirname(__file__), '..')) diff --git a/tests/unittest/_torch/test_overlap_scheduler.py b/tests/unittest/_torch/test_overlap_scheduler.py index 18622f94cbd..be105e96e94 100644 --- a/tests/unittest/_torch/test_overlap_scheduler.py +++ b/tests/unittest/_torch/test_overlap_scheduler.py @@ -4,8 +4,7 @@ import pytest from utils.llm_data import llm_models_root -from tensorrt_llm import SamplingParams -from tensorrt_llm._torch import LLM +from tensorrt_llm import LLM, SamplingParams from tensorrt_llm.llmapi import KvCacheConfig as TRT_KvCacheConfig diff --git a/tests/unittest/_torch/test_return_logits.py b/tests/unittest/_torch/test_return_logits.py index 2fa21ad4179..a9e0b1a430f 100644 --- a/tests/unittest/_torch/test_return_logits.py +++ b/tests/unittest/_torch/test_return_logits.py @@ -5,8 +5,7 @@ from utils.llm_data import llm_models_root from utils.util import force_ampere -from tensorrt_llm import SamplingParams -from tensorrt_llm._torch import LLM +from tensorrt_llm import LLM, SamplingParams from tensorrt_llm.llmapi.llm_utils import BuildConfig, KvCacheConfig prompts = ["A B C"] diff --git a/tests/unittest/_torch/test_trtllm_sampler.py b/tests/unittest/_torch/test_trtllm_sampler.py index bee47efddaf..c4493c266c8 100644 --- a/tests/unittest/_torch/test_trtllm_sampler.py +++ b/tests/unittest/_torch/test_trtllm_sampler.py @@ -5,8 +5,7 @@ from utils.llm_data import llm_models_root from utils.util import similar -from tensorrt_llm import SamplingParams -from tensorrt_llm._torch import LLM +from tensorrt_llm import LLM, SamplingParams from tensorrt_llm.llmapi import KvCacheConfig as TRT_KvCacheConfig diff --git a/tests/unittest/api_stability/api_stability_core.py b/tests/unittest/api_stability/api_stability_core.py index e0cde0e4af4..1014f1a22fa 100644 --- a/tests/unittest/api_stability/api_stability_core.py +++ b/tests/unittest/api_stability/api_stability_core.py @@ -17,7 +17,7 @@ from pydantic import BaseModel import tensorrt_llm -from tensorrt_llm._tensorrt_engine import LLM +from tensorrt_llm import LLM from tensorrt_llm.executor import GenerationResult from tensorrt_llm.executor.result import TokenLogprobs from tensorrt_llm.llmapi import (CalibConfig, CompletionOutput, @@ -367,8 +367,14 @@ def assert_equal(self, other: 'ClassSnapshot'): if self.properties.keys() != other.properties.keys(): diff_keys = set(self.properties.keys()) ^ set( other.properties.keys()) + this_diff_keys = set(self.properties.keys()) - set( + other.properties.keys()) + other_diff_keys = set(other.properties.keys()) - set( + self.properties.keys()) raise AssertionError( - f"{qual_name} has different properties: {diff_keys}") + f"{qual_name} has different properties: {diff_keys}\n" + f"This class has extra properties: {this_diff_keys}\n" + f"The reference has extra properties: {other_diff_keys}") for name, prop in self.properties.items(): with StackTrace().push(name): diff --git a/tests/unittest/api_stability/references/llm.yaml b/tests/unittest/api_stability/references/llm.yaml index a388a141532..801f624c1ce 
100644 --- a/tests/unittest/api_stability/references/llm.yaml +++ b/tests/unittest/api_stability/references/llm.yaml @@ -1,90 +1,45 @@ methods: __init__: parameters: - # Parallelism + # Parallelism + gpus_per_node: + annotation: Optional[int] + default: null + moe_cluster_parallel_size: + annotation: Optional[int] + default: null + enable_attention_dp: + annotation: bool + default: False cp_config: annotation: Optional[dict] default: null - auto_parallel: - annotation: bool - default: false - auto_parallel_world_size: + # Stats + iter_stats_max_iterations: annotation: Optional[int] default: null - embedding_parallel_mode: - annotation: str - default: SHARDING_ALONG_VOCAB - moe_cluster_parallel_size: + request_stats_max_iterations: annotation: Optional[int] default: null - # Engine building - build_config: - annotation: Optional[tensorrt_llm.builder.BuildConfig] - default: null - enable_build_cache: - annotation: Union[tensorrt_llm.llmapi.build_cache.BuildCacheConfig, bool] - default: false - fast_build: - annotation: bool - default: false # Bindings and mirrored configs - batching_type: - annotation: Optional[tensorrt_llm.llmapi.llm_args.BatchingType] - default: null peft_cache_config: annotation: Optional[tensorrt_llm.llmapi.llm_args.PeftCacheConfig] default: null scheduler_config: annotation: tensorrt_llm.llmapi.llm_args.SchedulerConfig default: null - extended_runtime_perf_knob_config: - annotation: Optional[tensorrt_llm.llmapi.llm_args.ExtendedRuntimePerfKnobConfig] - default: null - decoding_config: - annotation: Optional[tensorrt_llm.llmapi.llm_args.DecodingConfig] - default: null cache_transceiver_config: annotation: Optional[tensorrt_llm.llmapi.llm_args.CacheTransceiverConfig] default: null - # Misc - backend: - annotation: Optional[str] + batching_type: + annotation: Optional[tensorrt_llm.llmapi.llm_args.BatchingType] default: null - enable_attention_dp: - annotation: bool - default: false normalize_log_probs: annotation: bool - default: false + default: False gather_generation_logits: annotation: bool - default: false - gpus_per_node: - annotation: Optional[int] - default: null - iter_stats_max_iterations: - annotation: Optional[int] - default: null - request_stats_max_iterations: - annotation: Optional[int] - default: null - workspace: - annotation: Optional[str] - default: null - # LoRA - max_lora_rank: - annotation: Optional[int] - default: null - max_loras: - annotation: int - default: 4 - max_cpu_loras: - annotation: int - default: 4 - allreduce_strategy: - annotation: Optional[Literal['AUTO', 'NCCL', 'UB', 'MINLATENCY', 'ONESHOT', 'TWOSHOT', 'LOWPRECISION', 'MNNVL']] - default: AUTO - # postproc worker + default: False num_postprocess_workers: annotation: int default: 0 @@ -98,10 +53,73 @@ methods: reasoning_parser: annotation: Optional[str] default: null - # kwargs - kwargs: - annotation: Any - default: inspect._empty + garbage_collection_gen0_threshold: + annotation: int + default: 20000 + # Misc + backend: + annotation: Optional[str] + default: null + build_config: + annotation: Optional[tensorrt_llm.llmapi.llm_args.BuildConfig] + default: null + use_cuda_graph: + annotation: bool + default: False + cuda_graph_batch_sizes: + annotation: Optional[List[int]] + default: null + cuda_graph_max_batch_size: + annotation: int + default: 0 + cuda_graph_padding_enabled: + annotation: bool + default: False + disable_overlap_scheduler: + annotation: bool + default: False + moe_max_num_tokens: + annotation: Optional[int] + default: null + moe_load_balancer: + annotation: 
Union[tensorrt_llm._torch.MoeLoadBalancerConfig, str, None] + default: null + attn_backend: + annotation: str + default: TRTLLM + moe_backend: + annotation: str + default: CUTLASS + mixed_sampler: + annotation: bool + default: False + enable_trtllm_sampler: + annotation: bool + default: False + kv_cache_dtype: + annotation: str + default: auto + enable_iter_perf_stats: + annotation: bool + default: False + enable_iter_req_stats: + annotation: bool + default: False + print_iter_log: + annotation: bool + default: False + torch_compile_config: + annotation: Optional[tensorrt_llm.llmapi.llm_args.TorchCompileConfig] + default: null + autotuner_enabled: + annotation: bool + default: True + enable_layerwise_nvtx_marker: + annotation: bool + default: False + enable_min_latency: + annotation: bool + default: False return_annotation: None generate: parameters: @@ -145,19 +163,10 @@ methods: annotation: Optional[float] default: 2 return_annotation: tensorrt_llm.executor.result.IterationResult - save: - parameters: - engine_dir: - annotation: str - default: inspect._empty - return_annotation: None shutdown: parameters: {} return_annotation: None properties: - workspace: - annotation: pathlib.Path - default: inspect._empty llm_id: annotation: str default: inspect._empty diff --git a/tests/unittest/api_stability/references_committed/llm.yaml b/tests/unittest/api_stability/references_committed/llm.yaml index e74e6b8d840..a30e62645fe 100644 --- a/tests/unittest/api_stability/references_committed/llm.yaml +++ b/tests/unittest/api_stability/references_committed/llm.yaml @@ -95,8 +95,8 @@ methods: default: null # Misc load_format: - annotation: Literal['auto', 'dummy'] - default: auto + annotation: Union[str, tensorrt_llm.llmapi.llm_args.LoadFormat] + default: 0 enable_tqdm: annotation: bool default: false @@ -106,9 +106,10 @@ methods: kv_cache_config: annotation: tensorrt_llm.llmapi.llm_args.KvCacheConfig default: null - garbage_collection_gen0_threshold: - annotation: int - default: 20000 + + kwargs: + annotation: Any + default: inspect._empty return_annotation: None generate: parameters: diff --git a/tests/unittest/api_stability/test_llm_api.py b/tests/unittest/api_stability/test_llm_api.py index 1b8fc1dddf0..6960f993286 100644 --- a/tests/unittest/api_stability/test_llm_api.py +++ b/tests/unittest/api_stability/test_llm_api.py @@ -5,8 +5,9 @@ from api_stability_core import (ApiStabilityTestHarness, ClassSnapshot, MethodSnapshot) -from tensorrt_llm._tensorrt_engine import LLM +from tensorrt_llm import LLM from tensorrt_llm.bindings import executor as tllme +from tensorrt_llm.executor.result import IterationResult from tensorrt_llm.llmapi import (CalibConfig, CompletionOutput, GuidedDecodingParams, QuantConfig, RequestOutput) @@ -131,21 +132,28 @@ def test_new_method(self, mocker): def test_modified_method_with_same_signature(self, mocker): - def new_save(self, engine_dir: str) -> None: + def new_get_stats_async(self, + timeout: Optional[float] = 2 + ) -> IterationResult: pass - new_save.__doc__ = self.TEST_CLASS.save.__doc__ + new_get_stats_async.__doc__ = self.TEST_CLASS.get_stats_async.__doc__ - mocker.patch.object(self.TEST_CLASS, "save", new=new_save) + mocker.patch.object(self.TEST_CLASS, + "get_stats_async", + new=new_get_stats_async) self.test_signature() self.test_docstring() def test_modified_method_with_modified_signature(self, mocker): - def new_save(self, engine_dir: Optional[str]) -> None: + def new_get_stats_async(self, + timeout: Optional[int] = 2) -> IterationResult: pass - 
mocker.patch.object(self.TEST_CLASS, "save", new=new_save) + mocker.patch.object(self.TEST_CLASS, + "get_stats_async", + new=new_get_stats_async) with pytest.raises(AssertionError): self.test_signature() with pytest.raises(AssertionError): diff --git a/tests/unittest/llmapi/apps/_test_openai_consistent_chat.py b/tests/unittest/llmapi/apps/_test_openai_consistent_chat.py index 6c4ed3f1efd..a3e716cd40e 100644 --- a/tests/unittest/llmapi/apps/_test_openai_consistent_chat.py +++ b/tests/unittest/llmapi/apps/_test_openai_consistent_chat.py @@ -9,8 +9,8 @@ from utils.util import (skip_gpu_memory_less_than_40gb, skip_num_gpus_less_than, skip_nvlink_inactive) +from tensorrt_llm import LLM from tensorrt_llm.llmapi import BuildConfig -from tensorrt_llm.llmapi.llm import LLM from ..test_llm import get_model_path from .openai_server import RemoteOpenAIServer diff --git a/tests/unittest/llmapi/run_llm.py b/tests/unittest/llmapi/run_llm.py index be0fa122e60..5b864360b3f 100644 --- a/tests/unittest/llmapi/run_llm.py +++ b/tests/unittest/llmapi/run_llm.py @@ -4,6 +4,7 @@ import click +from tensorrt_llm import LLM as TorchLLM from tensorrt_llm._tensorrt_engine import LLM from tensorrt_llm.llmapi import KvCacheConfig, SamplingParams @@ -21,7 +22,6 @@ def main(model_dir: str, tp_size: int, engine_dir: Optional[str], n: int, best_of: Optional[int], top_k: int, use_beam_search: bool, use_pytorch: bool): if use_pytorch: - from tensorrt_llm._torch.llm import LLM as TorchLLM llm = TorchLLM( model_dir, tensor_parallel_size=tp_size, diff --git a/tests/unittest/llmapi/test_llm.py b/tests/unittest/llmapi/test_llm.py index 44aeee9f97d..33a458a1234 100644 --- a/tests/unittest/llmapi/test_llm.py +++ b/tests/unittest/llmapi/test_llm.py @@ -25,6 +25,7 @@ import transformers from utils.util import skip_single_gpu +from tensorrt_llm import LLM as LLM_torch from tensorrt_llm._tensorrt_engine import LLM from tensorrt_llm.bindings import executor as tllm from tensorrt_llm.executor import (GenerationExecutorWorker, LoRARequest, @@ -33,9 +34,10 @@ KvCacheConfig, KvCacheRetentionConfig, LookaheadDecodingConfig, MedusaDecodingConfig, RequestOutput) +from tensorrt_llm.llmapi import TrtLlmArgs as LlmArgs from tensorrt_llm.llmapi.llm_args import DynamicBatchConfig, SchedulerConfig -from tensorrt_llm.llmapi.llm_utils import (BuildConfig, LlmArgs, QuantAlgo, - QuantConfig, _ParallelConfig) +from tensorrt_llm.llmapi.llm_utils import (BuildConfig, QuantAlgo, QuantConfig, + _ParallelConfig) from tensorrt_llm.llmapi.tokenizer import TokenizerBase, TransformersTokenizer from tensorrt_llm.llmapi.utils import get_total_gpu_memory from tensorrt_llm.lora_manager import LoraConfig @@ -119,7 +121,6 @@ def llm_test_harness(model_dir: str, tokenizer = model_dir if backend == "pytorch": - from tensorrt_llm._torch import LLM as LLM_torch llm = LLM_torch(model_dir, tokenizer=tokenizer, **llm_kwargs) else: llm = LLM(model_dir, tokenizer=tokenizer, **llm_kwargs) @@ -1597,7 +1598,6 @@ def llm_return_logprobs_test_harness(prompt_logprobs: Optional[int], LLM_CLASS = LLM llm_args_extra = {} if backend in ["pytorch", "autodeploy"]: - from tensorrt_llm._torch import LLM as LLM_torch LLM_CLASS = LLM_torch else: llm_args_extra["fast_build"] = True @@ -1840,7 +1840,6 @@ def llm_get_stats_test_harness(tp_size: int = 1, sampling_args_extra["return_context_logits"] = True if pytorch_backend: - from tensorrt_llm._torch import LLM as LLM_torch llm_args_extra.update( dict(enable_iter_perf_stats=True, enable_iter_req_stats=enable_iter_req_stats, @@ -1895,8 +1894,6 @@ 
def test_llm_get_queued_stats(): llm_args_extra = {} sampling_args_extra = {} - from tensorrt_llm._torch import LLM as LLM_torch - llm_args_extra.update( dict(enable_iter_perf_stats=True, enable_iter_req_stats=enable_iter_req_stats, @@ -1968,7 +1965,6 @@ def llm_get_stats_async_test_harness(tp_size: int = 1, sampling_args_extra["return_context_logits"] = True if pytorch_backend: - from tensorrt_llm._torch import LLM as LLM_torch llm_args_extra.update( dict(enable_iter_perf_stats=True, enable_iter_req_stats=enable_iter_req_stats, diff --git a/tests/unittest/llmapi/test_llm_args.py b/tests/unittest/llmapi/test_llm_args.py index 84da13cdc4b..8406dfc8f31 100644 --- a/tests/unittest/llmapi/test_llm_args.py +++ b/tests/unittest/llmapi/test_llm_args.py @@ -5,8 +5,8 @@ import yaml import tensorrt_llm.bindings.executor as tle +from tensorrt_llm import LLM as TorchLLM from tensorrt_llm._tensorrt_engine import LLM -from tensorrt_llm._torch.llm import LLM as TorchLLM from tensorrt_llm.llmapi.llm_args import * from tensorrt_llm.llmapi.utils import print_traceback_on_error @@ -54,10 +54,10 @@ def test_update_llm_args_with_extra_dict_with_speculative_config(): f.seek(0) dict_content = yaml.safe_load(f) - llm_args = LlmArgs(model=llama_model_path) + llm_args = TrtLlmArgs(model=llama_model_path) llm_args_dict = update_llm_args_with_extra_dict(llm_args.to_dict(), dict_content) - llm_args = LlmArgs(**llm_args_dict) + llm_args = TrtLlmArgs(**llm_args_dict) assert llm_args.speculative_config.max_window_size == 4 assert llm_args.speculative_config.max_ngram_size == 3 assert llm_args.speculative_config.max_verification_set_size == 4 @@ -226,10 +226,10 @@ class TestTrtLlmArgs: def test_dynamic_setattr(self): with pytest.raises(pydantic_core._pydantic_core.ValidationError): - args = LlmArgs(model=llama_model_path, invalid_arg=1) + args = TrtLlmArgs(model=llama_model_path, invalid_arg=1) with pytest.raises(ValueError): - args = LlmArgs(model=llama_model_path) + args = TrtLlmArgs(model=llama_model_path) args.invalid_arg = 1 diff --git a/tests/unittest/llmapi/test_llm_pytorch.py b/tests/unittest/llmapi/test_llm_pytorch.py index 65f3d16ac69..b5f32547ce1 100644 --- a/tests/unittest/llmapi/test_llm_pytorch.py +++ b/tests/unittest/llmapi/test_llm_pytorch.py @@ -1,5 +1,6 @@ import pytest +from tensorrt_llm import LLM from tensorrt_llm.llmapi.tokenizer import TransformersTokenizer from tensorrt_llm.sampling_params import SamplingParams @@ -71,9 +72,7 @@ def test_llm_get_stats_async(return_context_logits, use_overlap, SamplingParams() # pytorch only supports n=1 ]) def test_llm_abort_request(sampling_params): - from tensorrt_llm._torch import LLM as LLM_torch - llm = LLM_torch(model=llama_model_path, - kv_cache_config=global_kvcache_config) + llm = LLM(model=llama_model_path, kv_cache_config=global_kvcache_config) run_llm_abort_request(llm=llm, sampling_params=sampling_params) @@ -82,10 +81,9 @@ def test_llm_reward_model(): tokenizer = TransformersTokenizer.from_pretrained(rm_model_path) tokenized_input = tokenizer(prompts, return_tensors="pt")["input_ids"] - from tensorrt_llm._torch import LLM as LLM_torch - llm = LLM_torch(model=rm_model_path, - attn_backend="VANILLA", - disable_overlap_scheduler=True) + llm = LLM(model=rm_model_path, + attn_backend="VANILLA", + disable_overlap_scheduler=True) sampling_params = SamplingParams(return_context_logits=True) @@ -106,8 +104,6 @@ def test_llm_with_postprocess_parallel_and_result_handler(streaming): def llama_v2_13b_lora_test_harness(**llm_kwargs) -> None: - from 
tensorrt_llm._torch.llm import LLM - lora_config = LoraConfig(lora_dir=[ f"{llm_models_root()}/llama-models-v2/chinese-llama-2-lora-13b" ], @@ -134,8 +130,6 @@ def llama_v2_13b_lora_test_harness(**llm_kwargs) -> None: def llama_7b_multi_lora_test_harness(**llm_kwargs) -> None: - from tensorrt_llm._torch.llm import LLM - hf_model_dir = f"{llm_models_root()}/llama-models/llama-7b-hf" hf_lora_dir1 = f"{llm_models_root()}/llama-models/luotuo-lora-7b-0.1" hf_lora_dir2 = f"{llm_models_root()}/llama-models/Japanese-Alpaca-LoRA-7b-v0" @@ -181,8 +175,6 @@ def test_llama_v2_13b_lora(): @skip_gpu_memory_less_than_40gb def test_llama_7b_lora_default_modules() -> None: - from tensorrt_llm._torch.llm import LLM - lora_config = LoraConfig(max_lora_rank=64) hf_model_dir = f"{llm_models_root()}/llama-models/llama-7b-hf" @@ -214,8 +206,6 @@ def test_llama_7b_multi_lora(): # https://jirasw.nvidia.com/browse/TRTLLM-5045 @skip_gpu_memory_less_than_138gb def test_nemotron_nas_lora() -> None: - from tensorrt_llm._torch.llm import LLM - lora_config = LoraConfig(lora_dir=[ f"{llm_models_root()}/nemotron-nas/Llama-3_3-Nemotron-Super-49B-v1-lora-adapter_r64" ], @@ -248,8 +238,6 @@ def test_nemotron_nas_lora() -> None: @skip_gpu_memory_less_than_80gb def test_codellama_fp8_with_bf16_lora() -> None: - from tensorrt_llm._torch.llm import LLM - model_dir = f"{llm_models_root()}/codellama/CodeLlama-7b-Instruct-hf/" quant_config = QuantConfig(quant_algo=QuantAlgo.FP8, kv_cache_quant_algo=QuantAlgo.FP8) @@ -308,8 +296,6 @@ def test_codellama_fp8_with_bf16_lora() -> None: @skip_gpu_memory_less_than_80gb def test_bielik_11b_v2_2_instruct_multi_lora() -> None: - from tensorrt_llm._torch.llm import LLM - model_dir = f"{llm_models_root()}/Bielik-11B-v2.2-Instruct" target_modules = ['attn_q', 'attn_k', 'attn_v'] diff --git a/tests/unittest/llmapi/test_llm_utils.py b/tests/unittest/llmapi/test_llm_utils.py index ac50a8fbf03..7caa16d7001 100644 --- a/tests/unittest/llmapi/test_llm_utils.py +++ b/tests/unittest/llmapi/test_llm_utils.py @@ -37,9 +37,10 @@ def build_engine(): def test_CachedModelLoader(): # CachedModelLoader enables engine caching and multi-gpu building - args = LlmArgs(model=llama_model_path, - kv_cache_config=KvCacheConfig(free_gpu_memory_fraction=0.4), - enable_build_cache=True) + args = TrtLlmArgs( + model=llama_model_path, + kv_cache_config=KvCacheConfig(free_gpu_memory_fraction=0.4), + enable_build_cache=True) stats = LlmBuildStats() model_loader = CachedModelLoader(args, llm_build_stats=stats) engine_dir, _ = model_loader() @@ -51,9 +52,9 @@ def test_CachedModelLoader(): def test_LlmArgs_default_gpus_per_node(): # default - llm_args = LlmArgs(model=llama_model_path) + llm_args = TrtLlmArgs(model=llama_model_path) assert llm_args.gpus_per_node == torch.cuda.device_count() # set explicitly - llm_args = LlmArgs(model=llama_model_path, gpus_per_node=6) + llm_args = TrtLlmArgs(model=llama_model_path, gpus_per_node=6) assert llm_args.gpus_per_node == 6
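
Taken together, the test updates above assume the following import split, with the PyTorch backend as the default; below is a minimal sketch of that convention (the checkpoint path, prompt, and sampling values are placeholders, not taken from this patch):

    # After this change, tensorrt_llm.LLM is the PyTorch-backend LLM by default.
    from tensorrt_llm import LLM, SamplingParams
    # The TensorRT-engine LLM remains importable from the private module:
    # from tensorrt_llm._tensorrt_engine import LLM as TrtLLM

    llm = LLM(model="/path/to/hf-model")  # placeholder checkpoint path
    outputs = llm.generate(["Hello, my name is"],
                           SamplingParams(max_tokens=32))
    print(outputs[0].outputs[0].text)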