Commit 68b3346

TensorRT-LLM import fix; specify aot_joint_export as an explicit setting in dynamo.compile
- TRT-LLM installation utilities and new test cases
- Add the option in _compiler.py
- Changes in the TRT-LLM loading tool: remove install_wget, install_unzip, install_mpi
- Further changes to error logging in the TRT-LLM installation tool
- Move load_tensorrt_llm to dynamo/utils.py
- Correct a misprint in the TRT-LLM load
- Use a Python library for the download to make it platform agnostic
- Update the DLL file path for Windows
- Correct a non-critical lint error
- Include the version in versions.txt
- Linting fixes and rebase fix
- Remove the Platform enum from converter_utils.py
- Address review comments: use a tmp dir for wheel download and extraction, add a variable for py_version
- Add checks for Windows, where the NCCL backend is not supported
- Add checks for Windows and Jetson devices
- Keep the extracted files and delete the downloaded file; restructure the test
- Modify the warning for missing libmpi libraries
- Remove redundant initializations
- Add tests in CI
- Correct the skip-test condition
- Install MPI libraries for Linux x86
- Add SBSA to the supported platforms for the TRT-LLM libraries and install MPI libraries for the distributed tests
- Use the Python platform package for platform detection
1 parent 1f4c159 commit 68b3346
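
As context for the "Python platform detection" and Windows DLL items in the message above, the sketch below shows one way such a check could look. It is an illustration only: the helper name and the library file names are assumptions, not code taken from this commit.

# Illustrative sketch: helper name and file names are assumptions,
# not code shipped in this commit.
import platform


def _trtllm_plugin_filename() -> str:
    """Pick a TensorRT-LLM plugin library name based on the host platform."""
    system = platform.system()
    machine = platform.machine().lower()
    if system == "Windows":
        # The commit message notes a DLL path update for Windows.
        return "nvinfer_plugin_tensorrt_llm.dll"
    if system == "Linux" and machine in ("x86_64", "aarch64"):
        # aarch64 covers the SBSA platforms added to the supported list.
        return "libnvinfer_plugin_tensorrt_llm.so"
    raise RuntimeError(
        f"Unsupported platform for TensorRT-LLM plugins: {system}/{machine}"
    )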

File tree

11 files changed: +385 / -164 lines

.github/workflows/build-test-linux-aarch64.yml

Lines changed: 35 additions & 0 deletions
@@ -356,6 +356,41 @@ jobs:
         python -m pytest -ra -n 4 --junitxml=${RUNNER_TEST_RESULTS_DIR}/tests_py_core_test_results.xml .
         popd

+  tests-py-distributed:
+    name: Test dynamo distributed [Python]
+    needs: [filter-matrix, build]
+    if: false
+    strategy:
+      fail-fast: false
+      matrix:
+        include:
+          - repository: pytorch/tensorrt
+            package-name: torch_tensorrt
+            pre-script: packaging/pre_build_script.sh
+            post-script: packaging/post_build_script.sh
+            smoke-test-script: packaging/smoke_test_script.sh
+    uses: ./.github/workflows/linux-test.yml
+    with:
+      job-name: tests-py-dynamo-distributed
+      repository: "pytorch/tensorrt"
+      ref: ""
+      test-infra-repository: pytorch/test-infra
+      test-infra-ref: main
+      build-matrix: ${{ needs.filter-matrix.outputs.matrix }}
+      pre-script: ${{ matrix.pre-script }}
+      script: |
+        set -euo pipefail
+        export USE_HOST_DEPS=1
+        export CI_BUILD=1
+        export USE_TRTLLM_PLUGINS=1
+        dnf install -y mpich mpich-devel openmpi openmpi-devel
+        pushd .
+        cd tests/py
+        cd dynamo
+        python -m pytest -ra --junitxml=${RUNNER_TEST_RESULTS_DIR}/dynamo_distributed_test_results.xml distributed/test_nccl_ops.py
+        popd
+
+
 concurrency:
   group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ inputs.repository }}-${{ github.event_name == 'workflow_dispatch' }}-${{ inputs.job-name }}
   cancel-in-progress: true

.github/workflows/build-test-linux-x86_64.yml

Lines changed: 33 additions & 0 deletions
@@ -340,6 +340,39 @@ jobs:
         python -m pytest -ra -n 4 --junitxml=${RUNNER_TEST_RESULTS_DIR}/tests_py_core_test_results.xml .
         popd

+  tests-py-distributed:
+    name: Test dynamo distributed [Python]
+    needs: [filter-matrix, build]
+    strategy:
+      fail-fast: false
+      matrix:
+        include:
+          - repository: pytorch/tensorrt
+            package-name: torch_tensorrt
+            pre-script: packaging/pre_build_script.sh
+            post-script: packaging/post_build_script.sh
+            smoke-test-script: packaging/smoke_test_script.sh
+    uses: ./.github/workflows/linux-test.yml
+    with:
+      job-name: tests-py-dynamo-distributed
+      repository: "pytorch/tensorrt"
+      ref: ""
+      test-infra-repository: pytorch/test-infra
+      test-infra-ref: main
+      build-matrix: ${{ needs.filter-matrix.outputs.matrix }}
+      pre-script: ${{ matrix.pre-script }}
+      script: |
+        set -euo pipefail
+        export USE_HOST_DEPS=1
+        export CI_BUILD=1
+        export USE_TRTLLM_PLUGINS=1
+        dnf install -y mpich mpich-devel openmpi openmpi-devel
+        pushd .
+        cd tests/py
+        cd dynamo
+        python -m pytest -ra --junitxml=${RUNNER_TEST_RESULTS_DIR}/dynamo_distributed_test_results.xml distributed/test_nccl_ops.py
+        popd
+
 concurrency:
   group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-tensorrt-${{ inputs.repository }}-${{ github.event_name == 'workflow_dispatch' }}-${{ inputs.job-name }}
   cancel-in-progress: true

dev_dep_versions.yml

Lines changed: 1 addition & 0 deletions
@@ -1,3 +1,4 @@
 __cuda_version__: "12.8"
 __tensorrt_version__: "10.12.0"
 __tensorrt_rtx_version__: "1.0.0"
+__tensorrt_llm_version__: "0.17.0.post1"
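
For reference, the new pin can be read like the other entries in dev_dep_versions.yml. A minimal sketch, assuming the file stays a flat key/value YAML mapping as shown above and that PyYAML is available:

# Minimal sketch: read the TensorRT-LLM version pin added above.
# Assumes dev_dep_versions.yml is a flat key/value YAML file and PyYAML is installed.
import yaml

with open("dev_dep_versions.yml") as f:
    versions = yaml.safe_load(f)

trt_llm_version = versions["__tensorrt_llm_version__"]  # "0.17.0.post1"
print(f"Pinned TensorRT-LLM version: {trt_llm_version}")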

py/torch_tensorrt/dynamo/_compiler.py

Lines changed: 9 additions & 0 deletions
@@ -103,6 +103,7 @@ def cross_compile_for_windows(
     tiling_optimization_level: str = _defaults.TILING_OPTIMIZATION_LEVEL,
     l2_limit_for_tiling: int = _defaults.L2_LIMIT_FOR_TILING,
     offload_module_to_cpu: bool = _defaults.OFFLOAD_MODULE_TO_CPU,
+    use_distributed_mode_trace: bool = _defaults.USE_DISTRIBUTED_MODE_TRACE,
     **kwargs: Any,
 ) -> torch.fx.GraphModule:
     """Compile an ExportedProgram module using TensorRT in Linux for Inference in Windows
@@ -176,6 +177,7 @@ def cross_compile_for_windows(
         enable_weight_streaming (bool): Enable weight streaming.
         tiling_optimization_level (str): The optimization level of tiling strategies. A higher level allows TensorRT to spend more time searching for better tiling strategy. We currently support ["none", "fast", "moderate", "full"].
         l2_limit_for_tiling (int): The target L2 cache usage limit (in bytes) for tiling optimization (default is -1 which means no limit).
+        use_distributed_mode_trace (bool): Using aot_autograd to trace the graph. This is enabled when DTensors or distributed tensors are present in distributed model
         **kwargs: Any,
     Returns:
         torch.fx.GraphModule: Compiled FX Module, when run it will execute via TensorRT
@@ -330,6 +332,7 @@ cross_compile_for_windows(
         "enable_weight_streaming": enable_weight_streaming,
         "tiling_optimization_level": tiling_optimization_level,
         "l2_limit_for_tiling": l2_limit_for_tiling,
+        "use_distributed_mode_trace": use_distributed_mode_trace,
     }

     # disable the following settings is not supported for cross compilation for windows feature
@@ -430,6 +433,7 @@ def compile(
     tiling_optimization_level: str = _defaults.TILING_OPTIMIZATION_LEVEL,
     l2_limit_for_tiling: int = _defaults.L2_LIMIT_FOR_TILING,
     offload_module_to_cpu: bool = _defaults.OFFLOAD_MODULE_TO_CPU,
+    use_distributed_mode_trace: bool = _defaults.USE_DISTRIBUTED_MODE_TRACE,
     **kwargs: Any,
 ) -> torch.fx.GraphModule:
     """Compile an ExportedProgram module for NVIDIA GPUs using TensorRT
@@ -506,6 +510,7 @@ def compile(
         tiling_optimization_level (str): The optimization level of tiling strategies. A higher level allows TensorRT to spend more time searching for better tiling strategy. We currently support ["none", "fast", "moderate", "full"].
         l2_limit_for_tiling (int): The target L2 cache usage limit (in bytes) for tiling optimization (default is -1 which means no limit).
         offload_module_to_cpu (bool): Offload the module to CPU. This is useful when we need to minimize GPU memory usage.
+        use_distributed_mode_trace (bool): Using aot_autograd to trace the graph. This is enabled when DTensors or distributed tensors are present in distributed model
         **kwargs: Any,
     Returns:
         torch.fx.GraphModule: Compiled FX Module, when run it will execute via TensorRT
@@ -674,6 +679,7 @@ def compile(
         "tiling_optimization_level": tiling_optimization_level,
         "l2_limit_for_tiling": l2_limit_for_tiling,
         "offload_module_to_cpu": offload_module_to_cpu,
+        "use_distributed_mode_trace": use_distributed_mode_trace,
     }

     settings = CompilationSettings(**compilation_options)
@@ -1045,6 +1051,7 @@ def convert_exported_program_to_serialized_trt_engine(
     tiling_optimization_level: str = _defaults.TILING_OPTIMIZATION_LEVEL,
     l2_limit_for_tiling: int = _defaults.L2_LIMIT_FOR_TILING,
     offload_module_to_cpu: bool = _defaults.OFFLOAD_MODULE_TO_CPU,
+    use_distributed_mode_trace: bool = _defaults.USE_DISTRIBUTED_MODE_TRACE,
     **kwargs: Any,
 ) -> bytes:
     """Convert an ExportedProgram to a serialized TensorRT engine
@@ -1118,6 +1125,7 @@ def convert_exported_program_to_serialized_trt_engine(
         tiling_optimization_level (str): The optimization level of tiling strategies. A higher level allows TensorRT to spend more time searching for better tiling strategy. We currently support ["none", "fast", "moderate", "full"].
         l2_limit_for_tiling (int): The target L2 cache usage limit (in bytes) for tiling optimization (default is -1 which means no limit).
         offload_module_to_cpu (bool): Offload the module to CPU. This is useful when we need to minimize GPU memory usage.
+        use_distributed_mode_trace (bool): Using aot_autograd to trace the graph. This is enabled when DTensors or distributed tensors are present in distributed model.
         **kwargs: Any,
     Returns:
         bytes: Serialized TensorRT engine, can either be saved to a file or deserialized via TensorRT APIs
@@ -1286,6 +1294,7 @@ def convert_exported_program_to_serialized_trt_engine(
         "tiling_optimization_level": tiling_optimization_level,
         "l2_limit_for_tiling": l2_limit_for_tiling,
         "offload_module_to_cpu": offload_module_to_cpu,
+        "use_distributed_mode_trace": use_distributed_mode_trace,
     }

     settings = CompilationSettings(**compilation_options)
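
From the user's side, the new option is just another keyword argument on the dynamo front end. A minimal usage sketch (the toy module and inputs are placeholders; only the use_distributed_mode_trace flag comes from this commit):

# Usage sketch for the new flag; the toy module and inputs are placeholders.
import torch
import torch_tensorrt


class Toy(torch.nn.Module):
    def forward(self, x):
        return torch.relu(x)


model = Toy().eval().cuda()
inputs = [torch.randn(2, 4, device="cuda")]
exp_program = torch.export.export(model, tuple(inputs))

# Enable the aot_autograd-based trace when the model carries DTensors /
# distributed tensors (see the docstring added above).
trt_module = torch_tensorrt.dynamo.compile(
    exp_program,
    inputs=inputs,
    use_distributed_mode_trace=True,
)

The same keyword is accepted by cross_compile_for_windows and convert_exported_program_to_serialized_trt_engine, as the hunks above show.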

py/torch_tensorrt/dynamo/conversion/converter_utils.py

Lines changed: 0 additions & 65 deletions
@@ -1,8 +1,6 @@
 import collections
-import ctypes
 import functools
 import logging
-import os
 from typing import (
     Any,
     Callable,
@@ -1124,69 +1122,6 @@ def args_bounds_check(
     return args[i] if len(args) > i and args[i] is not None else replacement


-def load_tensorrt_llm() -> bool:
-    """
-    Attempts to load the TensorRT-LLM plugin and initialize it.
-
-    Returns:
-        bool: True if the plugin was successfully loaded and initialized, False otherwise.
-    """
-    try:
-        import tensorrt_llm as trt_llm  # noqa: F401
-
-        _LOGGER.info("TensorRT-LLM successfully imported")
-        return True
-    except (ImportError, AssertionError) as e_import_error:
-        # Check for environment variable for the plugin library path
-        plugin_lib_path = os.environ.get("TRTLLM_PLUGINS_PATH")
-        if not plugin_lib_path:
-            _LOGGER.warning(
-                "TensorRT-LLM is not installed. Please install TensorRT-LLM or set TRTLLM_PLUGINS_PATH to the directory containing libnvinfer_plugin_tensorrt_llm.so to use converters for torch.distributed ops",
-            )
-            return False
-
-        _LOGGER.info(f"TensorRT-LLM Plugin lib path found: {plugin_lib_path}")
-        try:
-            # Load the shared library
-            handle = ctypes.CDLL(plugin_lib_path)
-            _LOGGER.info(f"Successfully loaded plugin library: {plugin_lib_path}")
-        except OSError as e_os_error:
-            _LOGGER.error(
-                f"Failed to load libnvinfer_plugin_tensorrt_llm.so from {plugin_lib_path}"
-                f"Ensure the path is correct and the library is compatible",
-                exc_info=e_os_error,
-            )
-            return False
-
-        try:
-            # Configure plugin initialization arguments
-            handle.initTrtLlmPlugins.argtypes = [ctypes.c_void_p, ctypes.c_char_p]
-            handle.initTrtLlmPlugins.restype = ctypes.c_bool
-        except AttributeError as e_plugin_unavailable:
-            _LOGGER.warning(
-                "Unable to initialize the TensorRT-LLM plugin library",
-                exc_info=e_plugin_unavailable,
-            )
-            return False
-
-        try:
-            # Initialize the plugin
-            TRT_LLM_PLUGIN_NAMESPACE = "tensorrt_llm"
-            if handle.initTrtLlmPlugins(None, TRT_LLM_PLUGIN_NAMESPACE.encode("utf-8")):
-                _LOGGER.info("TensorRT-LLM plugin successfully initialized")
-                return True
-            else:
-                _LOGGER.warning("TensorRT-LLM plugin library failed in initialization")
-                return False
-        except Exception as e_initialization_error:
-            _LOGGER.warning(
-                "Exception occurred during TensorRT-LLM plugin library initialization",
-                exc_info=e_initialization_error,
-            )
-            return False
-    return False
-
-
 def promote_trt_tensors_to_same_dtype(
     ctx: ConversionContext, lhs: TRTTensor, rhs: TRTTensor, name_prefix: str
 ) -> tuple[TRTTensor, TRTTensor]:

py/torch_tensorrt/dynamo/conversion/custom_ops_converters.py

Lines changed: 6 additions & 6 deletions
@@ -11,15 +11,15 @@
 from torch_tensorrt.dynamo.conversion._ConverterRegistry import (
     dynamo_tensorrt_converter,
 )
-from torch_tensorrt.dynamo.conversion.converter_utils import load_tensorrt_llm
+from torch_tensorrt.dynamo.lowering.passes.fuse_distributed_ops import (
+    tensorrt_fused_nccl_all_gather_op,
+    tensorrt_fused_nccl_reduce_scatter_op,
+)
+from torch_tensorrt.dynamo.utils import load_tensorrt_llm_for_nccl

 _LOGGER: logging.Logger = logging.getLogger(__name__)

-if load_tensorrt_llm():
-    from torch_tensorrt.dynamo.lowering.passes.fuse_distributed_ops import (
-        tensorrt_fused_nccl_all_gather_op,
-        tensorrt_fused_nccl_reduce_scatter_op,
-    )
+if load_tensorrt_llm_for_nccl():

     @dynamo_tensorrt_converter(tensorrt_fused_nccl_all_gather_op)
     def fused_nccl_gather(
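
Converter registration for the fused NCCL ops is now gated on load_tensorrt_llm_for_nccl from torch_tensorrt.dynamo.utils. Below is a rough sketch of exercising that gate, assuming the relocated loader keeps the TRTLLM_PLUGINS_PATH fallback of the load_tensorrt_llm function removed from converter_utils.py; the plugin path is a placeholder.

# Sketch: point the loader at a local TensorRT-LLM plugin library before the
# converters are imported. The path is a placeholder; the TRTLLM_PLUGINS_PATH
# fallback is assumed to match the removed converter_utils implementation.
import os

os.environ.setdefault(
    "TRTLLM_PLUGINS_PATH", "/path/to/libnvinfer_plugin_tensorrt_llm.so"
)

from torch_tensorrt.dynamo.utils import load_tensorrt_llm_for_nccl

if load_tensorrt_llm_for_nccl():
    # The module performs the same check at import time and registers the
    # fused NCCL all_gather / reduce_scatter converters shown in this diff.
    import torch_tensorrt.dynamo.conversion.custom_ops_converters  # noqa: F401
else:
    print("TensorRT-LLM NCCL plugins unavailable; NCCL converters not registered.")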
