16 changes: 13 additions & 3 deletions docker/Makefile
@@ -5,12 +5,19 @@ BASE_TAG ?= $(shell grep '^ARG BASE_TAG=' Dockerfile.multi | grep -o '
IMAGE_NAME ?= tensorrt_llm
IMAGE_TAG ?= latest

# Used to share .cache when LOCAL_USER=1. Possibility of override is
# helpful, e.g., for use with Docker rootless mode.
HOME_DIR ?= $(HOME)

# Local user information
USER_ID ?= $(shell id --user)
USER_NAME ?= $(shell id --user --name)
GROUP_ID ?= $(shell id --group)
GROUP_NAME ?= $(shell id --group --name)

# Try to detect Docker rootless mode
IS_ROOTLESS ?= $(shell if [ "$$(docker context inspect --format '{{.Endpoints.docker.Host}}' "$$(docker context show)")" = "unix:///run/user/$(USER_ID)/docker.sock" ]; then echo 1; else echo 0; fi)

# Set this to 1 to add the current user to the docker image and run the container with the user
LOCAL_USER ?= 0
ifeq ($(LOCAL_USER),1)
@@ -108,7 +115,7 @@ endef
@echo "Pulling docker image: $(IMAGE_WITH_TAG)"
docker pull $(IMAGE_WITH_TAG)

DOCKER_RUN_OPTS ?= --rm -it --ipc=host --ulimit memlock=-1 --ulimit stack=67108864
DOCKER_RUN_OPTS ?= --rm -it --ipc=host --ulimit stack=67108864 $(if $(filter 0,$(IS_ROOTLESS)),--ulimit memlock=-1)
DOCKER_RUN_ARGS ?=
# Check if NVIDIA_VISIBLE_DEVICES is set and not empty
NVIDIA_VISIBLE_DEVICES_VAL = $(shell echo $$NVIDIA_VISIBLE_DEVICES)
@@ -129,6 +136,9 @@ WORK_DIR ?= $(CODE_DIR)
DOCKER_PULL ?= 0

%_run:
ifeq ($(IS_ROOTLESS),1)
@echo "Assuming Docker rootless mode."
endif
ifeq ($(DOCKER_PULL),1)
@$(MAKE) --no-print-directory $*_pull
endif
@@ -138,7 +148,7 @@ endif
docker run $(DOCKER_RUN_OPTS) $(DOCKER_RUN_ARGS) \
$(GPU_OPTS) \
--volume $(SOURCE_DIR):$(CODE_DIR) \
$(if $(filter 1,$(LOCAL_USER)),--volume ${HOME}/.cache:/home/${USER_NAME}/.cache:rw) \
$(if $(filter 1,$(LOCAL_USER)),--volume ${HOME_DIR}/.cache:/home/${USER_NAME}/.cache:rw) \
--env "CCACHE_DIR=${CCACHE_DIR}" \
--env "CCACHE_BASEDIR=${CODE_DIR}" \
--env "CONAN_HOME=${CONAN_DIR}" \
@@ -215,7 +225,7 @@ ngc-release_%: IMAGE_TAG = $(TRT_LLM_VERSION)-$(PLATFORM)

ngc-release_run: WORK_DIR = /app/tensorrt_llm
ngc-release_run: IMAGE_NAME = $(NGC_AUTO_REPO)
ngc-release_run: IMAGE_TAG = $(TRT_LLM_VERSION)
ngc-release_run: IMAGE_TAG = $(TRT_LLM_VERSION)
ngc-release_pull: IMAGE_NAME = $(NGC_AUTO_REPO)
ngc-release_pull: IMAGE_TAG = $(TRT_LLM_VERSION)

44 changes: 38 additions & 6 deletions docker/README.md
@@ -44,12 +44,17 @@ Containers can be started with the local user instead of `root` by appending `LO
make -C docker devel_run LOCAL_USER=1
```

Specific CUDA architectures supported by the `wheel` can be specified WITH `CUDA_ARCHS`:
Specific CUDA architectures supported by the `wheel` can be specified with `CUDA_ARCHS`:

```bash
make -C docker release_build CUDA_ARCHS="80-real;90-real"
```

The `run` action maps the locally checked out source code into the `/code/tensorrt_llm` directory within the container.

The `DOCKER_RUN_ARGS` option can be used to pass additional options to Docker,
e.g., in order to mount additional volumes into the container.
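
For instance, a minimal sketch of mounting one extra volume through `DOCKER_RUN_ARGS` (the host path below is purely illustrative):

```bash
# Illustrative host path; adjust to your environment
make -C docker devel_run LOCAL_USER=1 DOCKER_RUN_ARGS="--volume /path/to/models:/models"
```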

For more build options, see the variables defined in [`Makefile`](Makefile).

### NGC Integration
@@ -62,8 +67,7 @@ make -C docker ngc-devel_run LOCAL_USER=1 DOCKER_PULL=1
```

As before, specifying `LOCAL_USER=1` will run the container with the local user's identity. Specifying `DOCKER_PULL=1`
is optional, but it will pull the latest image from the NGC Catalog. This will map the source code into the container
in the directory `/code/tensorrt_llm`.
is optional, but it will pull the latest image from the NGC Catalog.

We also provide an image with pre-installed binaries for release. This can be used like so:

@@ -72,7 +76,15 @@ make -C docker ngc-release_run LOCAL_USER=1 DOCKER_PULL=1
```

If you want to deploy a specific version of TensorRT-LLM, you can specify the version with
`TRT_LLM_VERSION=<version_tag>`. The application examples and benchmarks are installed in `/app/tensorrt_llm`.
`IMAGE_TAG=<version_tag>` (cf. [release history on GitHub](https://github.com/NVIDIA/TensorRT-LLM/releases) and [tags in NGC Catalog](https://catalog.ngc.nvidia.com/orgs/nvidia/teams/tensorrt-llm/containers/release/tags)). The application examples and benchmarks are installed
in `/app/tensorrt_llm`.
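
As a sketch, pinning the release container to a specific version could look as follows (`x.y.z` is a placeholder for an actual release tag):

```bash
# Replace x.y.z with a tag from the GitHub releases / NGC Catalog linked above
make -C docker ngc-release_run LOCAL_USER=1 DOCKER_PULL=1 IMAGE_TAG=x.y.z
```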

See the description of the `<stage>_run` make target in
[Building and Running Options](#building-and-running-options) for additional information and
running options.

If you cannot access the NGC container images, you can instead locally build and use
equivalent containers as [described above](#building-docker-images-with-gnu-make).

### Jenkins Integration

@@ -91,13 +103,21 @@ Start a new container using the same image as Jenkins using your local user acco
make -C docker jenkins_run LOCAL_USER=1
```

If you do not have access to the [internal artifact repository](https://urm.nvidia.com/artifactory/sw-tensorrt-docker/tensorrt-llm/), you can instead either use the [NGC Develop
image](#ngc-integration) or [build an image locally](#building-docker-images-with-gnu-make).

#### Release images based on Jenkins image

One may also build a release image based on the Jenkins development image:

```bash
make -C docker trtllm_build CUDA_ARCHS="80-real;90-real"
```

These images can be pushed to
Note that the above requires access to the Jenkins development image from the
[internal artifact repository](https://urm.nvidia.com/artifactory/sw-tensorrt-docker/tensorrt-llm/).

The resulting images can be pushed to
the [internal artifact repository](https://urm.nvidia.com/artifactory/sw-tensorrt-docker/tensorrt-llm-staging/release/):

```bash
@@ -112,4 +132,16 @@ make -C docker trtllm_run LOCAL_USER=1 DOCKER_PULL=1
```

The argument `DOCKER_PULL=1` instructs `make` to pull the latest version of the image before deploying it in the container.
By default, images are tagged by their `git` branch name and may be frequently updated.
By default, the release images built in the above manner are tagged by their `git` branch name and may be frequently updated.

### Docker rootless

Some aspects require special treatment when using [Docker rootless mode](https://docs.docker.com/engine/security/rootless/). The `docker/Makefile` contains heuristics to detect Docker rootless mode; when rootless mode is assumed, the `%_run` targets in `docker/Makefile` print a corresponding message. The detection can be overridden by specifying `IS_ROOTLESS=0` or `IS_ROOTLESS=1`.

Docker rootless mode remaps UIDs and GIDs, and the remapped IDs (typically configured in `/etc/subuid` and `/etc/subgid`) generally do not coincide with the local UID/GID. To smoothly share a local working directory with containers started with `LOCAL_USER=1`, both IDs therefore need to be translated using a tool like `bindfs`. In that case, set the `SOURCE_DIR` and `HOME_DIR` Makefile variables to the locations of the translated versions of the TensorRT-LLM working copy and the user home directory, respectively, as sketched below.
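
A rough sketch of such an invocation is shown below; the translated paths are assumptions and depend on your `bindfs` setup:

```bash
# Paths are illustrative: point them at the bindfs-translated locations
make -C docker devel_run LOCAL_USER=1 \
    SOURCE_DIR=/path/to/translated/TensorRT-LLM \
    HOME_DIR=/path/to/translated/home
```
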
16 changes: 8 additions & 8 deletions docker/develop.md
@@ -1,8 +1,8 @@
# Description

TensorRT-LLM provides users with an easy-to-use Python API to define Large Language Models (LLMs) and support
TensorRT-LLM provides users with an easy-to-use Python API to define Large Language Models (LLMs) and supports
state-of-the-art optimizations to perform inference efficiently on NVIDIA GPUs. TensorRT-LLM also contains components to
create Python and C++ runtimes that orchestrate the inference execution in performant way.
create Python and C++ runtimes that orchestrate the inference execution in a performant way.

# Overview

@@ -22,15 +22,15 @@ With the top-level directory of the TensorRT-LLM repository cloned to your local
command to start the development container:

```bash
make -C docker ngc-devel_run LOCAL_USER=1 DOCKER_PULL=1 IMAGE_TAG=x.xx.x
make -C docker ngc-devel_run LOCAL_USER=1 DOCKER_PULL=1 IMAGE_TAG=x.y.z
```

where `x.xx.x` is the version of the TensorRT-LLM container to use. This command pulls the specified container from the
where `x.y.z` is the version of the TensorRT-LLM container to use (cf. [release history on GitHub](https://github.com/NVIDIA/TensorRT-LLM/releases) and [tags in NGC Catalog](https://catalog.ngc.nvidia.com/orgs/nvidia/teams/tensorrt-llm/containers/devel/tags)). This command pulls the specified container from the
NVIDIA NGC registry, sets up the local user's account within the container, and launches it with full GPU support. The
local source code of TensorRT-LLM will be mounted inside the container at the path `/code/tensorrt_llm` for seamless
integration. Ensure that the image version matches the version of TensorRT-LLM in your current local git branch. Not
specifying an `IMAGE_TAG` will attempt to resolve this automatically, but not every intermediate release might be
accompanied by development container. In that case, use the latest version preceding the version of your development
integration. Ensure that the image version matches the version of TensorRT-LLM in your currently checked out local git branch. Not
specifying an `IMAGE_TAG` will attempt to resolve this automatically, but not every intermediate release might be
accompanied by a development container. In that case, use the latest version preceding the version of your development
branch.
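
For example, letting the tag resolve automatically from the checked-out sources reduces to the default invocation shown earlier:

```bash
make -C docker ngc-devel_run LOCAL_USER=1 DOCKER_PULL=1
```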

If you prefer launching the container directly with `docker`, you can use the following command:
@@ -44,7 +44,7 @@ docker run --rm -it --ipc=host --ulimit memlock=-1 --ulimit stack=67108864 \
--workdir /code/tensorrt_llm \
--tmpfs /tmp:exec \
--volume .:/code/tensorrt_llm \
nvcr.io/nvidia/tensorrt-llm/devel:x.xx.x
nvcr.io/nvidia/tensorrt-llm/devel:x.y.z
```

Note that this will start the container with the user `root`, which may leave files with root ownership in your local
10 changes: 5 additions & 5 deletions docker/release.md
@@ -1,8 +1,8 @@
# Description

TensorRT-LLM provides users with an easy-to-use Python API to define Large Language Models (LLMs) and support
TensorRT-LLM provides users with an easy-to-use Python API to define Large Language Models (LLMs) and supports
state-of-the-art optimizations to perform inference efficiently on NVIDIA GPUs. TensorRT-LLM also contains components to
create Python and C++ runtimes that orchestrate the inference execution in performant way.
create Python and C++ runtimes that orchestrate the inference execution in a performant way.

# Overview

@@ -18,10 +18,10 @@ A typical command to launch the container is:

```bash
docker run --rm -it --ipc=host --ulimit memlock=-1 --ulimit stack=67108864 --gpus=all \
nvcr.io/nvidia/tensorrt-llm/release:x.xx.x
nvcr.io/nvidia/tensorrt-llm/release:x.y.z
```

where x.xx.x is the version of the TensorRT-LLM container to use. To sanity check, run the following command:
where x.y.z is the version of the TensorRT-LLM container to use (cf. [release history on GitHub](https://github.com/NVIDIA/TensorRT-LLM/releases) and [tags in NGC Catalog](https://catalog.ngc.nvidia.com/orgs/nvidia/teams/tensorrt-llm/containers/release/tags)). To sanity check, run the following command:

```bash
python3 -c "import tensorrt_llm"
@@ -34,7 +34,7 @@ Alternatively, if you have already cloned the TensorRT-LLM repository, you can u
run the container:

```bash
make -C docker ngc-release_run LOCAL_USER=1 DOCKER_PULL=1 IMAGE_TAG=x.xx.x
make -C docker ngc-release_run LOCAL_USER=1 DOCKER_PULL=1 IMAGE_TAG=x.y.z
```

This command pulls the specified container from the NVIDIA NGC registry, sets up the local user's account within the
1 change: 1 addition & 0 deletions docs/requirements.txt
@@ -7,3 +7,4 @@ breathe
pygit2
sphinx_copybutton
autodoc_pydantic
sphinx-togglebutton
25 changes: 24 additions & 1 deletion docs/source/conf.py
@@ -56,7 +56,8 @@
'sphinxarg.ext',
'sphinx_click',
'sphinx_copybutton',
'sphinxcontrib.autodoc_pydantic'
'sphinxcontrib.autodoc_pydantic',
'sphinx_togglebutton',
]

autodoc_pydantic_model_show_json = True
@@ -77,8 +78,30 @@

myst_enable_extensions = [
"deflist",
"substitution",
]

myst_substitutions = {
"version":
version,
"version_quote":
f"`{version}`",
"container_tag_admonition":
r"""
```{admonition} Container image tags
:class: dropdown note
In the example shell commands, `x.y.z` corresponds to the TensorRT-LLM container
version to use. If omitted, `IMAGE_TAG` will default to `tensorrt_llm.__version__`
(e.g., this documentation was generated from the {{version_quote}} source tree).
If this does not work, e.g., because a container for the version you are
currently working with has not been released yet, you can try using a
container published for a previous
[GitHub pre-release or release](https://github.com/NVIDIA/TensorRT-LLM/releases)
(see also [NGC Catalog](https://catalog.ngc.nvidia.com/orgs/nvidia/teams/tensorrt-llm/containers/release/tags)).
```
""",
}

autosummary_generate = True
copybutton_exclude = '.linenos, .gp, .go'
copybutton_prompt_text = ">>> |$ |# "
2 changes: 1 addition & 1 deletion docs/source/index.rst
@@ -25,9 +25,9 @@ Welcome to TensorRT-LLM's Documentation!

.. installation/overview.md

installation/containers.md
installation/linux.md
installation/build-from-source-linux.md
installation/grace-hopper.md


.. toctree::
23 changes: 20 additions & 3 deletions docs/source/installation/build-from-source-linux.md
@@ -9,6 +9,8 @@ This document provides instructions for building TensorRT-LLM from source code o

Use [Docker](https://www.docker.com) to build and run TensorRT-LLM. Instructions to install an environment to run Docker containers for the NVIDIA platform can be found [here](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html).

If you intend to build any TensorRT-LLM artifacts, such as any of the container images (note that pre-built [develop](#build-from-source-tip-develop-container) and [release](#build-from-source-tip-release-container) container images are available on NGC), or the TensorRT-LLM Python wheel, you first need to clone the TensorRT-LLM repository:

```bash
# TensorRT-LLM uses git-lfs, which needs to be installed in advance.
apt-get update && apt-get -y install git git-lfs
@@ -26,6 +28,11 @@ There are two options to create a TensorRT-LLM Docker image. The approximate dis

### Option 1: Build TensorRT-LLM in One Step

```{tip}
:name: build-from-source-tip-release-container
If you just want to run TensorRT-LLM, you can instead [use the pre-built TensorRT-LLM Release container images](containers).
```

TensorRT-LLM contains a simple command to create a Docker image. Note that if you plan to develop on TensorRT-LLM, we recommend using [Option 2: Build TensorRT-LLM Step-By-Step](#option-2-build-tensorrt-llm-step-by-step).

```bash
@@ -49,11 +56,16 @@ The `make` command supports the `LOCAL_USER=1` argument to switch to the local u

Since TensorRT-LLM has been built and installed, you can skip the remaining steps.

### Option 2: Build TensorRT-LLM Step-by-Step
### Option 2: Container for building TensorRT-LLM Step-by-Step

If you are looking for more flexibility, TensorRT-LLM has commands to create and run a development container in which TensorRT-LLM can be built.

#### Create the Container
```{tip}
:name: build-from-source-tip-develop-container
As an alternative to building the container image following the instructions below,
you can pull a pre-built [TensorRT-LLM Develop container image](https://catalog.ngc.nvidia.com/orgs/nvidia/teams/tensorrt-llm/containers/devel) from NGC (see [here](containers) for information on container tags).
Follow the linked catalog entry to enter a new container based on the pre-built container image, with the TensorRT-LLM source repository mounted into it. You can then skip this section and continue straight to [building TensorRT-LLM](#build-tensorrt-llm).
```

**On systems with GNU `make`**

@@ -100,6 +112,11 @@ If you are looking for more flexibility, TensorRT-LLM has commands to create and

Once inside the container, follow the next steps to build TensorRT-LLM from source.

### Advanced Topics

For more information on building and running various TensorRT-LLM container images,
check <https://github.com/NVIDIA/TensorRT-LLM/tree/main/docker>.

## Build TensorRT-LLM

### Option 1: Full Build with C++ Compilation
@@ -207,7 +224,7 @@ Alternatively, you can use editable installation for convenience during Python d
TRTLLM_USE_PRECOMPILED=1 pip install -e .
```

Setting `TRTLLM_USE_PRECOMPILED=1` enables downloading a prebuilt wheel of the version specified in `tensorrt_llm/version.py`, extracting compiled libraries into your current directory, thus skipping C++ compilation.
Setting `TRTLLM_USE_PRECOMPILED=1` enables downloading a prebuilt wheel of the version specified in `tensorrt_llm/version.py`, extracting compiled libraries into your current directory, thus skipping C++ compilation. This version can be overridden by specifying `TRTLLM_USE_PRECOMPILED=x.y.z`.
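
For instance, a sketch of overriding the precompiled version (the version shown is a placeholder):

```bash
# x.y.z is a placeholder; use a published TensorRT-LLM release version
TRTLLM_USE_PRECOMPILED=x.y.z pip install -e .
```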

You can specify a custom URL or local path for downloading using `TRTLLM_PRECOMPILED_LOCATION`. For example, to use version 0.16.0 from PyPI:

10 changes: 10 additions & 0 deletions docs/source/installation/containers.md
@@ -0,0 +1,10 @@
# Pre-built release container images on NGC

Pre-built TensorRT-LLM releases are made available as container images
on NGC. This is likely the simplest way to obtain TensorRT-LLM. Please refer to the [documentation in NGC](https://catalog.ngc.nvidia.com/orgs/nvidia/teams/tensorrt-llm/containers/release) for usage instructions.

{{container_tag_admonition}}
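
As a quick sketch, launching the release image typically looks like the following (mirroring the command documented in `docker/release.md`; `x.y.z` is a placeholder tag):

```bash
docker run --rm -it --ipc=host --ulimit memlock=-1 --ulimit stack=67108864 --gpus=all \
    nvcr.io/nvidia/tensorrt-llm/release:x.y.z
```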

Containers can also be built locally; see
<https://github.com/NVIDIA/TensorRT-LLM/tree/main/docker>
for all related options.
20 changes: 0 additions & 20 deletions docs/source/installation/grace-hopper.md

This file was deleted.
