diff --git a/docker/Makefile b/docker/Makefile
index a5c1e8a278c..4265b114b09 100644
--- a/docker/Makefile
+++ b/docker/Makefile
@@ -45,6 +45,7 @@ PYTHON_VERSION ?=
 NGC_STAGING_REPO ?= nvcr.io/nvstaging/tensorrt-llm
 NGC_REPO ?= nvcr.io/nvidia/tensorrt-llm
 NGC_USE_STAGING ?= 0
+NGC_AUTO_REPO ?= $(if $(filter 1,$(NGC_USE_STAGING)),$(NGC_STAGING_REPO),$(NGC_REPO))
 
 define add_local_user
 	docker build \
@@ -202,7 +203,8 @@ ngc-devel_%: IMAGE_TAG = $(TRT_LLM_VERSION)
 ngc-devel_push: DOCKER_BUILD_ARGS = --push
 ngc-devel_push: ngc-devel_build ;
 
-ngc-devel_run: IMAGE_NAME = $(if $(filter 1,$(NGC_USE_STAGING)),$(NGC_STAGING_REPO),$(NGC_REPO))
+ngc-devel_run: IMAGE_NAME = $(NGC_AUTO_REPO)
+ngc-devel_pull: IMAGE_NAME = $(NGC_AUTO_REPO)
 
 ngc-release_%: STAGE = release
 ngc-release_%: DOCKER_BUILD_OPTS = --pull --load --platform linux/$(PLATFORM)
@@ -210,8 +212,11 @@ ngc-release_%: DEVEL_IMAGE = $(NGC_STAGING_REPO)/devel:$(TRT_LLM_VERSION)
 ngc-release_%: IMAGE_NAME = $(NGC_STAGING_REPO)
 ngc-release_%: IMAGE_TAG = $(TRT_LLM_VERSION)-$(PLATFORM)
 
-ngc-release_run: IMAGE_NAME = $(if $(filter 1,$(NGC_USE_STAGING)),$(NGC_STAGING_REPO),$(NGC_REPO))
-ngc-release_run: WORK_DIR = /app/tensorrt_llm
+ngc-release_run: WORK_DIR = /app/tensorrt_llm
+ngc-release_run: IMAGE_NAME = $(NGC_AUTO_REPO)
+ngc-release_run: IMAGE_TAG = $(TRT_LLM_VERSION)
+ngc-release_pull: IMAGE_NAME = $(NGC_AUTO_REPO)
+ngc-release_pull: IMAGE_TAG = $(TRT_LLM_VERSION)
 
 ngc-manifest_%: STAGE = release
 ngc-manifest_%: IMAGE_NAME = $(NGC_STAGING_REPO)
diff --git a/docker/develop.md b/docker/develop.md
new file mode 100644
index 00000000000..73030cd11f1
--- /dev/null
+++ b/docker/develop.md
@@ -0,0 +1,92 @@
# Description

TensorRT-LLM provides users with an easy-to-use Python API to define Large Language Models (LLMs) and support
state-of-the-art optimizations to perform inference efficiently on NVIDIA GPUs. TensorRT-LLM also contains components to
create Python and C++ runtimes that orchestrate the inference execution in a performant way.

# Overview

## TensorRT-LLM Develop Container

The TensorRT-LLM Develop container includes all necessary dependencies to build TensorRT-LLM from source. It is
specifically designed to be used alongside the source code cloned from the official TensorRT-LLM repository:

[GitHub Repository - NVIDIA TensorRT-LLM](https://github.com/NVIDIA/TensorRT-LLM)

Full instructions for cloning the TensorRT-LLM repository can be found in
the [TensorRT-LLM Documentation](https://nvidia.github.io/TensorRT-LLM/installation/build-from-source-linux.html).

### Running TensorRT-LLM Using Docker

With the top-level directory of the TensorRT-LLM repository cloned to your local machine, you can run the following
command to start the development container:

```bash
make -C docker ngc-devel_run LOCAL_USER=1 DOCKER_PULL=1 IMAGE_TAG=x.xx.x
```

where `x.xx.x` is the version of the TensorRT-LLM container to use. This command pulls the specified container from the
NVIDIA NGC registry, sets up the local user's account within the container, and launches it with full GPU support. The
local source code of TensorRT-LLM will be mounted inside the container at the path `/code/tensorrt_llm` for seamless
integration. Ensure that the image version matches the version of TensorRT-LLM in your current local git branch. If
`IMAGE_TAG` is not specified, the build system will attempt to resolve the version automatically, but not every
intermediate release is accompanied by a development container. In that case, use the latest version preceding the
version of your development branch.
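If you want to pin the tag explicitly instead of relying on automatic resolution, you can look up the version recorded
in your checkout first. The snippet below is a minimal sketch; it assumes the version string is defined as
`__version__` in `tensorrt_llm/version.py`, so adjust the path if your branch keeps it elsewhere.

```bash
# Minimal sketch: derive the image tag from the version recorded in the local
# checkout (assumes tensorrt_llm/version.py contains a line __version__ = "x.xx.x").
TRT_LLM_VERSION=$(sed -n 's/^__version__ = "\(.*\)"/\1/p' tensorrt_llm/version.py)

# Start the development container pinned to that version.
make -C docker ngc-devel_run LOCAL_USER=1 DOCKER_PULL=1 IMAGE_TAG="${TRT_LLM_VERSION}"
```

If the resulting tag is not published on NGC (for example, for an intermediate development version), fall back to the
latest available tag preceding it, as described above.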
If you prefer launching the container directly with `docker`, you can use the following command:

```bash
docker run --rm -it --ipc=host --ulimit memlock=-1 --ulimit stack=67108864 \
    --gpus=all \
    --env "CCACHE_DIR=/code/tensorrt_llm/cpp/.ccache" \
    --env "CCACHE_BASEDIR=/code/tensorrt_llm" \
    --env "CONAN_HOME=/code/tensorrt_llm/cpp/.conan" \
    --workdir /code/tensorrt_llm \
    --tmpfs /tmp:exec \
    --volume .:/code/tensorrt_llm \
    nvcr.io/nvidia/tensorrt-llm/devel:x.xx.x
```

Note that this will start the container as the user `root`, which may leave files with root ownership in your local
checkout.

### Building the TensorRT-LLM Wheel within the Container

You can build the TensorRT-LLM Python wheel inside the development container using the following command:

```bash
./scripts/build_wheel.py --clean --use_ccache --cuda_architectures=native
```

#### Explanation of Build Flags

- `--clean`: Clears intermediate build artifacts from prior builds to ensure a fresh compilation.
- `--use_ccache`: Enables `ccache` to accelerate subsequent builds by caching compilation results.
- `--cuda_architectures=native`: Configures the build for the native architecture of your GPU. Omit this flag to build
  the wheel for all supported architectures. For additional details, refer to
  the [CUDA Architectures Documentation](https://cmake.org/cmake/help/latest/prop_tgt/CUDA_ARCHITECTURES.html#prop_tgt:CUDA_ARCHITECTURES).

For additional build options and their usage, refer to the help documentation by running:

```bash
./scripts/build_wheel.py --help
```

The wheel will be built in the `build` directory and can be installed using `pip install` like so:

```bash
pip install ./build/tensorrt_llm*.whl
```

For additional information on building the TensorRT-LLM wheel, refer to
the [official documentation on building from source](https://nvidia.github.io/TensorRT-LLM/installation/build-from-source-linux.html#option-1-full-build-with-c-compilation).

### Security CVEs

To review known CVEs on this image, refer to the Security Scanning tab on this page.

### License

By pulling and using the container, you accept the terms and conditions of
this [End User License Agreement](https://www.nvidia.com/en-us/agreements/enterprise-software/nvidia-software-license-agreement/)
and [Product-Specific Terms](https://www.nvidia.com/en-us/agreements/enterprise-software/product-specific-terms-for-ai-products/).