12 changes: 6 additions & 6 deletions .buildkite/release-pipeline.yaml
@@ -3,7 +3,7 @@ steps:
agents:
queue: cpu_queue_postmerge
commands:
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.4.0 --tag vllm-ci:build-image --target build --progress plain ."
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.4.0 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
- "mkdir artifacts"
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
- "bash .buildkite/upload-wheels.sh"
@@ -14,7 +14,7 @@ steps:
agents:
queue: cpu_queue_postmerge
commands:
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.1.0 --tag vllm-ci:build-image --target build --progress plain ."
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.1.0 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
- "mkdir artifacts"
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
- "bash .buildkite/upload-wheels.sh"
@@ -31,7 +31,7 @@ steps:
agents:
queue: cpu_queue_postmerge
commands:
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=11.8.0 --tag vllm-ci:build-image --target build --progress plain ."
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=11.8.0 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
- "mkdir artifacts"
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
- "bash .buildkite/upload-wheels.sh"
@@ -48,7 +48,7 @@ steps:
queue: cpu_queue_postmerge
commands:
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.4.0 --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT --target vllm-openai --progress plain ."
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.4.0 --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT --target vllm-openai --progress plain -f docker/Dockerfile ."
- "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT"

- label: "Build and publish TPU release image"
@@ -57,7 +57,7 @@
agents:
queue: tpu_queue_postmerge
commands:
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --tag vllm/vllm-tpu:nightly --tag vllm/vllm-tpu:$BUILDKITE_COMMIT --progress plain -f Dockerfile.tpu ."
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --tag vllm/vllm-tpu:nightly --tag vllm/vllm-tpu:$BUILDKITE_COMMIT --progress plain -f docker/Dockerfile.tpu ."
- "docker push vllm/vllm-tpu:nightly"
- "docker push vllm/vllm-tpu:$BUILDKITE_COMMIT"
plugins:
@@ -82,7 +82,7 @@ steps:
queue: cpu_queue_postmerge
commands:
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version) --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:latest --progress plain --target vllm-openai -f Dockerfile.cpu ."
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version) --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:latest --progress plain --target vllm-openai -f docker/Dockerfile.cpu ."
- "docker push public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version)"
env:
DOCKER_BUILDKIT: "1"
2 changes: 1 addition & 1 deletion .buildkite/run-cpu-test-ppc64le.sh
@@ -10,5 +10,5 @@ trap remove_docker_container EXIT
remove_docker_container

# Try building the docker image
- docker build -t cpu-test -f Dockerfile.ppc64le .
+ docker build -t cpu-test -f docker/Dockerfile.ppc64le .

4 changes: 2 additions & 2 deletions .buildkite/run-cpu-test.sh
@@ -18,8 +18,8 @@ trap remove_docker_container EXIT
remove_docker_container

# Try building the docker image
- numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --tag cpu-test-"$BUILDKITE_BUILD_NUMBER" --target vllm-test -f Dockerfile.cpu .
- numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" --tag cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2 --target vllm-test -f Dockerfile.cpu .
+ numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --tag cpu-test-"$BUILDKITE_BUILD_NUMBER" --target vllm-test -f docker/Dockerfile.cpu .
+ numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" --tag cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2 --target vllm-test -f docker/Dockerfile.cpu .

# Run the image, setting --shm-size=4g for tensor parallel.
docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --cpuset-cpus="$CORE_RANGE" \
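The two images built above target AVX512 and AVX2 hosts respectively, so whether `VLLM_CPU_DISABLE_AVX512` is needed depends on the CPU. A quick Linux-only check (a sketch, not part of the script):

```bash
# List AVX512 feature flags advertised by the host CPU; empty output
# means the AVX2 image (VLLM_CPU_DISABLE_AVX512="true") is the right choice.
grep -o 'avx512[a-z]*' /proc/cpuinfo | sort -u
```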
1 change: 1 addition & 0 deletions .buildkite/run-gh200-test.sh
@@ -9,6 +9,7 @@ python3 use_existing_torch.py

# Try building the docker image
DOCKER_BUILDKIT=1 docker build . \
+ --file docker/Dockerfile \
--target vllm-openai \
--platform "linux/arm64" \
-t gh200-test \
2 changes: 1 addition & 1 deletion .buildkite/run-hpu-test.sh
@@ -5,7 +5,7 @@
set -ex

# Try building the docker image
- docker build -t hpu-test-env -f Dockerfile.hpu .
+ docker build -t hpu-test-env -f docker/Dockerfile.hpu .

# Setup cleanup
# certain versions of HPU software stack have a bug that can
2 changes: 1 addition & 1 deletion .buildkite/run-neuron-test.sh
@@ -35,7 +35,7 @@ else
date "+%s" > /tmp/neuron-docker-build-timestamp
fi

docker build -t "${image_name}" -f Dockerfile.neuron .
docker build -t "${image_name}" -f docker/Dockerfile.neuron .

# Setup cleanup
remove_docker_container() {
2 changes: 1 addition & 1 deletion .buildkite/run-tpu-v1-test.sh
@@ -3,7 +3,7 @@
set -e

# Build the docker image.
- docker build -f Dockerfile.tpu -t vllm-tpu .
+ docker build -f docker/Dockerfile.tpu -t vllm-tpu .

# Set up cleanup.
remove_docker_container() { docker rm -f tpu-test || true; }
2 changes: 1 addition & 1 deletion .buildkite/run-xpu-test.sh
@@ -8,7 +8,7 @@ image_name="xpu/vllm-ci:${BUILDKITE_COMMIT}"
container_name="xpu_${BUILDKITE_COMMIT}_$(tr -dc A-Za-z0-9 < /dev/urandom | head -c 10; echo)"

# Try building the docker image
- docker build -t ${image_name} -f Dockerfile.xpu .
+ docker build -t ${image_name} -f docker/Dockerfile.xpu .

# Setup cleanup
remove_docker_container() {
2 changes: 1 addition & 1 deletion .github/mergify.yml
@@ -19,7 +19,7 @@ pull_request_rules:
- files~=\.buildkite/
- files~=^cmake/
- files=CMakeLists.txt
- - files~=^Dockerfile
+ - files~=^docker/Dockerfile
- files~=^requirements.*\.txt
- files=setup.py
actions:
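Since `files~=` takes a regular expression, the updated pattern can be sanity-checked against the renamed files; a sketch using `grep -E` as a stand-in for mergify's matching:

```bash
# Paths such as docker/Dockerfile, docker/Dockerfile.cpu and
# docker/Dockerfile.rocm should match; the old top-level paths no longer exist.
git ls-files | grep -E '^docker/Dockerfile'
```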
2 changes: 1 addition & 1 deletion .github/workflows/lint-and-deploy.yaml
@@ -50,7 +50,7 @@ jobs:
uses: helm/kind-action@a1b0e391336a6ee6713a0583f8c6240d70863de3 # v1.12.0

- name: Build the Docker image vllm cpu
- run: docker buildx build -f Dockerfile.cpu -t vllm-cpu-env .
+ run: docker buildx build -f docker/Dockerfile.cpu -t vllm-cpu-env .

- name: Configuration of docker images, network and namespace for the kind cluster
run: |
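The workflow's next step then has to make the locally built image visible inside the kind cluster; a hedged sketch of the usual follow-up (the cluster name is an assumption, not taken from this diff):

```bash
# kind clusters pull from their own image store, so side-load the image.
kind load docker-image vllm-cpu-env --name chart-testing
```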
2 changes: 1 addition & 1 deletion CMakeLists.txt
@@ -44,7 +44,7 @@ set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx942;gfx950;gfx1030;gfx1100;gfx1
#
# Note: the CUDA torch version is derived from pyproject.toml and various
# requirements.txt files and should be kept consistent. The ROCm torch
- # versions are derived from Dockerfile.rocm
+ # versions are derived from docker/Dockerfile.rocm
#
set(TORCH_SUPPORTED_VERSION_CUDA "2.6.0")
set(TORCH_SUPPORTED_VERSION_ROCM "2.6.0")
11 files renamed without changes (the `Dockerfile*` variants moved into `docker/`).
6 changes: 3 additions & 3 deletions docs/source/contributing/dockerfile/dockerfile.md
@@ -1,6 +1,6 @@
# Dockerfile

- We provide a <gh-file:Dockerfile> to construct the image for running an OpenAI compatible server with vLLM.
+ We provide a <gh-file:docker/Dockerfile> to construct the image for running an OpenAI compatible server with vLLM.
More information about deploying with Docker can be found [here](#deployment-docker).

Below is a visual representation of the multi-stage Dockerfile. The build graph contains the following nodes:
@@ -28,7 +28,7 @@ The edges of the build graph represent:
> Commands to regenerate the build graph (make sure to run it **from the \`root\` directory of the vLLM repository** where the dockerfile is present):
>
> ```bash
- > dockerfilegraph -o png --legend --dpi 200 --max-label-length 50 --filename Dockerfile
+ > dockerfilegraph -o png --legend --dpi 200 --max-label-length 50 --filename docker/Dockerfile
> ```
>
> or in case you want to run it directly with the docker image:
@@ -43,7 +43,7 @@ The edges of the build graph represent:
> --output png \
> --dpi 200 \
> --max-label-length 50 \
- > --filename Dockerfile \
+ > --filename docker/Dockerfile \
> --legend
> ```
>
2 changes: 1 addition & 1 deletion docs/source/contributing/overview.md
@@ -45,7 +45,7 @@ pytest tests/
```

:::{tip}
- Since the <gh-file:Dockerfile> ships with Python 3.12, all tests in CI (except `mypy`) are run with Python 3.12.
+ Since the <gh-file:docker/Dockerfile> ships with Python 3.12, all tests in CI (except `mypy`) are run with Python 3.12.

Therefore, we recommend developing with Python 3.12 to minimise the chance of your local environment clashing with our CI environment.
:::
5 changes: 3 additions & 2 deletions docs/source/deployment/docker.md
@@ -61,11 +61,11 @@ RUN uv pip install --system git+https://github.com/huggingface/transformers.git

## Building vLLM's Docker Image from Source

- You can build and run vLLM from source via the provided <gh-file:Dockerfile>. To build vLLM:
+ You can build and run vLLM from source via the provided <gh-file:docker/Dockerfile>. To build vLLM:

```console
# optionally specifies: --build-arg max_jobs=8 --build-arg nvcc_threads=2
- DOCKER_BUILDKIT=1 docker build . --target vllm-openai --tag vllm/vllm-openai
+ DOCKER_BUILDKIT=1 docker build . --target vllm-openai --tag vllm/vllm-openai --file docker/Dockerfile
```

:::{note}
@@ -92,6 +92,7 @@ Keep an eye on memory usage with parallel jobs as it can be substantial (see exa
# Example of building on Nvidia GH200 server. (Memory usage: ~15GB, Build time: ~1475s / ~25 min, Image size: 6.93GB)
$ python3 use_existing_torch.py
$ DOCKER_BUILDKIT=1 docker build . \
+ --file docker/Dockerfile \
--target vllm-openai \
--platform "linux/arm64" \
-t vllm/vllm-gh200-openai:latest \
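Once built, the image behaves exactly as before the Dockerfile move; a minimal smoke test (the model name and port are illustrative):

```bash
docker run --rm --gpus all -p 8000:8000 \
  vllm/vllm-openai --model facebook/opt-125m
```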
4 changes: 2 additions & 2 deletions docs/source/deployment/nginx.md
@@ -69,14 +69,14 @@ server {

```console
cd $vllm_root
- docker build -f Dockerfile . --tag vllm
+ docker build -f docker/Dockerfile . --tag vllm
```

If you are behind a proxy, you can pass the proxy settings to the docker build command as shown below:

```console
cd $vllm_root
- docker build -f Dockerfile . --tag vllm --build-arg http_proxy=$http_proxy --build-arg https_proxy=$https_proxy
+ docker build -f docker/Dockerfile . --tag vllm --build-arg http_proxy=$http_proxy --build-arg https_proxy=$https_proxy
```

(nginxloadbalancer-nginx-docker-network)=
@@ -86,7 +86,7 @@ Currently, there are no pre-built Intel Gaudi images.
### Build image from source

```console
- docker build -f Dockerfile.hpu -t vllm-hpu-env .
+ docker build -f docker/Dockerfile.hpu -t vllm-hpu-env .
docker run -it --runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --net=host --rm vllm-hpu-env
```

@@ -132,7 +132,7 @@ Currently, there are no pre-built Neuron images.

See <project:#deployment-docker-build-image-from-source> for instructions on building the Docker image.

- Make sure to use <gh-file:Dockerfile.neuron> in place of the default Dockerfile.
+ Make sure to use <gh-file:docker/Dockerfile.neuron> in place of the default Dockerfile.

## Extra information

@@ -169,10 +169,10 @@ See <project:#deployment-docker-pre-built-image> for instructions on using the o

### Build image from source

- You can use <gh-file:Dockerfile.tpu> to build a Docker image with TPU support.
+ You can use <gh-file:docker/Dockerfile.tpu> to build a Docker image with TPU support.

```console
- docker build -f Dockerfile.tpu -t vllm-tpu .
+ docker build -f docker/Dockerfile.tpu -t vllm-tpu .
```

Run the Docker image with the following command:
6 changes: 3 additions & 3 deletions docs/source/getting_started/installation/cpu.md
@@ -177,7 +177,7 @@ Currently, there are no pre-built CPU wheels.
### Build image from source

```console
- $ docker build -f Dockerfile.cpu --tag vllm-cpu-env --target vllm-openai .
+ $ docker build -f docker/Dockerfile.cpu --tag vllm-cpu-env --target vllm-openai .

# Launching OpenAI server
$ docker run --rm \
Expand All @@ -193,11 +193,11 @@ $ docker run --rm \
```

::::{tip}
- For ARM or Apple silicon, use `Dockerfile.arm`
+ For ARM or Apple silicon, use `docker/Dockerfile.arm`
::::

::::{tip}
- For IBM Z (s390x), use `Dockerfile.s390x` and in `docker run` use flag `--dtype float`
+ For IBM Z (s390x), use `docker/Dockerfile.s390x` and in `docker run` use flag `--dtype float`
::::

## Supported features
14 changes: 7 additions & 7 deletions docs/source/getting_started/installation/gpu/rocm.inc.md
@@ -123,7 +123,7 @@ Building the Docker image from source is the recommended way to use vLLM with RO

#### (Optional) Build an image with ROCm software stack

- Build a docker image from <gh-file:Dockerfile.rocm_base> which setup ROCm software stack needed by the vLLM.
+ Build a docker image from <gh-file:docker/Dockerfile.rocm_base>, which sets up the ROCm software stack needed by vLLM.
**This step is optional, as this rocm_base image is usually prebuilt and stored on [Docker Hub](https://hub.docker.com/r/rocm/vllm-dev) under the tag `rocm/vllm-dev:base` to speed up the user experience.**
If you choose to build this rocm_base image yourself, the steps are as follows.

@@ -140,12 +140,12 @@ It is important that the user kicks off the docker build using buildkit. Either
To build vLLM on ROCm 6.3 for MI200 and MI300 series, you can use the default:

```console
- DOCKER_BUILDKIT=1 docker build -f Dockerfile.rocm_base -t rocm/vllm-dev:base .
+ DOCKER_BUILDKIT=1 docker build -f docker/Dockerfile.rocm_base -t rocm/vllm-dev:base .
```

#### Build an image with vLLM

- First, build a docker image from <gh-file:Dockerfile.rocm> and launch a docker container from the image.
+ First, build a docker image from <gh-file:docker/Dockerfile.rocm> and launch a docker container from the image.
It is important to kick off the docker build using BuildKit: either set `DOCKER_BUILDKIT=1` as an environment variable when calling the docker build command, or set up BuildKit in the docker daemon configuration `/etc/docker/daemon.json` as follows and restart the daemon:

```console
@@ -156,10 +156,10 @@ It is important that the user kicks off the docker build using buildkit. Either
}
```

- <gh-file:Dockerfile.rocm> uses ROCm 6.3 by default, but also supports ROCm 5.7, 6.0, 6.1, and 6.2, in older vLLM branches.
+ <gh-file:docker/Dockerfile.rocm> uses ROCm 6.3 by default, but also supports ROCm 5.7, 6.0, 6.1, and 6.2, in older vLLM branches.
It provides flexibility to customize the docker image build using the following arguments:

- - `BASE_IMAGE`: specifies the base image used when running `docker build`. The default value `rocm/vllm-dev:base` is an image published and maintained by AMD. It is being built using <gh-file:Dockerfile.rocm_base>
+ - `BASE_IMAGE`: specifies the base image used when running `docker build`. The default value `rocm/vllm-dev:base` is an image published and maintained by AMD. It is built using <gh-file:docker/Dockerfile.rocm_base>
- `USE_CYTHON`: An option to run Cython compilation on a subset of Python files during the docker build
- `BUILD_RPD`: Include the RocmProfileData profiling tool in the image
- `ARG_PYTORCH_ROCM_ARCH`: Allows overriding the gfx architecture values from the base docker image
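A hedged sketch combining these arguments in one invocation (the values are illustrative, not defaults):

```bash
DOCKER_BUILDKIT=1 docker build \
  --build-arg ARG_PYTORCH_ROCM_ARCH="gfx90a;gfx942" \
  --build-arg USE_CYTHON=1 \
  -f docker/Dockerfile.rocm -t vllm-rocm .
```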
@@ -169,13 +169,13 @@ Their values can be passed in when running `docker build` with `--build-arg` opt
To build vLLM on ROCm 6.3 for MI200 and MI300 series, you can use the default:

```console
- DOCKER_BUILDKIT=1 docker build -f Dockerfile.rocm -t vllm-rocm .
+ DOCKER_BUILDKIT=1 docker build -f docker/Dockerfile.rocm -t vllm-rocm .
```

To build vLLM on ROCm 6.3 for Radeon RX7900 series (gfx1100), you should pick the alternative base image:

```console
- DOCKER_BUILDKIT=1 docker build --build-arg BASE_IMAGE="rocm/vllm-dev:navi_base" -f Dockerfile.rocm -t vllm-rocm .
+ DOCKER_BUILDKIT=1 docker build --build-arg BASE_IMAGE="rocm/vllm-dev:navi_base" -f docker/Dockerfile.rocm -t vllm-rocm .
```

To run the above docker image `vllm-rocm`, use the command below:
2 changes: 1 addition & 1 deletion docs/source/getting_started/installation/gpu/xpu.inc.md
@@ -54,7 +54,7 @@ Currently, there are no pre-built XPU images.
### Build image from source

```console
- $ docker build -f Dockerfile.xpu -t vllm-xpu-env --shm-size=4g .
+ $ docker build -f docker/Dockerfile.xpu -t vllm-xpu-env --shm-size=4g .
$ docker run -it \
--rm \
--network=host \
2 changes: 1 addition & 1 deletion docs/source/getting_started/quickstart.md
@@ -208,5 +208,5 @@ Currently, vLLM supports multiple backends for efficient Attention computation a
If desired, you can also manually set the backend of your choice by configuring the environment variable `VLLM_ATTENTION_BACKEND` to one of the following options: `FLASH_ATTN`, `FLASHINFER` or `XFORMERS`.

```{attention}
- There are no pre-built vllm wheels containing Flash Infer, so you must install it in your environment first. Refer to the [Flash Infer official docs](https://docs.flashinfer.ai/) or see [Dockerfile](https://github.com/vllm-project/vllm/blob/main/Dockerfile) for instructions on how to install it.
+ There are no pre-built vllm wheels containing Flash Infer, so you must install it in your environment first. Refer to the [Flash Infer official docs](https://docs.flashinfer.ai/) or see <gh-file:docker/Dockerfile> for instructions on how to install it.
```
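A small sketch of the check this implies before opting into the backend (the env var is read by vLLM at startup, and the install pointer mirrors the attention block above):

```bash
# Only select FLASHINFER when the flashinfer module is importable.
if python3 -c "import flashinfer" 2>/dev/null; then
  export VLLM_ATTENTION_BACKEND=FLASHINFER
else
  echo "flashinfer not installed; see docker/Dockerfile for install steps"
fi
```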
4 changes: 2 additions & 2 deletions vllm/config.py
@@ -317,8 +317,8 @@ def __init__(
) and backend == "FLASHINFER" and find_spec("flashinfer") is None:
raise ValueError(
"VLLM_ATTENTION_BACKEND is set to FLASHINFER, but flashinfer "
"module was not found."
"See https://github.com/vllm-project/vllm/blob/main/Dockerfile "
"module was not found. See "
"https://github.com/vllm-project/vllm/blob/main/docker/Dockerfile " # noqa: E501
"for instructions on how to install it.")

# The tokenizer version is consistent with the model version by default.