
Commit c2fb252

Merge branch 'vllm-project:main' into enable_bitsandbytes_quant_rocm
2 parents: d621cf6 + 1974880

61 files changed: +684 −1147 lines


.buildkite/test-pipeline.yaml

Lines changed: 7 additions & 2 deletions

@@ -172,6 +172,8 @@ steps:
   - tests/v1/engine/test_engine_core_client.py
   - tests/distributed/test_symm_mem_allreduce.py
   commands:
+  # https://github.com/NVIDIA/nccl/issues/1838
+  - export NCCL_CUMEM_HOST_ENABLE=0
   # test with torchrun tp=2 and external_dp=2
   - torchrun --nproc-per-node=4 distributed/test_torchrun_example.py
   # test with torchrun tp=2 and pp=2

@@ -349,7 +351,8 @@ steps:
   - python3 offline_inference/basic/embed.py
   - python3 offline_inference/basic/score.py
   - python3 offline_inference/spec_decode.py --test --method eagle --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 2048
-  - python3 offline_inference/spec_decode.py --test --method eagle3 --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 2048
+  # https://github.com/vllm-project/vllm/pull/26682 uses slightly more memory in PyTorch 2.9+ causing this test to OOM in 1xL4 GPU
+  - python3 offline_inference/spec_decode.py --test --method eagle3 --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 1536

 - label: Platform Tests (CUDA) # 4min
   timeout_in_minutes: 15

@@ -534,7 +537,7 @@ steps:
   # https://github.com/pytorch/ao/issues/2919, we'll have to skip new torchao tests for now
   # we can only upgrade after this is resolved
   # TODO(jerryzh168): resolve the above comment
-  - uv pip install --system torchao==0.13.0
+  - uv pip install --system torchao==0.13.0 --index-url https://download.pytorch.org/whl/cu129
   - VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization/ --ignore quantization/test_blackwell_moe.py

 - label: LM Eval Small Models # 53min

@@ -975,6 +978,8 @@ steps:
   - tests/v1/shutdown
   - tests/v1/worker/test_worker_memory_snapshot.py
   commands:
+  # https://github.com/NVIDIA/nccl/issues/1838
+  - export NCCL_CUMEM_HOST_ENABLE=0
   - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py
   - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py
   - DP_SIZE=2 pytest -v -s v1/entrypoints/openai/test_multi_api_servers.py
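Both pipeline hunks above apply the same NCCL workaround before launching multi-process tests. As a rough local reproduction of what the CI step does (the export and the torchrun command are taken from the diff; the assumption that you have a 4-GPU machine and a vLLM checkout with its tests directory is mine):

# Work around https://github.com/NVIDIA/nccl/issues/1838 by disabling NCCL's
# cuMem host allocations before launching multi-process tests.
export NCCL_CUMEM_HOST_ENABLE=0
# One of the distributed suites the pipeline step then runs:
torchrun --nproc-per-node=4 distributed/test_torchrun_example.py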

.github/CODEOWNERS

Lines changed: 5 additions & 4 deletions

@@ -5,8 +5,8 @@
 /vllm/attention @LucasWilkinson
 /vllm/attention/backends/abstract.py @WoosukKwon @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
 /vllm/executor/executor_base.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill @22quinn
-/vllm/model_executor/layers/fused_moe @mgoin
-/vllm/model_executor/layers/quantization @mgoin @robertgshaw2-redhat @tlrmchlsmth @yewentao256
+/vllm/model_executor/layers/fused_moe @mgoin @pavanimajety
+/vllm/model_executor/layers/quantization @mgoin @robertgshaw2-redhat @tlrmchlsmth @yewentao256 @pavanimajety
 /vllm/model_executor/layers/mamba @tdoublep
 /vllm/model_executor/model_loader @22quinn
 /vllm/multimodal @DarkLight1337 @ywang96 @NickLucche

@@ -25,7 +25,8 @@ CMakeLists.txt @tlrmchlsmth @LucasWilkinson

 # vLLM V1
 /vllm/v1/attention @LucasWilkinson
-/vllm/v1/attention/backends/flashinfer.py @mgoin
+/vllm/v1/attention/backends/mla @pavanimajety
+/vllm/v1/attention/backends/flashinfer.py @mgoin @pavanimajety
 /vllm/v1/attention/backends/triton_attn.py @tdoublep
 /vllm/v1/core @WoosukKwon @robertgshaw2-redhat @njhill @ywang96 @comaniac @alexm-redhat @heheda12345 @ApostaC
 /vllm/v1/sample @22quinn @houseroad @njhill

@@ -44,7 +45,7 @@ CMakeLists.txt @tlrmchlsmth @LucasWilkinson
 /tests/kernels @mgoin @tlrmchlsmth @WoosukKwon @yewentao256
 /tests/models @DarkLight1337 @ywang96
 /tests/multimodal @DarkLight1337 @ywang96 @NickLucche
-/tests/quantization @mgoin @robertgshaw2-redhat @yewentao256
+/tests/quantization @mgoin @robertgshaw2-redhat @yewentao256 @pavanimajety
 /tests/test_inputs.py @DarkLight1337 @ywang96
 /tests/v1/entrypoints/llm/test_struct_output_generate.py @mgoin @russellb @aarnphm
 /tests/v1/structured_output @mgoin @russellb @aarnphm

.pre-commit-config.yaml

Lines changed: 1 addition & 1 deletion

@@ -38,7 +38,7 @@ repos:
     rev: 0.9.1
     hooks:
       - id: pip-compile
-        args: [requirements/test.in, -o, requirements/test.txt, --index-strategy, unsafe-best-match, --torch-backend, cu128, --python-platform, x86_64-manylinux_2_28]
+        args: [requirements/test.in, -o, requirements/test.txt, --index-strategy, unsafe-best-match, --torch-backend, cu129, --python-platform, x86_64-manylinux_2_28]
         files: ^requirements/test\.(in|txt)$
   - repo: local
     hooks:
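For reference, the pip-compile hook above passes those args through to `uv pip compile`, so regenerating the lockfile by hand after switching to the cu129 backend should look roughly like the following (a sketch assuming uv is installed and you run it from the repo root; pre-commit normally does this for you):

uv pip compile requirements/test.in -o requirements/test.txt \
    --index-strategy unsafe-best-match \
    --torch-backend cu129 \
    --python-platform x86_64-manylinux_2_28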

CMakeLists.txt

Lines changed: 2 additions & 2 deletions

@@ -49,8 +49,8 @@ set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx942;gfx950;gfx1030;gfx1100;gfx1
 # requirements.txt files and should be kept consistent. The ROCm torch
 # versions are derived from docker/Dockerfile.rocm
 #
-set(TORCH_SUPPORTED_VERSION_CUDA "2.8.0")
-set(TORCH_SUPPORTED_VERSION_ROCM "2.8.0")
+set(TORCH_SUPPORTED_VERSION_CUDA "2.9.0")
+set(TORCH_SUPPORTED_VERSION_ROCM "2.9.0")

 #
 # Try to find python package with an executable that exactly matches

cmake/external_projects/flashmla.cmake

Lines changed: 2 additions & 1 deletion

@@ -19,7 +19,7 @@ else()
   FetchContent_Declare(
     flashmla
     GIT_REPOSITORY https://github.com/vllm-project/FlashMLA
-    GIT_TAG 5f65b85703c7ed75fda01e06495077caad207c3f
+    GIT_TAG 28417e516fcbf6257a422ba117ef5b6f44da5682
     GIT_PROGRESS TRUE
     CONFIGURE_COMMAND ""
     BUILD_COMMAND ""

@@ -66,6 +66,7 @@ if(FLASH_MLA_ARCHS)
     ${flashmla_SOURCE_DIR}/csrc/extension/torch_api.cpp
     ${flashmla_SOURCE_DIR}/csrc/extension/sm90/dense_fp8/pybind.cpp
     ${flashmla_SOURCE_DIR}/csrc/extension/sm90/dense_fp8/flash_fwd_mla_fp8_sm90.cu
+    ${flashmla_SOURCE_DIR}/csrc/extension/sm90/dense_fp8/flash_fwd_mla_metadata.cu
   )

   set(FlashMLA_INCLUDES

docker/Dockerfile

Lines changed: 18 additions & 4 deletions

@@ -5,7 +5,7 @@
 # docs/contributing/dockerfile/dockerfile.md and
 # docs/assets/contributing/dockerfile-stages-dependency.png

-ARG CUDA_VERSION=12.8.1
+ARG CUDA_VERSION=12.9.1
 ARG PYTHON_VERSION=3.12

 # By parameterizing the base images, we allow third-party to use their own

@@ -132,7 +132,9 @@ WORKDIR /workspace
 COPY requirements/common.txt requirements/common.txt
 COPY requirements/cuda.txt requirements/cuda.txt
 RUN --mount=type=cache,target=/root/.cache/uv \
-    uv pip install --python /opt/venv/bin/python3 -r requirements/cuda.txt \
+    # TODO: remove apache-tvm-ffi once FlashInfer is fixed https://github.com/flashinfer-ai/flashinfer/issues/1962
+    uv pip install --python /opt/venv/bin/python3 --pre apache-tvm-ffi==0.1.0b15 \
+    && uv pip install --python /opt/venv/bin/python3 -r requirements/cuda.txt \
        --extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.')

 # cuda arch list used by torch

@@ -273,6 +275,7 @@ WORKDIR /vllm-workspace
 ENV DEBIAN_FRONTEND=noninteractive
 ARG TARGETPLATFORM

+# TODO (huydhn): There is no prebuilt gdrcopy package on 12.9 at the moment
 ARG GDRCOPY_CUDA_VERSION=12.8
 # Keep in line with FINAL_BASE_IMAGE
 ARG GDRCOPY_OS_VERSION=Ubuntu22_04

@@ -353,9 +356,18 @@ RUN --mount=type=cache,target=/root/.cache/uv \
 # Install vllm wheel first, so that torch etc will be installed.
 RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/dist \
     --mount=type=cache,target=/root/.cache/uv \
-    uv pip install --system dist/*.whl --verbose \
+    # TODO: remove apache-tvm-ffi once FlashInfer is fixed https://github.com/flashinfer-ai/flashinfer/issues/1962
+    uv pip install --system --pre apache-tvm-ffi==0.1.0b15 \
+    && uv pip install --system dist/*.whl --verbose \
        --extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.')

+# TODO (huydhn): Remove this once xformers is released for 2.9.0
+RUN --mount=type=cache,target=/root/.cache/uv bash - <<'BASH'
+    . /etc/environment
+    export TORCH_CUDA_ARCH_LIST='7.5 8.0+PTX 9.0a'
+    uv pip install --system --no-build-isolation "git+https://github.com/facebookresearch/[email protected]"
+BASH
+
 # Install FlashInfer pre-compiled kernel cache and binaries
 # https://docs.flashinfer.ai/installation.html
 RUN --mount=type=cache,target=/root/.cache/uv \

@@ -422,6 +434,7 @@ ARG PYTHON_VERSION

 ARG PIP_INDEX_URL UV_INDEX_URL
 ARG PIP_EXTRA_INDEX_URL UV_EXTRA_INDEX_URL
+ARG PYTORCH_CUDA_INDEX_BASE_URL

 # This timeout (in seconds) is necessary when installing some dependencies via uv since it's likely to time out
 # Reference: https://github.com/astral-sh/uv/pull/1694

@@ -434,7 +447,8 @@ ENV UV_LINK_MODE=copy
 RUN --mount=type=cache,target=/root/.cache/uv \
     CUDA_MAJOR="${CUDA_VERSION%%.*}"; \
     if [ "$CUDA_MAJOR" -ge 12 ]; then \
-        uv pip install --system -r requirements/dev.txt; \
+        uv pip install --system -r requirements/dev.txt \
+            --extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.'); \
     fi

 # install development dependencies (for testing)
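One detail worth calling out from the hunks above: the `cu$( ... )` shell expression converts `CUDA_VERSION` into the suffix of the PyTorch extra index. A minimal illustration of that expression on its own (illustrative only, not part of the Dockerfile):

CUDA_VERSION=12.9.1
echo "cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.')"
# prints "cu129", so the extra index resolves to ${PYTORCH_CUDA_INDEX_BASE_URL}/cu129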

docker/Dockerfile.cpu

Lines changed: 4 additions & 0 deletions

@@ -199,9 +199,13 @@ FROM base AS vllm-test-deps

 WORKDIR /workspace/vllm

+# TODO: Update to 2.9.0 when there is a new build for intel_extension_for_pytorch for that version
 RUN --mount=type=bind,src=requirements/test.in,target=requirements/test.in \
     cp requirements/test.in requirements/cpu-test.in && \
     sed -i '/mamba_ssm/d' requirements/cpu-test.in && \
+    sed -i 's/^torch==.*/torch==2.8.0/g' requirements/cpu-test.in && \
+    sed -i 's/torchaudio.*/torchaudio/g' requirements/cpu-test.in && \
+    sed -i 's/torchvision.*/torchvision/g' requirements/cpu-test.in && \
     uv pip compile requirements/cpu-test.in -o requirements/cpu-test.txt --index-strategy unsafe-best-match --torch-backend cpu

 RUN --mount=type=cache,target=/root/.cache/uv \
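To see what the added sed rewrites do before `uv pip compile` runs, here is a self-contained sketch on a throwaway file (the version numbers in the sample are hypothetical; the real ones come from requirements/test.in):

# hypothetical sample of the torch-related lines in requirements/test.in
printf 'torch==2.9.0\ntorchaudio==2.9.0\ntorchvision==0.24.0\n' > /tmp/cpu-test.in
sed -i 's/^torch==.*/torch==2.8.0/g' /tmp/cpu-test.in     # pin torch for intel_extension_for_pytorch
sed -i 's/torchaudio.*/torchaudio/g' /tmp/cpu-test.in     # drop the torchaudio pin
sed -i 's/torchvision.*/torchvision/g' /tmp/cpu-test.in   # drop the torchvision pin
cat /tmp/cpu-test.in
# torch==2.8.0
# torchaudio
# torchvision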

docker/Dockerfile.rocm_base

Lines changed: 3 additions & 3 deletions

@@ -1,13 +1,13 @@
 ARG BASE_IMAGE=rocm/dev-ubuntu-22.04:7.0-complete
-ARG TRITON_BRANCH="f9e5bf54"
+ARG TRITON_BRANCH="57c693b6"
 ARG TRITON_REPO="https://github.com/ROCm/triton.git"
-ARG PYTORCH_BRANCH="b2fb6885"
+ARG PYTORCH_BRANCH="1c57644d"
 ARG PYTORCH_VISION_BRANCH="v0.23.0"
 ARG PYTORCH_REPO="https://github.com/ROCm/pytorch.git"
 ARG PYTORCH_VISION_REPO="https://github.com/pytorch/vision.git"
 ARG FA_BRANCH="0e60e394"
 ARG FA_REPO="https://github.com/Dao-AILab/flash-attention.git"
-ARG AITER_BRANCH="2ab9f4cd"
+ARG AITER_BRANCH="eef23c7f"
 ARG AITER_REPO="https://github.com/ROCm/aiter.git"

 FROM ${BASE_IMAGE} AS base

docs/contributing/ci/update_pytorch_version.md

Lines changed: 4 additions & 22 deletions

@@ -87,7 +87,7 @@ is ineffective.

 While ongoing efforts like <https://github.com/vllm-project/vllm/issues/17419>
 address the long build time at its source, the current workaround is to set `VLLM_CI_BRANCH`
-to a custom branch provided by @khluu (`VLLM_CI_BRANCH=khluu/use_postmerge_q`)
+to a custom branch provided by @khluu (`VLLM_CI_BRANCH=khluu/long_build`)
 when manually triggering a build on Buildkite. This branch accomplishes two things:

 1. Increase the timeout limit to 10 hours so that the build doesn't time out.

@@ -100,35 +100,17 @@ to warm it up so that future builds are faster.

 ## Update dependencies

-Several vLLM dependencies, such as FlashInfer, also depend on PyTorch and need
+Several vLLM dependencies like xFormers depend on PyTorch and need
 to be updated accordingly. Rather than waiting for all of them to publish new
 releases (which would take too much time), they can be built from
 source to unblock the update process.

-### FlashInfer
-
-Here is how to build and install it from source with `torch2.7.0+cu128` in vLLM [Dockerfile](https://github.com/vllm-project/vllm/blob/27bebcd89792d5c4b08af7a65095759526f2f9e1/docker/Dockerfile#L259-L271):
-
-```bash
-export TORCH_CUDA_ARCH_LIST='7.5 8.0 8.9 9.0 10.0+PTX'
-export FLASHINFER_ENABLE_SM90=1
-uv pip install --system \
-    --no-build-isolation "git+https://github.com/flashinfer-ai/[email protected]"
-```
-
-One caveat is that building FlashInfer from source adds approximately 30
-minutes to the vLLM build time. Therefore, it's preferable to cache the wheel in a
-public location for immediate installation, such as [this FlashInfer wheel link](https://download.pytorch.org/whl/cu128/flashinfer/flashinfer_python-0.2.6.post1%2Bcu128torch2.7-cp39-abi3-linux_x86_64.whl). For future releases, contact the PyTorch release
-team if you want to get the package published there.
-
 ### xFormers

-Similar to FlashInfer, here is how to build and install xFormers from source:
-
 ```bash
-export TORCH_CUDA_ARCH_LIST='7.0 7.5 8.0 8.9 9.0 10.0+PTX'
+export TORCH_CUDA_ARCH_LIST='7.5 8.0+PTX 9.0a'
 MAX_JOBS=16 uv pip install --system \
-    --no-build-isolation "git+https://github.com/facebookresearch/[email protected]"
+    --no-build-isolation "git+https://github.com/facebookresearch/[email protected]"
 ```

 ## Update all the different vLLM platforms
