diff --git a/.buildkite/check-wheel-size.py b/.buildkite/check-wheel-size.py index 0412c5f37952..e29eb78a9f94 100644 --- a/.buildkite/check-wheel-size.py +++ b/.buildkite/check-wheel-size.py @@ -2,8 +2,11 @@ import sys import zipfile -# Read the VLLM_MAX_SIZE_MB environment variable, defaulting to 250 MB -VLLM_MAX_SIZE_MB = int(os.environ.get('VLLM_MAX_SIZE_MB', 250)) +# Read the VLLM_MAX_SIZE_MB environment variable, defaulting to 300 MiB +# Note that we have 400 MiB quota, please use it wisely. +# See https://github.com/pypi/support/issues/3792 . +# Please also sync the value with the one in Dockerfile. +VLLM_MAX_SIZE_MB = int(os.environ.get('VLLM_MAX_SIZE_MB', 300)) def print_top_10_largest_files(zip_file): diff --git a/.buildkite/run-neuron-test.sh b/.buildkite/run-neuron-test.sh index 189714ebb6d7..0590dad4f311 100644 --- a/.buildkite/run-neuron-test.sh +++ b/.buildkite/run-neuron-test.sh @@ -25,8 +25,11 @@ if [ -f /tmp/neuron-docker-build-timestamp ]; then last_build=$(cat /tmp/neuron-docker-build-timestamp) current_time=$(date +%s) if [ $((current_time - last_build)) -gt 86400 ]; then + # Remove dangling images (those that are not tagged and not used by any container) docker image prune -f - docker system prune -f + # Remove unused volumes / force the system prune for old images as well. + docker volume prune -f && docker system prune -f + # Remove huggingface model artifacts and compiler cache rm -rf "${HF_MOUNT:?}/*" rm -rf "${NEURON_COMPILE_CACHE_MOUNT:?}/*" echo "$current_time" > /tmp/neuron-docker-build-timestamp diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index d2b140e71850..d5d02fdeb7f4 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -76,7 +76,9 @@ steps: - tests/basic_correctness/test_basic_correctness - tests/basic_correctness/test_cpu_offload - tests/basic_correctness/test_preemption + - tests/basic_correctness/test_cumem.py commands: + - pytest -v -s basic_correctness/test_cumem.py - pytest -v -s basic_correctness/test_basic_correctness.py - pytest -v -s basic_correctness/test_cpu_offload.py - VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1 pytest -v -s basic_correctness/test_preemption.py @@ -181,7 +183,16 @@ steps: - vllm/ - tests/v1 commands: - - VLLM_USE_V1=1 pytest -v -s v1 + # split the test to avoid interference + - VLLM_USE_V1=1 pytest -v -s v1/core + - VLLM_USE_V1=1 pytest -v -s v1/engine + - VLLM_USE_V1=1 pytest -v -s v1/sample + - VLLM_USE_V1=1 pytest -v -s v1/worker + - VLLM_USE_V1=1 pytest -v -s v1/test_stats.py + - VLLM_USE_V1=1 pytest -v -s v1/test_utils.py + # TODO: accuracy does not match, whether setting + # VLLM_USE_FLASHINFER_SAMPLER or not on H100. + - VLLM_USE_V1=1 pytest -v -s v1/e2e - label: Examples Test # 25min working_dir: "/vllm-workspace/examples" @@ -477,7 +488,9 @@ steps: - pytest models/encoder_decoder/language/test_bart.py -v -s -m 'distributed(num_gpus=2)' - pytest models/encoder_decoder/vision_language/test_broadcast.py -v -s -m 'distributed(num_gpus=2)' - pytest models/decoder_only/vision_language/test_models.py -v -s -m 'distributed(num_gpus=2)' - - pytest -v -s spec_decode/e2e/test_integration_dist_tp2.py + # this test fails consistently. 
+ # TODO: investigate and fix + # - pytest -v -s spec_decode/e2e/test_integration_dist_tp2.py - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s test_sharded_state_loader.py - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s kv_transfer/disagg_test.py @@ -515,7 +528,9 @@ steps: - vllm/engine - tests/multi_step commands: - - pytest -v -s multi_step/test_correctness_async_llm.py + # this test is quite flaky + # TODO: investigate and fix. + # - pytest -v -s multi_step/test_correctness_async_llm.py - pytest -v -s multi_step/test_correctness_llm.py - label: Pipeline Parallelism Test # 45min diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS index 3cb91fc0f823..bc324d8b988b 100644 --- a/.github/CODEOWNERS +++ b/.github/CODEOWNERS @@ -2,32 +2,35 @@ # for more info about CODEOWNERS file # This lists cover the "core" components of vLLM that require careful review -/vllm/attention/backends/abstract.py @WoosukKwon @zhuohan123 @youkaichao @alexm-neuralmagic @comaniac @njhill -/vllm/core @zhuohan123 @youkaichao @alexm-neuralmagic @comaniac @njhill -/vllm/engine/llm_engine.py @zhuohan123 @youkaichao @alexm-neuralmagic @comaniac @njhill -/vllm/executor/executor_base.py @zhuohan123 @youkaichao @alexm-neuralmagic @comaniac @njhill -/vllm/worker/worker_base.py @zhuohan123 @youkaichao @alexm-neuralmagic @comaniac @njhill -/vllm/worker/worker.py @zhuohan123 @youkaichao @alexm-neuralmagic @comaniac @njhill -/vllm/model_executor/layers/sampler.py @zhuohan123 @youkaichao @alexm-neuralmagic @comaniac @njhill +/vllm/attention/backends/abstract.py @WoosukKwon @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill +/vllm/core @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill +/vllm/engine/llm_engine.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill +/vllm/executor/executor_base.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill +/vllm/worker/worker_base.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill +/vllm/worker/worker.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill +/vllm/model_executor/layers/sampler.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill +/vllm/model_executor/layers/quantization @mgoin @robertgshaw2-redhat @tlrmchlsmth +/vllm/model_executor/guided_decoding @mgoin +/vllm/multimodal @DarkLight1337 @ywang96 CMakeLists.txt @tlrmchlsmth # vLLM V1 -/vllm/v1 @WoosukKwon @robertgshaw2-neuralmagic @njhill @ywang96 @comaniac @alexm-neuralmagic +/vllm/v1 @WoosukKwon @robertgshaw2-redhat @njhill @ywang96 @comaniac @alexm-redhat # Test ownership -/tests/async_engine @njhill @robertgshaw2-neuralmagic @simon-mo +/tests/async_engine @njhill @robertgshaw2-redhat @simon-mo /tests/test_inputs.py @DarkLight1337 @ywang96 -/tests/entrypoints @DarkLight1337 @robertgshaw2-neuralmagic @simon-mo +/tests/entrypoints @DarkLight1337 @robertgshaw2-redhat @simon-mo /tests/models @DarkLight1337 @ywang96 /tests/multimodal @DarkLight1337 @ywang96 /tests/prefix_caching @comaniac @KuntaiDu /tests/spec_decode @njhill @LiuXiaoxuanPKU /tests/kernels @tlrmchlsmth @WoosukKwon -/tests/quantization @mgoin @robertgshaw2-neuralmagic +/tests/quantization @mgoin @robertgshaw2-redhat /.buildkite/lm-eval-harness @mgoin @simon-mo /tests/distributed/test_multi_node_assignment.py @youkaichao /tests/distributed/test_pipeline_parallel.py @youkaichao /tests/distributed/test_same_node.py @youkaichao -/tests/multi_step @alexm-neuralmagic @comaniac +/tests/multi_step @alexm-redhat @comaniac /tests/weight_loading @mgoin @youkaichao /tests/basic_correctness/test_chunked_prefill @rkooo567 @comaniac diff --git a/CMakeLists.txt 
b/CMakeLists.txt old mode 100644 new mode 100755 index 84194a2ff511..9014e26648dd --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -24,9 +24,6 @@ include(${CMAKE_CURRENT_LIST_DIR}/cmake/utils.cmake) # Suppress potential warnings about unused manually-specified variables set(ignoreMe "${VLLM_PYTHON_PATH}") -# Prevent installation of dependencies (cutlass) by default. -install(CODE "set(CMAKE_INSTALL_LOCAL_ONLY TRUE)" ALL_COMPONENTS) - # # Supported python versions. These versions will be searched in order, the # first match will be selected. These should be kept in sync with setup.py. @@ -215,6 +212,31 @@ endif() # Define extension targets # +# +# cumem_allocator extension +# + +set(VLLM_CUMEM_EXT_SRC + "csrc/cumem_allocator.cpp") + +set_gencode_flags_for_srcs( + SRCS "${VLLM_CUMEM_EXT_SRC}" + CUDA_ARCHS "${CUDA_ARCHS}") + +if(VLLM_GPU_LANG STREQUAL "CUDA") + message(STATUS "Enabling cumem allocator extension.") + # link against cuda driver library + list(APPEND CUMEM_LIBS cuda) + define_gpu_extension_target( + cumem_allocator + DESTINATION vllm + LANGUAGE CXX + SOURCES ${VLLM_CUMEM_EXT_SRC} + LIBRARIES ${CUMEM_LIBS} + USE_SABI 3.8 + WITH_SOABI) +endif() + # # _C extension # @@ -287,7 +309,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") # Only build Marlin kernels if we are building for at least some compatible archs. # Keep building Marlin for 9.0 as there are some group sizes and shapes that # are not supported by Machete yet. - cuda_archs_loose_intersection(MARLIN_ARCHS "8.0;8.6;8.7;8.9;9.0" ${CUDA_ARCHS}) + cuda_archs_loose_intersection(MARLIN_ARCHS "8.0;8.6;8.7;8.9;9.0" "${CUDA_ARCHS}") if (MARLIN_ARCHS) set(MARLIN_SRCS "csrc/quantization/fp8/fp8_marlin.cu" @@ -308,8 +330,8 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") endif() # The cutlass_scaled_mm kernels for Hopper (c3x, i.e. CUTLASS 3.x) require - # CUDA 12.0 or later (and only work on Hopper, 9.0/9.0a for now). - cuda_archs_loose_intersection(SCALED_MM_3X_ARCHS "9.0;9.0a" "${CUDA_ARCHS}") + # CUDA 12.0 or later (and only work on Hopper, 9.0a for now). + cuda_archs_loose_intersection(SCALED_MM_3X_ARCHS "9.0a" "${CUDA_ARCHS}") if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0 AND SCALED_MM_3X_ARCHS) set(SRCS "csrc/quantization/cutlass_w8a8/scaled_mm_c3x.cu") set_gencode_flags_for_srcs( @@ -363,7 +385,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") # 2:4 Sparse Kernels # The 2:4 sparse kernels cutlass_scaled_sparse_mm and cutlass_compressor - # require CUDA 12.2 or later (and only work on Hopper, 9.0/9.0a for now). + # require CUDA 12.2 or later (and only work on Hopper, 9.0a for now). if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.2 AND SCALED_MM_3X_ARCHS) set(SRCS "csrc/sparse/cutlass/sparse_compressor_c3x.cu" "csrc/sparse/cutlass/sparse_scaled_mm_c3x.cu") @@ -463,6 +485,9 @@ if(VLLM_GPU_LANG STREQUAL "HIP") endif() message(STATUS "Enabling C extension.") +if(VLLM_GPU_LANG STREQUAL "CUDA") + list(APPEND VLLM_C_LIBS cuda) +endif() define_gpu_extension_target( _C DESTINATION vllm @@ -471,6 +496,7 @@ define_gpu_extension_target( COMPILE_FLAGS ${VLLM_GPU_FLAGS} ARCHITECTURES ${VLLM_GPU_ARCHES} INCLUDE_DIRECTORIES ${CUTLASS_INCLUDE_DIR};${CUTLASS_TOOLS_UTIL_INCLUDE_DIR} + LIBRARIES ${VLLM_C_LIBS} USE_SABI 3 WITH_SOABI) @@ -570,7 +596,7 @@ if(VLLM_GPU_LANG STREQUAL "HIP") endif() # vllm-flash-attn currently only supported on CUDA -if (NOT VLLM_TARGET_DEVICE STREQUAL "cuda") +if (NOT VLLM_GPU_LANG STREQUAL "CUDA") return() endif () @@ -593,7 +619,7 @@ endif() # They should be identical but if they aren't, this is a massive footgun. 
# # The vllm-flash-attn install rules are nested under vllm to make sure the library gets installed in the correct place. -# To only install vllm-flash-attn, use --component vllm_flash_attn_c. +# To only install vllm-flash-attn, use --component _vllm_fa2_C (for FA2) or --component _vllm_fa3_C (for FA3). # If no component is specified, vllm-flash-attn is still installed. # If VLLM_FLASH_ATTN_SRC_DIR is set, vllm-flash-attn is installed from that directory instead of downloading. @@ -605,43 +631,41 @@ if (DEFINED ENV{VLLM_FLASH_ATTN_SRC_DIR}) endif() if(VLLM_FLASH_ATTN_SRC_DIR) - FetchContent_Declare(vllm-flash-attn SOURCE_DIR ${VLLM_FLASH_ATTN_SRC_DIR}) + FetchContent_Declare( + vllm-flash-attn SOURCE_DIR + ${VLLM_FLASH_ATTN_SRC_DIR} + BINARY_DIR ${CMAKE_BINARY_DIR}/vllm-flash-attn + ) else() FetchContent_Declare( vllm-flash-attn GIT_REPOSITORY https://github.com/vllm-project/flash-attention.git - GIT_TAG 96266b1111111f3d11aabefaf3bacbab6a89d03c + GIT_TAG d4e09037abf588af1ec47d0e966b237ee376876c GIT_PROGRESS TRUE # Don't share the vllm-flash-attn build between build types BINARY_DIR ${CMAKE_BINARY_DIR}/vllm-flash-attn ) endif() -# Set the parent build flag so that the vllm-flash-attn library does not redo compile flag and arch initialization. -set(VLLM_PARENT_BUILD ON) - -# Ensure the vllm/vllm_flash_attn directory exists before installation -install(CODE "file(MAKE_DIRECTORY \"\${CMAKE_INSTALL_PREFIX}/vllm/vllm_flash_attn\")" COMPONENT vllm_flash_attn_c) - -# Make sure vllm-flash-attn install rules are nested under vllm/ -install(CODE "set(CMAKE_INSTALL_LOCAL_ONLY FALSE)" COMPONENT vllm_flash_attn_c) -install(CODE "set(OLD_CMAKE_INSTALL_PREFIX \"\${CMAKE_INSTALL_PREFIX}\")" COMPONENT vllm_flash_attn_c) -install(CODE "set(CMAKE_INSTALL_PREFIX \"\${CMAKE_INSTALL_PREFIX}/vllm/\")" COMPONENT vllm_flash_attn_c) # Fetch the vllm-flash-attn library FetchContent_MakeAvailable(vllm-flash-attn) message(STATUS "vllm-flash-attn is available at ${vllm-flash-attn_SOURCE_DIR}") -# Restore the install prefix -install(CODE "set(CMAKE_INSTALL_PREFIX \"\${OLD_CMAKE_INSTALL_PREFIX}\")" COMPONENT vllm_flash_attn_c) -install(CODE "set(CMAKE_INSTALL_LOCAL_ONLY TRUE)" COMPONENT vllm_flash_attn_c) +# Copy over the vllm-flash-attn python files (duplicated for fa2 and fa3, in +# case only one is built, in the case both are built redundant work is done) +install( + DIRECTORY ${vllm-flash-attn_SOURCE_DIR}/vllm_flash_attn/ + DESTINATION vllm_flash_attn + COMPONENT _vllm_fa2_C + FILES_MATCHING PATTERN "*.py" +) -# Copy over the vllm-flash-attn python files install( - DIRECTORY ${vllm-flash-attn_SOURCE_DIR}/vllm_flash_attn/ - DESTINATION vllm/vllm_flash_attn - COMPONENT vllm_flash_attn_c - FILES_MATCHING PATTERN "*.py" + DIRECTORY ${vllm-flash-attn_SOURCE_DIR}/vllm_flash_attn/ + DESTINATION vllm_flash_attn + COMPONENT _vllm_fa3_C + FILES_MATCHING PATTERN "*.py" ) # Nothing after vllm-flash-attn, see comment about macros above diff --git a/Dockerfile b/Dockerfile index 4542bc9cf0bd..0b9f74e08dc6 100644 --- a/Dockerfile +++ b/Dockerfile @@ -52,7 +52,7 @@ WORKDIR /workspace # after this step RUN --mount=type=cache,target=/root/.cache/pip \ if [ "$TARGETPLATFORM" = "linux/arm64" ]; then \ - python3 -m pip install --index-url https://download.pytorch.org/whl/nightly/cu124 "torch==2.6.0.dev20241210+cu124" "torchvision==0.22.0.dev20241215"; \ + python3 -m pip install --index-url https://download.pytorch.org/whl/nightly/cu126 "torch==2.7.0.dev20250121+cu126" "torchvision==0.22.0.dev20250121"; \ fi COPY 
requirements-common.txt requirements-common.txt @@ -126,8 +126,8 @@ RUN --mount=type=cache,target=/root/.cache/ccache \ # Check the size of the wheel if RUN_WHEEL_CHECK is true COPY .buildkite/check-wheel-size.py check-wheel-size.py -# Default max size of the wheel is 250MB -ARG VLLM_MAX_SIZE_MB=250 +# sync the default value with .buildkite/check-wheel-size.py +ARG VLLM_MAX_SIZE_MB=300 ENV VLLM_MAX_SIZE_MB=$VLLM_MAX_SIZE_MB ARG RUN_WHEEL_CHECK=true RUN if [ "$RUN_WHEEL_CHECK" = "true" ]; then \ @@ -149,7 +149,8 @@ RUN --mount=type=cache,target=/root/.cache/pip \ #################### vLLM installation IMAGE #################### # image with vLLM installed -FROM nvidia/cuda:${CUDA_VERSION}-base-ubuntu22.04 AS vllm-base +# TODO: Restore to base image after FlashInfer AOT wheel fixed +FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04 AS vllm-base ARG CUDA_VERSION=12.4.1 ARG PYTHON_VERSION=3.12 WORKDIR /vllm-workspace @@ -194,12 +195,30 @@ RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/dist --mount=type=cache,target=/root/.cache/pip \ python3 -m pip install dist/*.whl --verbose +# How to build this FlashInfer wheel: +# $ export FLASHINFER_ENABLE_AOT=1 +# $ # Note we remove 7.0 from the arch list compared to the list below, since FlashInfer only supports sm75+ +# $ export TORCH_CUDA_ARCH_LIST='7.5 8.0 8.6 8.9 9.0+PTX' +# $ git clone https://github.com/flashinfer-ai/flashinfer.git --recursive +# $ cd flashinfer +# $ git checkout 524304395bd1d8cd7d07db083859523fcaa246a4 +# $ python3 setup.py bdist_wheel --dist-dir=dist --verbose + RUN --mount=type=cache,target=/root/.cache/pip \ . /etc/environment && \ if [ "$TARGETPLATFORM" != "linux/arm64" ]; then \ - python3 -m pip install https://github.com/flashinfer-ai/flashinfer/releases/download/v0.1.6/flashinfer-0.1.6+cu121torch2.4-cp${PYTHON_VERSION_STR}-cp${PYTHON_VERSION_STR}-linux_x86_64.whl; \ + python3 -m pip install https://wheels.vllm.ai/flashinfer/524304395bd1d8cd7d07db083859523fcaa246a4/flashinfer_python-0.2.0.post1-cp${PYTHON_VERSION_STR}-cp${PYTHON_VERSION_STR}-linux_x86_64.whl; \ fi COPY examples examples + +# Although we build Flashinfer with AOT mode, there's still +# some issues w.r.t. JIT compilation. Therefore we need to +# install build dependencies for JIT compilation. 
+# TODO: Remove this once FlashInfer AOT wheel is fixed +COPY requirements-build.txt requirements-build.txt +RUN --mount=type=cache,target=/root/.cache/pip \ + python3 -m pip install -r requirements-build.txt + #################### vLLM installation IMAGE #################### #################### TEST IMAGE #################### diff --git a/Dockerfile.rocm b/Dockerfile.rocm index 13b0a764cbd7..14c522afd7f9 100644 --- a/Dockerfile.rocm +++ b/Dockerfile.rocm @@ -14,7 +14,7 @@ ENV PYTORCH_ROCM_ARCH=${ARG_PYTORCH_ROCM_ARCH:-${PYTORCH_ROCM_ARCH}} RUN apt-get update -q -y && apt-get install -q -y \ sqlite3 libsqlite3-dev libfmt-dev libmsgpack-dev libsuitesparse-dev # Remove sccache -RUN python3 -m pip install --upgrade pip +RUN python3 -m pip install --upgrade pip && pip install setuptools_scm RUN apt-get purge -y sccache; python3 -m pip uninstall -y sccache; rm -f "$(which sccache)" ARG COMMON_WORKDIR WORKDIR ${COMMON_WORKDIR} @@ -25,7 +25,7 @@ WORKDIR ${COMMON_WORKDIR} FROM base AS fetch_vllm_0 ONBUILD COPY ./ vllm/ FROM base AS fetch_vllm_1 -ARG VLLM_REPO="https://github.com/ROCm/vllm.git" +ARG VLLM_REPO="https://github.com/vllm-project/vllm.git" ARG VLLM_BRANCH="main" ONBUILD RUN git clone ${VLLM_REPO} \ && cd vllm \ @@ -45,13 +45,35 @@ RUN cd vllm \ FROM scratch AS export_vllm ARG COMMON_WORKDIR COPY --from=build_vllm ${COMMON_WORKDIR}/vllm/dist/*.whl / -COPY --from=build_vllm ${COMMON_WORKDIR}/vllm/rocm_patch /rocm_patch COPY --from=build_vllm ${COMMON_WORKDIR}/vllm/requirements*.txt / COPY --from=build_vllm ${COMMON_WORKDIR}/vllm/benchmarks /benchmarks COPY --from=build_vllm ${COMMON_WORKDIR}/vllm/tests /tests COPY --from=build_vllm ${COMMON_WORKDIR}/vllm/examples /examples COPY --from=build_vllm ${COMMON_WORKDIR}/vllm/.buildkite /.buildkite +# ----------------------- +# Test vLLM image +FROM base AS test + +RUN python3 -m pip install --upgrade pip && rm -rf /var/lib/apt/lists/* + +# Install vLLM +RUN --mount=type=bind,from=export_vllm,src=/,target=/install \ + cd /install \ + && pip install -U -r requirements-rocm.txt \ + && pip uninstall -y vllm \ + && pip install *.whl + +WORKDIR /vllm-workspace +ARG COMMON_WORKDIR +COPY --from=build_vllm ${COMMON_WORKDIR}/vllm /vllm-workspace + +# install development dependencies (for testing) +RUN cd /vllm-workspace \ + && rm -rf vllm \ + && python3 -m pip install -e tests/vllm_test_utils \ + && python3 -m pip install lm-eval[api]==0.4.4 \ + && python3 -m pip install pytest-shard # ----------------------- # Final vLLM image @@ -75,17 +97,9 @@ RUN if [ ${BUILD_RPD} -eq "1" ]; then \ && cd hipMarker && python3 setup.py install ; fi # Install vLLM -# Make sure punica kernels are built (for LoRA) -ENV VLLM_INSTALL_PUNICA_KERNELS=1 RUN --mount=type=bind,from=export_vllm,src=/,target=/install \ cd /install \ && pip install -U -r requirements-rocm.txt \ - && case "$(ls /opt | grep -Po 'rocm-[0-9]\.[0-9]')" in \ - *"rocm-6.0"*) \ - patch /opt/rocm/include/hip/amd_detail/amd_hip_bf16.h rocm_patch/rocm_bf16.patch;; \ - *"rocm-6.1"*) \ - cp rocm_patch/libamdhip64.so.6 /opt/rocm/lib/libamdhip64.so.6;; \ - *) ;; esac \ && pip uninstall -y vllm \ && pip install *.whl @@ -93,10 +107,7 @@ ARG COMMON_WORKDIR # Copy over the benchmark scripts as well COPY --from=export_vllm /benchmarks ${COMMON_WORKDIR}/vllm/benchmarks -COPY --from=export_vllm /tests ${COMMON_WORKDIR}/vllm/tests COPY --from=export_vllm /examples ${COMMON_WORKDIR}/vllm/examples -COPY --from=export_vllm /.buildkite ${COMMON_WORKDIR}/vllm/.buildkite - ENV 
RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES=1 ENV TOKENIZERS_PARALLELISM=false @@ -104,9 +115,5 @@ ENV TOKENIZERS_PARALLELISM=false # Performance environment variable. ENV HIP_FORCE_DEV_KERNARG=1 -# install development dependencies (for testing) -RUN cd ${COMMON_WORKDIR}/vllm \ - && python3 -m pip install -e tests/vllm_test_utils - CMD ["/bin/bash"] diff --git a/Dockerfile.rocm_base b/Dockerfile.rocm_base new file mode 100644 index 000000000000..5bbe98b0c220 --- /dev/null +++ b/Dockerfile.rocm_base @@ -0,0 +1,158 @@ +ARG BASE_IMAGE=rocm/dev-ubuntu-22.04:6.3.1-complete +ARG HIPBLASLT_BRANCH="4d40e36" +ARG HIPBLAS_COMMON_BRANCH="7c1566b" +ARG LEGACY_HIPBLASLT_OPTION= +ARG RCCL_BRANCH="648a58d" +ARG RCCL_REPO="https://github.com/ROCm/rccl" +ARG TRITON_BRANCH="e5be006" +ARG TRITON_REPO="https://github.com/triton-lang/triton.git" +ARG PYTORCH_BRANCH="8d4926e" +ARG PYTORCH_VISION_BRANCH="v0.19.1" +ARG PYTORCH_REPO="https://github.com/pytorch/pytorch.git" +ARG PYTORCH_VISION_REPO="https://github.com/pytorch/vision.git" +ARG FA_BRANCH="b7d29fb" +ARG FA_REPO="https://github.com/ROCm/flash-attention.git" + +FROM ${BASE_IMAGE} AS base + +ENV PATH=/opt/rocm/llvm/bin:$PATH +ENV ROCM_PATH=/opt/rocm +ENV LD_LIBRARY_PATH=/opt/rocm/lib:/usr/local/lib: +ARG PYTORCH_ROCM_ARCH=gfx90a;gfx942 +ENV PYTORCH_ROCM_ARCH=${PYTORCH_ROCM_ARCH} + +ARG PYTHON_VERSION=3.12 + +RUN mkdir -p /app +WORKDIR /app +ENV DEBIAN_FRONTEND=noninteractive + +# Install Python and other dependencies +RUN apt-get update -y \ + && apt-get install -y software-properties-common git curl sudo vim less \ + && add-apt-repository ppa:deadsnakes/ppa \ + && apt-get update -y \ + && apt-get install -y python${PYTHON_VERSION} python${PYTHON_VERSION}-dev python${PYTHON_VERSION}-venv \ + python${PYTHON_VERSION}-lib2to3 python-is-python3 \ + && update-alternatives --install /usr/bin/python3 python3 /usr/bin/python${PYTHON_VERSION} 1 \ + && update-alternatives --set python3 /usr/bin/python${PYTHON_VERSION} \ + && ln -sf /usr/bin/python${PYTHON_VERSION}-config /usr/bin/python3-config \ + && curl -sS https://bootstrap.pypa.io/get-pip.py | python${PYTHON_VERSION} \ + && python3 --version && python3 -m pip --version + +RUN pip install -U packaging cmake ninja wheel setuptools pybind11 Cython + +FROM base AS build_hipblaslt +ARG HIPBLASLT_BRANCH +ARG HIPBLAS_COMMON_BRANCH +# Set to "--legacy_hipblas_direct" for ROCm<=6.2 +ARG LEGACY_HIPBLASLT_OPTION +RUN git clone https://github.com/ROCm/hipBLAS-common.git +RUN cd hipBLAS-common \ + && git checkout ${HIPBLAS_COMMON_BRANCH} \ + && mkdir build \ + && cd build \ + && cmake .. 
\ + && make package \ + && dpkg -i ./*.deb +RUN git clone https://github.com/ROCm/hipBLASLt +RUN cd hipBLASLt \ + && git checkout ${HIPBLASLT_BRANCH} \ + && ./install.sh -d --architecture ${PYTORCH_ROCM_ARCH} ${LEGACY_HIPBLASLT_OPTION} \ + && cd build/release \ + && make package +RUN mkdir -p /app/install && cp /app/hipBLASLt/build/release/*.deb /app/hipBLAS-common/build/*.deb /app/install + +FROM base AS build_rccl +ARG RCCL_BRANCH +ARG RCCL_REPO +RUN git clone ${RCCL_REPO} +RUN cd rccl \ + && git checkout ${RCCL_BRANCH} \ + && ./install.sh -p --amdgpu_targets ${PYTORCH_ROCM_ARCH} +RUN mkdir -p /app/install && cp /app/rccl/build/release/*.deb /app/install + +FROM base AS build_triton +ARG TRITON_BRANCH +ARG TRITON_REPO +RUN git clone ${TRITON_REPO} +RUN cd triton \ + && git checkout ${TRITON_BRANCH} \ + && cd python \ + && python3 setup.py bdist_wheel --dist-dir=dist +RUN mkdir -p /app/install && cp /app/triton/python/dist/*.whl /app/install + +FROM base AS build_amdsmi +RUN cd /opt/rocm/share/amd_smi \ + && pip wheel . --wheel-dir=dist +RUN mkdir -p /app/install && cp /opt/rocm/share/amd_smi/dist/*.whl /app/install + +FROM base AS build_pytorch +ARG PYTORCH_BRANCH +ARG PYTORCH_VISION_BRANCH +ARG PYTORCH_REPO +ARG PYTORCH_VISION_REPO +ARG FA_BRANCH +ARG FA_REPO +RUN git clone ${PYTORCH_REPO} pytorch +RUN cd pytorch && git checkout ${PYTORCH_BRANCH} && \ + pip install -r requirements.txt && git submodule update --init --recursive \ + && python3 tools/amd_build/build_amd.py \ + && CMAKE_PREFIX_PATH=$(python3 -c 'import sys; print(sys.prefix)') python3 setup.py bdist_wheel --dist-dir=dist \ + && pip install dist/*.whl +RUN git clone ${PYTORCH_VISION_REPO} vision +RUN cd vision && git checkout ${PYTORCH_VISION_BRANCH} \ + && python3 setup.py bdist_wheel --dist-dir=dist \ + && pip install dist/*.whl +RUN git clone ${FA_REPO} +RUN cd flash-attention \ + && git checkout ${FA_BRANCH} \ + && git submodule update --init \ + && MAX_JOBS=64 GPU_ARCHS=${PYTORCH_ROCM_ARCH} python3 setup.py bdist_wheel --dist-dir=dist +RUN mkdir -p /app/install && cp /app/pytorch/dist/*.whl /app/install \ + && cp /app/vision/dist/*.whl /app/install \ + && cp /app/flash-attention/dist/*.whl /app/install + +FROM base AS final +RUN --mount=type=bind,from=build_hipblaslt,src=/app/install/,target=/install \ + dpkg -i /install/*deb \ + && sed -i 's/, hipblaslt-dev \(.*\), hipcub-dev/, hipcub-dev/g' /var/lib/dpkg/status \ + && sed -i 's/, hipblaslt \(.*\), hipfft/, hipfft/g' /var/lib/dpkg/status +RUN --mount=type=bind,from=build_rccl,src=/app/install/,target=/install \ + dpkg -i /install/*deb \ + && sed -i 's/, rccl-dev \(.*\), rocalution/, rocalution/g' /var/lib/dpkg/status \ + && sed -i 's/, rccl \(.*\), rocalution/, rocalution/g' /var/lib/dpkg/status +RUN --mount=type=bind,from=build_triton,src=/app/install/,target=/install \ + pip install /install/*.whl +RUN --mount=type=bind,from=build_amdsmi,src=/app/install/,target=/install \ + pip install /install/*.whl +RUN --mount=type=bind,from=build_pytorch,src=/app/install/,target=/install \ + pip install /install/*.whl + +ARG BASE_IMAGE +ARG HIPBLASLT_BRANCH +ARG LEGACY_HIPBLASLT_OPTION +ARG RCCL_BRANCH +ARG RCCL_REPO +ARG TRITON_BRANCH +ARG TRITON_REPO +ARG PYTORCH_BRANCH +ARG PYTORCH_VISION_BRANCH +ARG PYTORCH_REPO +ARG PYTORCH_VISION_REPO +ARG FA_BRANCH +ARG FA_REPO +RUN echo "BASE_IMAGE: ${BASE_IMAGE}" > /app/versions.txt \ + && echo "HIPBLAS_COMMON_BRANCH: ${HIPBLAS_COMMON_BRANCH}" >> /app/versions.txt \ + && echo "HIPBLASLT_BRANCH: ${HIPBLASLT_BRANCH}" >> 
/app/versions.txt \ + && echo "LEGACY_HIPBLASLT_OPTION: ${LEGACY_HIPBLASLT_OPTION}" >> /app/versions.txt \ + && echo "RCCL_BRANCH: ${RCCL_BRANCH}" >> /app/versions.txt \ + && echo "RCCL_REPO: ${RCCL_REPO}" >> /app/versions.txt \ + && echo "TRITON_BRANCH: ${TRITON_BRANCH}" >> /app/versions.txt \ + && echo "TRITON_REPO: ${TRITON_REPO}" >> /app/versions.txt \ + && echo "PYTORCH_BRANCH: ${PYTORCH_BRANCH}" >> /app/versions.txt \ + && echo "PYTORCH_VISION_BRANCH: ${PYTORCH_VISION_BRANCH}" >> /app/versions.txt \ + && echo "PYTORCH_REPO: ${PYTORCH_REPO}" >> /app/versions.txt \ + && echo "PYTORCH_VISION_REPO: ${PYTORCH_VISION_REPO}" >> /app/versions.txt \ + && echo "FA_BRANCH: ${FA_BRANCH}" >> /app/versions.txt \ + && echo "FA_REPO: ${FA_REPO}" >> /app/versions.txt diff --git a/Dockerfile.tpu b/Dockerfile.tpu index b617932a85b4..e268b3947666 100644 --- a/Dockerfile.tpu +++ b/Dockerfile.tpu @@ -1,4 +1,4 @@ -ARG NIGHTLY_DATE="20241017" +ARG NIGHTLY_DATE="20250124" ARG BASE_IMAGE="us-central1-docker.pkg.dev/tpu-pytorch-releases/docker/xla:nightly_3.10_tpuvm_$NIGHTLY_DATE" FROM $BASE_IMAGE diff --git a/README.md b/README.md index 658b9fb6edd8..4ed905bf7aa9 100644 --- a/README.md +++ b/README.md @@ -15,11 +15,8 @@ Easy, fast, and cheap LLM serving for everyone --- -The first vLLM meetup in 2025 is happening on January 22nd, Wednesday, with Google Cloud in San Francisco! We will talk about vLLM's performant V1 architecture, Q1 roadmap, Google Cloud's innovation around vLLM: networking, Cloud Run, Vertex, and TPU! [Register Now](https://lu.ma/zep56hui) - ---- - *Latest News* 🔥 +- [2025/01] We hosted [the eighth vLLM meetup](https://lu.ma/zep56hui) with Google Cloud! Please find the meetup slides from vLLM team [here](https://docs.google.com/presentation/d/1epVkt4Zu8Jz_S5OhEHPc798emsYh2BwYfRuDDVEF7u4/edit?usp=sharing). - [2024/12] vLLM joins [pytorch ecosystem](https://pytorch.org/blog/vllm-joins-pytorch)! Easy, Fast, and Cheap LLM Serving for Everyone! - [2024/11] We hosted [the seventh vLLM meetup](https://lu.ma/h0qvrajz) with Snowflake! Please find the meetup slides from vLLM team [here](https://docs.google.com/presentation/d/1e3CxQBV3JsfGp30SwyvS3eM_tW-ghOhJ9PAJGK6KR54/edit?usp=sharing), and Snowflake team [here](https://docs.google.com/presentation/d/1qF3RkDAbOULwz9WK5TOltt2fE9t6uIc_hVNLFAaQX6A/edit?usp=sharing). - [2024/10] We have just created a developer slack ([slack.vllm.ai](https://slack.vllm.ai)) focusing on coordinating contributions and discussing features. Please feel free to join us there! 
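The backend_request_func.py hunks below switch the benchmark client to prefer server-reported token usage: the request payload now sets "stream_options": {"include_usage": True}, each streamed chunk is parsed defensively (choices may carry empty text, e.g. for special tokens), and the final usage block supplies completion_tokens so re-tokenizing the generated text is only a fallback. A minimal Python sketch of that chunk handling, using a hypothetical parse_stream_chunk helper and synthetic chunks (not code taken from the diff itself):

    import json

    def parse_stream_chunk(chunk: str, state: dict) -> None:
        # Terminal SSE marker: nothing to record.
        if chunk == "[DONE]":
            return
        data = json.loads(chunk)
        if choices := data.get("choices"):
            # Text may be empty for some chunks; accumulate defensively.
            state["generated_text"] += choices[0].get("text") or ""
        elif usage := data.get("usage"):
            # Only present when "stream_options": {"include_usage": True} is set.
            state["output_tokens"] = usage.get("completion_tokens")

    state = {"generated_text": "", "output_tokens": None}
    for chunk in (
        '{"choices": [{"text": "Hello"}]}',
        '{"choices": [{"text": ""}]}',
        '{"choices": [], "usage": {"completion_tokens": 2}}',
        "[DONE]",
    ):
        parse_stream_chunk(chunk, state)
    print(state)  # {'generated_text': 'Hello', 'output_tokens': 2}
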
diff --git a/benchmarks/backend_request_func.py b/benchmarks/backend_request_func.py index a9ab4fc9b621..0612e8778aca 100644 --- a/benchmarks/backend_request_func.py +++ b/benchmarks/backend_request_func.py @@ -35,6 +35,7 @@ class RequestFuncOutput: generated_text: str = "" success: bool = False latency: float = 0.0 + output_tokens: int = 0 ttft: float = 0.0 # Time to first token itl: List[float] = field( default_factory=list) # List of inter-token latencies @@ -50,7 +51,8 @@ async def async_request_tgi( api_url = request_func_input.api_url assert api_url.endswith("generate_stream") - async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session: + async with aiohttp.ClientSession(trust_env=True, + timeout=AIOHTTP_TIMEOUT) as session: params = { "best_of": request_func_input.best_of, "max_new_tokens": request_func_input.output_len, @@ -122,7 +124,8 @@ async def async_request_trt_llm( api_url = request_func_input.api_url assert api_url.endswith("generate_stream") - async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session: + async with aiohttp.ClientSession(trust_env=True, + timeout=AIOHTTP_TIMEOUT) as session: assert request_func_input.best_of == 1 payload = { "accumulate_tokens": True, @@ -156,7 +159,7 @@ async def async_request_trt_llm( timestamp = time.perf_counter() # First token if ttft == 0.0: - ttft = time.perf_counter() - st + ttft = timestamp - st output.ttft = ttft # Decoding phase @@ -186,7 +189,8 @@ async def async_request_deepspeed_mii( request_func_input: RequestFuncInput, pbar: Optional[tqdm] = None, ) -> RequestFuncOutput: - async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session: + async with aiohttp.ClientSession(trust_env=True, + timeout=AIOHTTP_TIMEOUT) as session: assert request_func_input.best_of == 1 payload = { @@ -234,7 +238,8 @@ async def async_request_openai_completions( ("completions", "profile") ), "OpenAI Completions API URL must end with 'completions' or 'profile'." - async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session: + async with aiohttp.ClientSession(trust_env=True, + timeout=AIOHTTP_TIMEOUT) as session: payload = { "model": request_func_input.model_name \ if request_func_input.model_name else request_func_input.model, @@ -244,8 +249,12 @@ async def async_request_openai_completions( "max_tokens": request_func_input.output_len, "logprobs": request_func_input.logprobs, "stream": True, - "ignore_eos": request_func_input.ignore_eos, + "stream_options": { + "include_usage": True, + }, } + if request_func_input.ignore_eos: + payload["ignore_eos"] = request_func_input.ignore_eos if request_func_input.extra_body: payload.update(request_func_input.extra_body) headers = { @@ -256,7 +265,6 @@ async def async_request_openai_completions( output.prompt_len = request_func_input.prompt_len generated_text = "" - ttft = 0.0 st = time.perf_counter() most_recent_timestamp = st try: @@ -271,15 +279,16 @@ async def async_request_openai_completions( chunk = chunk_bytes.decode("utf-8").removeprefix( "data: ") - if chunk == "[DONE]": - latency = time.perf_counter() - st - else: + if chunk != "[DONE]": data = json.loads(chunk) # NOTE: Some completion API might have a last # usage summary response without a token so we # want to check a token was generated - if data["choices"][0]["text"]: + if choices := data.get("choices"): + # Note that text could be empty here + # e.g. 
for special tokens + text = choices[0].get("text") timestamp = time.perf_counter() # First token if not first_chunk_received: @@ -293,7 +302,10 @@ async def async_request_openai_completions( most_recent_timestamp) most_recent_timestamp = timestamp - generated_text += data["choices"][0]["text"] + generated_text += text or "" + elif usage := data.get("usage"): + output.output_tokens = usage.get( + "completion_tokens") if first_chunk_received: output.success = True else: @@ -302,7 +314,7 @@ async def async_request_openai_completions( "Never received a valid chunk to calculate TTFT." "This response will be marked as failed!") output.generated_text = generated_text - output.latency = latency + output.latency = most_recent_timestamp - st else: output.error = response.reason or "" output.success = False @@ -325,7 +337,8 @@ async def async_request_openai_chat_completions( "chat/completions" ), "OpenAI Chat Completions API URL must end with 'chat/completions'." - async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session: + async with aiohttp.ClientSession(trust_env=True, + timeout=AIOHTTP_TIMEOUT) as session: content = [{"type": "text", "text": request_func_input.prompt}] if request_func_input.multi_modal_content: content.append(request_func_input.multi_modal_content) @@ -341,8 +354,12 @@ async def async_request_openai_chat_completions( "temperature": 0.0, "max_completion_tokens": request_func_input.output_len, "stream": True, - "ignore_eos": request_func_input.ignore_eos, + "stream_options": { + "include_usage": True, + }, } + if request_func_input.ignore_eos: + payload["ignore_eos"] = request_func_input.ignore_eos if request_func_input.extra_body: payload.update(request_func_input.extra_body) headers = { @@ -368,17 +385,15 @@ async def async_request_openai_chat_completions( chunk = chunk_bytes.decode("utf-8").removeprefix( "data: ") - if chunk == "[DONE]": - latency = time.perf_counter() - st - else: + if chunk != "[DONE]": timestamp = time.perf_counter() data = json.loads(chunk) - delta = data["choices"][0]["delta"] - if delta.get("content", None): + if choices := data.get("choices"): + content = choices[0]["delta"].get("content") # First token if ttft == 0.0: - ttft = time.perf_counter() - st + ttft = timestamp - st output.ttft = ttft # Decoding phase @@ -386,13 +401,16 @@ async def async_request_openai_chat_completions( output.itl.append(timestamp - most_recent_timestamp) - generated_text += delta["content"] + generated_text += content or "" + elif usage := data.get("usage"): + output.output_tokens = usage.get( + "completion_tokens") most_recent_timestamp = timestamp output.generated_text = generated_text output.success = True - output.latency = latency + output.latency = most_recent_timestamp - st else: output.error = response.reason or "" output.success = False diff --git a/benchmarks/benchmark_serving.py b/benchmarks/benchmark_serving.py index 53186e10c545..63d2c3f7c7dd 100644 --- a/benchmarks/benchmark_serving.py +++ b/benchmarks/benchmark_serving.py @@ -25,6 +25,7 @@ import argparse import asyncio import base64 +import gc import io import json import os @@ -199,7 +200,7 @@ def sample_sonnet_requests( return sampled_requests -def sample_mmmu_pro_vision_requests( +def sample_vision_arena_requests( dataset, num_requests: int, tokenizer: PreTrainedTokenizerBase, @@ -211,13 +212,7 @@ def sample_mmmu_pro_vision_requests( if len(sampled_requests) == num_requests: break - # MMMU-Pro vision direct prompt - # Ref: 
https://github.com/MMMU-Benchmark/MMMU/blob/6ce42f4d8f70c1841c67867152648974415b5cac/mmmu-pro/prompts.yaml#L5 - prompt = ( - "Answer with the option letter from the given choices directly. " - "The last line of your response should be of the following " - "format: 'Answer: $LETTER' (without quotes) where LETTER is one of " - "options.") + prompt = data["turns"][0][0]['content'] prompt_token_ids = tokenizer(prompt).input_ids if fixed_output_len is None: @@ -229,10 +224,10 @@ def sample_mmmu_pro_vision_requests( output_len = fixed_output_len assert isinstance( - data["image"], + data["images"][0], Image), ("Input image format must be `PIL.Image.Image`, " f"given {type(data['image'])}.") - image: Image = data["image"] + image: Image = data["images"][0] image = image.convert("RGB") image_data = io.BytesIO() image.save(image_data, format='JPEG') @@ -251,7 +246,7 @@ def sample_mmmu_pro_vision_requests( def sample_hf_requests( dataset_path: str, - dataset_subset: str, + dataset_subset: Optional[str], dataset_split: str, num_requests: int, tokenizer: PreTrainedTokenizerBase, @@ -259,19 +254,17 @@ def sample_hf_requests( fixed_output_len: Optional[int] = None, ) -> List[Tuple[str, str, int, Optional[Dict[str, Collection[str]]]]]: - # Special case for MMMU-Pro vision dataset - if dataset_path == 'MMMU/MMMU_Pro' and dataset_subset == 'vision': - assert dataset_split == "test" + # Special case for vision_arena dataset + if dataset_path == 'lmarena-ai/vision-arena-bench-v0.1' \ + and dataset_subset is None: + assert dataset_split == "train" dataset = load_dataset(dataset_path, name=dataset_subset, split=dataset_split, streaming=True) - assert "image" in dataset.features, ( - "MMMU/MMMU_Pro vision dataset must have 'image' column.") - filter_func = lambda x: isinstance(x["image"], Image) - dataset = dataset.shuffle(seed=random_seed).filter(filter_func) - return sample_mmmu_pro_vision_requests(dataset, num_requests, - tokenizer, fixed_output_len) + dataset = dataset.shuffle(seed=random_seed) + return sample_vision_arena_requests(dataset, num_requests, tokenizer, + fixed_output_len) dataset = load_dataset(dataset_path, name=dataset_subset, @@ -423,7 +416,7 @@ def calculate_metrics( tokenizer: PreTrainedTokenizerBase, selected_percentile_metrics: List[str], selected_percentiles: List[float], - gootput_config_dict: Dict[str, float], + goodput_config_dict: Dict[str, float], ) -> Tuple[BenchmarkMetrics, List[int]]: actual_output_lens: List[int] = [] total_input = 0 @@ -436,19 +429,23 @@ def calculate_metrics( e2els: List[float] = [] for i in range(len(outputs)): if outputs[i].success: - # We use the tokenizer to count the number of output tokens for all - # serving backends instead of looking at len(outputs[i].itl) since - # multiple output tokens may be bundled together - # Note : this may inflate the output token count slightly - output_len = len( - tokenizer(outputs[i].generated_text, - add_special_tokens=False).input_ids) + output_len = outputs[i].output_tokens + + if output_len is None: + # We use the tokenizer to count the number of output tokens + # for some serving backends instead of looking at + # len(outputs[i].itl) since multiple output tokens may be + # bundled together + # Note : this may inflate the output token count slightly + output_len = len( + tokenizer(outputs[i].generated_text, + add_special_tokens=False).input_ids) actual_output_lens.append(output_len) total_input += input_requests[i][1] tpot = 0 if output_len > 1: - tpot = (outputs[i].latency - outputs[i].ttft) / (output_len - 
- 1) + latency_minus_ttft = outputs[i].latency - outputs[i].ttft + tpot = latency_minus_ttft / (output_len - 1) tpots.append(tpot) # Note: if output_len <= 1, we regard tpot as 0 for goodput all_tpots.append(tpot) @@ -459,21 +456,21 @@ def calculate_metrics( else: actual_output_lens.append(0) - if gootput_config_dict: + if goodput_config_dict: valid_metrics = [] slo_values = [] - if "ttft" in gootput_config_dict: + if "ttft" in goodput_config_dict: valid_metrics.append(ttfts) - slo_values.append(gootput_config_dict["ttft"] / + slo_values.append(goodput_config_dict["ttft"] / MILLISECONDS_TO_SECONDS_CONVERSION) - if "tpot" in gootput_config_dict: + if "tpot" in goodput_config_dict: valid_metrics.append(all_tpots) - slo_values.append(gootput_config_dict["tpot"] / + slo_values.append(goodput_config_dict["tpot"] / MILLISECONDS_TO_SECONDS_CONVERSION) - if "e2el" in gootput_config_dict: + if "e2el" in goodput_config_dict: valid_metrics.append(e2els) - slo_values.append(gootput_config_dict["e2el"] / + slo_values.append(goodput_config_dict["e2el"] / MILLISECONDS_TO_SECONDS_CONVERSION) for req_metric in zip(*valid_metrics): @@ -537,7 +534,7 @@ async def benchmark( selected_percentile_metrics: List[str], selected_percentiles: List[str], ignore_eos: bool, - gootput_config_dict: Dict[str, float], + goodput_config_dict: Dict[str, float], max_concurrency: Optional[int], ): if backend in ASYNC_REQUEST_FUNCS: @@ -661,7 +658,7 @@ async def limited_request_func(request_func_input, pbar): tokenizer=tokenizer, selected_percentile_metrics=selected_percentile_metrics, selected_percentiles=selected_percentiles, - gootput_config_dict=gootput_config_dict, + goodput_config_dict=goodput_config_dict, ) print("{s:{c}^{n}}".format(s=' Serving Benchmark Result ', n=50, c='=')) @@ -673,7 +670,7 @@ async def limited_request_func(request_func_input, pbar): metrics.total_output)) print("{:<40} {:<10.2f}".format("Request throughput (req/s):", metrics.request_throughput)) - if gootput_config_dict: + if goodput_config_dict: print("{:<40} {:<10.2f}".format("Request goodput (req/s):", metrics.request_goodput)) print("{:<40} {:<10.2f}".format("Output token throughput (tok/s):", @@ -688,7 +685,7 @@ async def limited_request_func(request_func_input, pbar): "total_output_tokens": metrics.total_output, "request_throughput": metrics.request_throughput, "request_goodput:": - metrics.request_goodput if gootput_config_dict else None, + metrics.request_goodput if goodput_config_dict else None, "output_throughput": metrics.output_throughput, "total_token_throughput": metrics.total_token_throughput, "input_lens": [output.prompt_len for output in outputs], @@ -744,11 +741,11 @@ def process_one_metric( def check_goodput_args(args): # Check and parse goodput arguments - gootput_config_dict = {} + goodput_config_dict = {} VALID_NAMES = ["ttft", "tpot", "e2el"] if args.goodput: - gootput_config_dict = parse_goodput(args.goodput) - for slo_name, slo_val in gootput_config_dict.items(): + goodput_config_dict = parse_goodput(args.goodput) + for slo_name, slo_val in goodput_config_dict.items(): if slo_name not in VALID_NAMES: raise ValueError( f"Invalid metric name found, {slo_name}: {slo_val}. " @@ -759,22 +756,22 @@ def check_goodput_args(args): f"Invalid value found, {slo_name}: {slo_val}. 
" "The service level objective value should be " "non-negative.") - return gootput_config_dict + return goodput_config_dict def parse_goodput(slo_pairs): - gootput_config_dict = {} + goodput_config_dict = {} try: for slo_pair in slo_pairs: slo_name, slo_val = slo_pair.split(":") - gootput_config_dict[slo_name] = float(slo_val) + goodput_config_dict[slo_name] = float(slo_val) except ValueError as err: raise argparse.ArgumentTypeError( "Invalid format found for service level objectives. " "Specify service level objectives for goodput as \"KEY:VALUE\" " "pairs, where the key is a metric name, and the value is a " "number in milliseconds.") from err - return gootput_config_dict + return goodput_config_dict def main(args: argparse.Namespace): @@ -874,7 +871,11 @@ def main(args: argparse.Namespace): else: raise ValueError(f"Unknown dataset: {args.dataset_name}") - gootput_config_dict = check_goodput_args(args) + goodput_config_dict = check_goodput_args(args) + + # Avoid GC processing "static" data - reduce pause times. + gc.collect() + gc.freeze() benchmark_result = asyncio.run( benchmark( @@ -896,7 +897,7 @@ def main(args: argparse.Namespace): float(p) for p in args.metric_percentiles.split(",") ], ignore_eos=args.ignore_eos, - gootput_config_dict=gootput_config_dict, + goodput_config_dict=goodput_config_dict, max_concurrency=args.max_concurrency, )) diff --git a/benchmarks/kernels/benchmark_moe.py b/benchmarks/kernels/benchmark_moe.py index 1d59a0142241..1fa0da75c79d 100644 --- a/benchmarks/kernels/benchmark_moe.py +++ b/benchmarks/kernels/benchmark_moe.py @@ -12,10 +12,10 @@ from vllm.model_executor.layers.fused_moe.fused_moe import * from vllm.platforms import current_platform -from vllm.utils import FlexibleArgumentParser, is_navi +from vllm.utils import FlexibleArgumentParser FP8_DTYPE = torch.float8_e4m3fnuz if current_platform.is_rocm( -) and not is_navi() else torch.float8_e4m3fn +) else torch.float8_e4m3fn class BenchmarkConfig(TypedDict): diff --git a/benchmarks/kernels/benchmark_paged_attention.py b/benchmarks/kernels/benchmark_paged_attention.py index 483584dd804e..88ef08f7a03d 100644 --- a/benchmarks/kernels/benchmark_paged_attention.py +++ b/benchmarks/kernels/benchmark_paged_attention.py @@ -101,7 +101,9 @@ def run_cuda_benchmark(num_iters: int, profile: bool = False) -> float: start_time = time.perf_counter() # Using default kv_scale - k_scale = v_scale = 1.0 + k_scale = v_scale = torch.tensor(1.0, + dtype=torch.float32, + device=device) for _ in range(num_iters): if version == "v1": diff --git a/cmake/utils.cmake b/cmake/utils.cmake index 78c3516db236..825fac8cd368 100644 --- a/cmake/utils.cmake +++ b/cmake/utils.cmake @@ -259,7 +259,7 @@ endmacro() # in `SRC_CUDA_ARCHS` that is less or equal to the version in `TGT_CUDA_ARCHS`. # We have special handling for 9.0a, if 9.0a is in `SRC_CUDA_ARCHS` and 9.0 is # in `TGT_CUDA_ARCHS` then we should remove 9.0a from `SRC_CUDA_ARCHS` and add -# 9.0a to the result. +# 9.0a to the result (and remove 9.0 from TGT_CUDA_ARCHS). # The result is stored in `OUT_CUDA_ARCHS`. 
# # Example: @@ -270,34 +270,47 @@ endmacro() # function(cuda_archs_loose_intersection OUT_CUDA_ARCHS SRC_CUDA_ARCHS TGT_CUDA_ARCHS) list(REMOVE_DUPLICATES SRC_CUDA_ARCHS) + set(TGT_CUDA_ARCHS_ ${TGT_CUDA_ARCHS}) # if 9.0a is in SRC_CUDA_ARCHS and 9.0 is in CUDA_ARCHS then we should # remove 9.0a from SRC_CUDA_ARCHS and add 9.0a to _CUDA_ARCHS set(_CUDA_ARCHS) if ("9.0a" IN_LIST SRC_CUDA_ARCHS) list(REMOVE_ITEM SRC_CUDA_ARCHS "9.0a") - if ("9.0" IN_LIST TGT_CUDA_ARCHS) + if ("9.0" IN_LIST TGT_CUDA_ARCHS_) + list(REMOVE_ITEM TGT_CUDA_ARCHS_ "9.0") set(_CUDA_ARCHS "9.0a") endif() endif() list(SORT SRC_CUDA_ARCHS COMPARE NATURAL ORDER ASCENDING) - # for each ARCH in CUDA_ARCHS find the highest arch in SRC_CUDA_ARCHS that is - # less or eqault to ARCH - foreach(_ARCH ${CUDA_ARCHS}) - set(_TMP_ARCH) - foreach(_SRC_ARCH ${SRC_CUDA_ARCHS}) - if (_SRC_ARCH VERSION_LESS_EQUAL _ARCH) - set(_TMP_ARCH ${_SRC_ARCH}) - else() - break() + # for each ARCH in TGT_CUDA_ARCHS find the highest arch in SRC_CUDA_ARCHS that + # is less or equal to ARCH (but has the same major version since SASS binary + # compatibility is only forward compatible within the same major version). + foreach(_ARCH ${TGT_CUDA_ARCHS_}) + set(_TMP_ARCH) + # Extract the major version of the target arch + string(REGEX REPLACE "^([0-9]+)\\..*$" "\\1" TGT_ARCH_MAJOR "${_ARCH}") + foreach(_SRC_ARCH ${SRC_CUDA_ARCHS}) + # Extract the major version of the source arch + string(REGEX REPLACE "^([0-9]+)\\..*$" "\\1" SRC_ARCH_MAJOR "${_SRC_ARCH}") + # Check major-version match AND version-less-or-equal + if (_SRC_ARCH VERSION_LESS_EQUAL _ARCH) + if (SRC_ARCH_MAJOR STREQUAL TGT_ARCH_MAJOR) + set(_TMP_ARCH "${_SRC_ARCH}") + endif() + else() + # If we hit a version greater than the target, we can break + break() + endif() + endforeach() + + # If we found a matching _TMP_ARCH, append it to _CUDA_ARCHS + if (_TMP_ARCH) + list(APPEND _CUDA_ARCHS "${_TMP_ARCH}") endif() endforeach() - if (_TMP_ARCH) - list(APPEND _CUDA_ARCHS ${_TMP_ARCH}) - endif() - endforeach() list(REMOVE_DUPLICATES _CUDA_ARCHS) set(${OUT_CUDA_ARCHS} ${_CUDA_ARCHS} PARENT_SCOPE) diff --git a/csrc/attention/paged_attention_v1.cu b/csrc/attention/paged_attention_v1.cu index 3569b3c88abc..87bea0f3b279 100644 --- a/csrc/attention/paged_attention_v1.cu +++ b/csrc/attention/paged_attention_v1.cu @@ -100,6 +100,9 @@ void paged_attention_v1_launcher( // NOTE(woosuk): To reduce the compilation time, we only compile for the // head sizes that we use in the model. However, we can easily extend this // to support any head size which is a multiple of 16. + case 32: + LAUNCH_PAGED_ATTENTION_V1(32); + break; case 64: LAUNCH_PAGED_ATTENTION_V1(64); break; diff --git a/csrc/attention/paged_attention_v2.cu b/csrc/attention/paged_attention_v2.cu index bc543e713fe5..fc84a6774b8e 100644 --- a/csrc/attention/paged_attention_v2.cu +++ b/csrc/attention/paged_attention_v2.cu @@ -114,6 +114,9 @@ void paged_attention_v2_launcher( // NOTE(woosuk): To reduce the compilation time, we only compile for the // head sizes that we use in the model. However, we can easily extend this // to support any head size which is a multiple of 16. 
+ case 32: + LAUNCH_PAGED_ATTENTION_V2(32); + break; case 64: LAUNCH_PAGED_ATTENTION_V2(64); break; diff --git a/csrc/cpu/attention.cpp b/csrc/cpu/attention.cpp index ef5b14088c63..b9764056e8a2 100644 --- a/csrc/cpu/attention.cpp +++ b/csrc/cpu/attention.cpp @@ -460,11 +460,11 @@ void paged_attention_v1( torch::Tensor& value_cache, int64_t num_kv_heads, double scale, torch::Tensor& block_tables, torch::Tensor& seq_lens, int64_t block_size, int64_t max_seq_len, const std::optional& alibi_slopes, - const std::string& kv_cache_dtype, double k_scale, double v_scale, - const int64_t tp_rank, const int64_t blocksparse_local_blocks, + const std::string& kv_cache_dtype, torch::Tensor& k_scale, + torch::Tensor& v_scale, const int64_t tp_rank, + const int64_t blocksparse_local_blocks, const int64_t blocksparse_vert_stride, const int64_t blocksparse_block_size, const int64_t blocksparse_head_sliding_step) { - TORCH_CHECK(k_scale == 1.0f && v_scale == 1.0f); TORCH_CHECK(blocksparse_vert_stride <= 1, "CPU backend does not support blocksparse attention yet."); VLLM_DISPATCH_FLOATING_TYPES(query.scalar_type(), "paged_attention_v1_impl", @@ -782,11 +782,11 @@ void paged_attention_v2( torch::Tensor& value_cache, int64_t num_kv_heads, double scale, torch::Tensor& block_tables, torch::Tensor& seq_lens, int64_t block_size, int64_t max_seq_len, const std::optional& alibi_slopes, - const std::string& kv_cache_dtype, double k_scale, double v_scale, - const int64_t tp_rank, const int64_t blocksparse_local_blocks, + const std::string& kv_cache_dtype, torch::Tensor& k_scale, + torch::Tensor& v_scale, const int64_t tp_rank, + const int64_t blocksparse_local_blocks, const int64_t blocksparse_vert_stride, const int64_t blocksparse_block_size, const int64_t blocksparse_head_sliding_step) { - TORCH_CHECK(k_scale == 1.0f && v_scale == 1.0f); TORCH_CHECK(blocksparse_vert_stride <= 1, "CPU backend does not support blocksparse attention yet."); VLLM_DISPATCH_FLOATING_TYPES(query.scalar_type(), "paged_attention_v2_impl", diff --git a/csrc/cpu/cache.cpp b/csrc/cpu/cache.cpp index 31d454328b2c..e3809acad745 100644 --- a/csrc/cpu/cache.cpp +++ b/csrc/cpu/cache.cpp @@ -107,10 +107,8 @@ void copy_blocks(std::vector const& key_caches, void reshape_and_cache(torch::Tensor& key, torch::Tensor& value, torch::Tensor& key_cache, torch::Tensor& value_cache, torch::Tensor& slot_mapping, - const std::string& kv_cache_dtype, double k_scale, - double v_scale) { - TORCH_CHECK(k_scale == 1.0f && v_scale == 1.0f); - + const std::string& kv_cache_dtype, + torch::Tensor& k_scale, torch::Tensor& v_scale) { int num_tokens = key.size(0); int num_heads = key.size(1); int head_size = key.size(2); diff --git a/csrc/cpu/torch_bindings.cpp b/csrc/cpu/torch_bindings.cpp index 74e4d8189d40..5d1c5f4c83d3 100644 --- a/csrc/cpu/torch_bindings.cpp +++ b/csrc/cpu/torch_bindings.cpp @@ -30,7 +30,7 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { " Tensor value_cache, int num_kv_heads, float scale," " Tensor block_tables, Tensor seq_lens, int block_size," " int max_seq_len, Tensor? alibi_slopes," - " str kv_cache_dtype, float k_scale, float v_scale," + " str kv_cache_dtype, Tensor k_scale, Tensor v_scale," " int tp_rank, int blocksparse_local_blocks," " int blocksparse_vert_stride, int blocksparse_block_size," " int blocksparse_head_sliding_step) -> ()"); @@ -44,7 +44,7 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { " Tensor value_cache, int num_kv_heads, float scale," " Tensor block_tables, Tensor seq_lens, int block_size," " int max_seq_len, Tensor? 
alibi_slopes," - " str kv_cache_dtype, float k_scale, float v_scale," + " str kv_cache_dtype, Tensor k_scale, Tensor v_scale," " int tp_rank, int blocksparse_local_blocks," " int blocksparse_vert_stride, int blocksparse_block_size," " int blocksparse_head_sliding_step) -> ()"); @@ -148,7 +148,7 @@ TORCH_LIBRARY_EXPAND(CONCAT(TORCH_EXTENSION_NAME, _cache_ops), cache_ops) { " Tensor! key_cache, Tensor! value_cache," " Tensor slot_mapping," " str kv_cache_dtype," - " float k_scale, float v_scale) -> ()"); + " Tensor k_scale, Tensor v_scale) -> ()"); cache_ops.impl("reshape_and_cache", torch::kCPU, &reshape_and_cache); } diff --git a/csrc/cumem_allocator.cpp b/csrc/cumem_allocator.cpp new file mode 100644 index 000000000000..e8555d853b7a --- /dev/null +++ b/csrc/cumem_allocator.cpp @@ -0,0 +1,310 @@ +// A CUDAPluggableAllocator based on cumem* APIs. +// Important: allocation size, CUdeviceptr and CUmemGenericAllocationHandle* +// need to be unsigned long long +#include + +extern "C" { + +#define PY_SSIZE_T_CLEAN +#include + +#include +#include +#include + +#define CUDA_CHECK(condition) \ + do { \ + CUresult error = condition; \ + if (error != 0) { \ + char* error_string; \ + cuGetErrorString(error, (const char**)&error_string); \ + std::cerr << "CUDA Error: " << error_string << " at " << __FILE__ << ":" \ + << __LINE__ << std::endl; \ + } \ + } while (0) + +// Global references to Python callables +// NOTE: this is borrowed reference, so we don't need to DECREF them. +// This brings the limitation that the allocator needs to be singleton. +static PyObject* g_python_malloc_callback = nullptr; +static PyObject* g_python_free_callback = nullptr; + +// --------------------------------------------------------------------------- +// Helper functions: + +void ensure_context(unsigned long long device) { + CUcontext pctx; + CUDA_CHECK(cuCtxGetCurrent(&pctx)); + if (!pctx) { + // Ensure device context. 
+ CUDA_CHECK(cuDevicePrimaryCtxRetain(&pctx, device)); + CUDA_CHECK(cuCtxSetCurrent(pctx)); + } +} + +void create_and_map(unsigned long long device, ssize_t size, CUdeviceptr d_mem, + CUmemGenericAllocationHandle* p_memHandle) { + ensure_context(device); + // Define memory allocation properties + CUmemAllocationProp prop = {}; + prop.type = CU_MEM_ALLOCATION_TYPE_PINNED; + prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE; + prop.location.id = device; + prop.allocFlags.compressionType = CU_MEM_ALLOCATION_COMP_NONE; + + // Allocate memory using cuMemCreate + CUDA_CHECK(cuMemCreate(p_memHandle, size, &prop, 0)); + CUDA_CHECK(cuMemMap(d_mem, size, 0, *p_memHandle, 0)); + + CUmemAccessDesc accessDesc = {}; + accessDesc.location.type = CU_MEM_LOCATION_TYPE_DEVICE; + accessDesc.location.id = device; + accessDesc.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE; + + CUDA_CHECK(cuMemSetAccess(d_mem, size, &accessDesc, 1)); + // std::cout << "create_and_map: device=" << device << ", size=" << size << ", + // d_mem=" << d_mem << ", p_memHandle=" << p_memHandle << std::endl; +} + +void unmap_and_release(unsigned long long device, ssize_t size, + CUdeviceptr d_mem, + CUmemGenericAllocationHandle* p_memHandle) { + // std::cout << "unmap_and_release: device=" << device << ", size=" << size << + // ", d_mem=" << d_mem << ", p_memHandle=" << p_memHandle << std::endl; + ensure_context(device); + CUDA_CHECK(cuMemUnmap(d_mem, size)); + CUDA_CHECK(cuMemRelease(*p_memHandle)); +} + +PyObject* create_tuple_from_c_integers(unsigned long long a, + unsigned long long b, + unsigned long long c, + unsigned long long d) { + // Create a new tuple of size 4 + PyObject* tuple = PyTuple_New(4); + if (!tuple) { + return NULL; // Return NULL on failure + } + + // Convert integers to Python objects and set them in the tuple + PyTuple_SetItem( + tuple, 0, + PyLong_FromUnsignedLongLong(a)); // Steals reference to the PyLong + PyTuple_SetItem(tuple, 1, PyLong_FromUnsignedLongLong(b)); + PyTuple_SetItem(tuple, 2, PyLong_FromUnsignedLongLong(c)); + PyTuple_SetItem(tuple, 3, PyLong_FromUnsignedLongLong(d)); + + // Note: PyTuple_SetItem "steals" a reference to each object, + // so we do not need to Py_DECREF the PyLong objects explicitly. 
+ + return tuple; // Return the created tuple +} + +// --------------------------------------------------------------------------- +// Our exported C functions that call Python: + +// use CUstream instead of cudaStream_t, to avoid including cuda_runtime_api.h +void* my_malloc(ssize_t size, int device, CUstream stream) { + ensure_context(device); + + // first allocation, align the size, and reserve an address, and also allocate + // a CUmemGenericAllocationHandle + + // Define memory allocation properties + CUmemAllocationProp prop = {}; + prop.type = CU_MEM_ALLOCATION_TYPE_PINNED; + prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE; + prop.location.id = device; + prop.allocFlags.compressionType = CU_MEM_ALLOCATION_COMP_NONE; + + // Check if the allocation is supported + size_t granularity; + CUDA_CHECK(cuMemGetAllocationGranularity(&granularity, &prop, + CU_MEM_ALLOC_GRANULARITY_MINIMUM)); + + size_t alignedSize = ((size + granularity - 1) / granularity) * granularity; + + CUdeviceptr d_mem; + CUDA_CHECK(cuMemAddressReserve(&d_mem, alignedSize, 0, 0, 0)); + + // allocate the CUmemGenericAllocationHandle + CUmemGenericAllocationHandle* p_memHandle = + (CUmemGenericAllocationHandle*)malloc( + sizeof(CUmemGenericAllocationHandle)); + + if (!g_python_malloc_callback) { + std::cerr << "ERROR: g_python_malloc_callback not set.\n"; + return nullptr; + } + + // Acquire GIL (not in stable ABI officially, but often works) + PyGILState_STATE gstate = PyGILState_Ensure(); + + PyObject* arg_tuple = create_tuple_from_c_integers( + (unsigned long long)device, (unsigned long long)alignedSize, + (unsigned long long)d_mem, (unsigned long long)p_memHandle); + + // Call g_python_malloc_callback + PyObject* py_result = + PyObject_CallFunctionObjArgs(g_python_malloc_callback, arg_tuple, NULL); + Py_DECREF(arg_tuple); + + if (!py_result) { + PyErr_Print(); + PyGILState_Release(gstate); + return nullptr; + } + + PyGILState_Release(gstate); + + // do the final mapping + create_and_map(device, alignedSize, d_mem, p_memHandle); + + return (void*)d_mem; +} + +// use CUstream instead of cudaStream_t, to avoid including cuda_runtime_api.h +void my_free(void* ptr, ssize_t size, int device, CUstream stream) { + // get memory handle from the pointer + if (!g_python_free_callback) { + std::cerr << "ERROR: g_python_free_callback not set.\n"; + return; + } + + // Acquire GIL (not in stable ABI officially, but often works) + PyGILState_STATE gstate = PyGILState_Ensure(); + + PyObject* py_ptr = + PyLong_FromUnsignedLongLong(reinterpret_cast(ptr)); + + PyObject* py_result = + PyObject_CallFunctionObjArgs(g_python_free_callback, py_ptr, NULL); + + if (!py_result || !PyTuple_Check(py_result) || PyTuple_Size(py_result) != 4) { + PyErr_SetString(PyExc_TypeError, "Expected a tuple of size 4"); + return; + } + + unsigned long long recv_device, recv_size; + unsigned long long recv_d_mem, recv_p_memHandle; + // Unpack the tuple into four C integers + if (!PyArg_ParseTuple(py_result, "KKKK", &recv_device, &recv_size, + &recv_d_mem, &recv_p_memHandle)) { + // PyArg_ParseTuple sets an error if it fails + return; + } + + PyGILState_Release(gstate); + + // recv_size == size + // recv_device == device + + // Free memory + + CUdeviceptr d_mem = (CUdeviceptr)recv_d_mem; + CUmemGenericAllocationHandle* p_memHandle = + (CUmemGenericAllocationHandle*)recv_p_memHandle; + unmap_and_release(device, size, d_mem, p_memHandle); + + // free address and the handle + CUDA_CHECK(cuMemAddressFree(d_mem, size)); + free(p_memHandle); +} + +// 
--------------------------------------------------------------------------- +// Python extension boilerplate: + +// Python-exposed function: init_module(python_malloc, python_free) +static PyObject* py_init_module(PyObject* self, PyObject* args) { + PyObject* malloc_callback = nullptr; + PyObject* free_callback = nullptr; + + if (!PyArg_ParseTuple(args, "OO", &malloc_callback, &free_callback)) { + return nullptr; + } + + if (!PyCallable_Check(malloc_callback) || !PyCallable_Check(free_callback)) { + PyErr_SetString(PyExc_TypeError, "Both arguments must be callables"); + return nullptr; + } + + // Save the Python callables + // This module does not handle GC of these objects, so they must be kept alive + // outside of this module. + g_python_malloc_callback = malloc_callback; + g_python_free_callback = free_callback; + + Py_RETURN_NONE; +} + +static PyObject* python_unmap_and_release(PyObject* self, PyObject* args) { + if (!args || !PyTuple_Check(args) || PyTuple_Size(args) != 4) { + PyErr_SetString(PyExc_TypeError, "Expected a tuple of size 4"); + return nullptr; + } + + unsigned long long recv_device, recv_size; + unsigned long long recv_d_mem, recv_p_memHandle; + // Unpack the tuple into four C integers + if (!PyArg_ParseTuple(args, "KKKK", &recv_device, &recv_size, &recv_d_mem, + &recv_p_memHandle)) { + // PyArg_ParseTuple sets an error if it fails + return nullptr; + } + + CUdeviceptr d_mem_ptr = (CUdeviceptr)recv_d_mem; + CUmemGenericAllocationHandle* p_memHandle = + (CUmemGenericAllocationHandle*)recv_p_memHandle; + + unmap_and_release(recv_device, recv_size, d_mem_ptr, p_memHandle); + + Py_RETURN_NONE; +} + +static PyObject* python_create_and_map(PyObject* self, PyObject* args) { + if (!args || !PyTuple_Check(args) || PyTuple_Size(args) != 4) { + PyErr_SetString(PyExc_TypeError, "Expected a tuple of size 4"); + return nullptr; + } + + unsigned long long recv_device, recv_size; + unsigned long long recv_d_mem, recv_p_memHandle; + // Unpack the tuple into four C integers + if (!PyArg_ParseTuple(args, "KKKK", &recv_device, &recv_size, &recv_d_mem, + &recv_p_memHandle)) { + // PyArg_ParseTuple sets an error if it fails + return nullptr; + } + + CUdeviceptr d_mem_ptr = (CUdeviceptr)recv_d_mem; + CUmemGenericAllocationHandle* p_memHandle = + (CUmemGenericAllocationHandle*)recv_p_memHandle; + + create_and_map(recv_device, recv_size, d_mem_ptr, p_memHandle); + + Py_RETURN_NONE; +} + +static PyMethodDef module_methods[] = { + {"init_module", (PyCFunction)py_init_module, METH_VARARGS, + "Initialize module with python_malloc and python_free callables."}, + {"python_create_and_map", (PyCFunction)python_create_and_map, METH_VARARGS, + "Create and map memory on the device."}, + {"python_unmap_and_release", (PyCFunction)python_unmap_and_release, + METH_VARARGS, "Unmap and release memory on the device."}, + {NULL, NULL, 0, NULL} // sentinel +}; + +static struct PyModuleDef cumem_allocator_module = { + PyModuleDef_HEAD_INIT, "cumem_allocator", + "cumem-based allocator for CUDAPluggableAllocator", -1, module_methods}; + +PyMODINIT_FUNC PyInit_cumem_allocator(void) { + // Initialize the module + PyObject* module = PyModule_Create(&cumem_allocator_module); + if (!module) { + return NULL; + } + return module; +} +} // extern "C" diff --git a/csrc/moe/moe_align_sum_kernels.cu b/csrc/moe/moe_align_sum_kernels.cu index bb3b07e34b39..8b6fe72ad743 100644 --- a/csrc/moe/moe_align_sum_kernels.cu +++ b/csrc/moe/moe_align_sum_kernels.cu @@ -21,7 +21,7 @@ __device__ __forceinline__ int32_t index(int32_t 
total_col, int32_t row, } } // namespace -template <typename scalar_t> +template <typename scalar_t, typename token_cnts_t> __global__ void moe_align_block_size_kernel(scalar_t* __restrict__ topk_ids, int32_t* sorted_token_ids, int32_t* expert_ids, @@ -32,12 +32,10 @@ __global__ void moe_align_block_size_kernel(scalar_t* __restrict__ topk_ids, const size_t start_idx = threadIdx.x * tokens_per_thread; extern __shared__ int32_t shared_mem[]; - - int32_t* tokens_cnts = - shared_mem; // 2d tensor with shape (blockDim.x + 1, num_experts) - int32_t* cumsum = - shared_mem + - (blockDim.x + 1) * num_experts; // 1d tensor with shape (num_experts + 1) + int32_t* cumsum = shared_mem; // 1d tensor with shape (num_experts + 1) + token_cnts_t* tokens_cnts = + (token_cnts_t*)(shared_mem + num_experts + + 1); // 2d tensor with shape (blockDim.x + 1, num_experts) for (int i = 0; i < num_experts; ++i) { tokens_cnts[index(num_experts, threadIdx.x + 1, i)] = 0; @@ -74,7 +72,7 @@ __global__ void moe_align_block_size_kernel(scalar_t* __restrict__ topk_ids, block_size) * block_size; } - *total_tokens_post_pad = cumsum[num_experts]; + *total_tokens_post_pad = static_cast<int32_t>(cumsum[num_experts]); } __syncthreads(); @@ -224,26 +222,46 @@ void moe_align_block_size(torch::Tensor topk_ids, int64_t num_experts, torch::Tensor num_tokens_post_pad) { const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); - // If we have very large number of experts, we can no longer use shared - // memory. - // TODO(simon): the right solution should be calculating the exact right - // amount of shared memory and use that. The num_experts >= 256 is just a - // temporary solution to unblock Deepseek V3. - if (num_experts >= 96) { + int device_max_shared_mem; + auto dev = topk_ids.get_device(); + cudaDeviceGetAttribute(&device_max_shared_mem, + cudaDevAttrMaxSharedMemoryPerBlockOptin, dev); + + const int32_t num_thread = max((int32_t)num_experts, WARP_SIZE); + const int32_t shared_mem_i32 = + ((num_thread + 1) * num_experts + (num_experts + 1)) * sizeof(int32_t); + const int32_t shared_mem_i16 = + ((num_thread + 1) * num_experts) * sizeof(uint16_t) + + (num_experts + 1) * sizeof(int32_t); + + bool use_global_memory = false; + bool use_i16 = false; // Use uint16_t for shared memory token counts + if (shared_mem_i32 < device_max_shared_mem) { + // Do nothing in this case.
We're all set to use int32_t token counts + } else if (shared_mem_i16 < device_max_shared_mem && + topk_ids.numel() <= 65535) { + // when nelements of topk_ids is smaller than 65535 (max value of uint16), + // element value of token_cnts would also be smaller than 65535, + // so we can use uint16 as dtype of token_cnts + use_i16 = true; + } else { + use_global_memory = true; + } + + if (use_global_memory) { VLLM_DISPATCH_INTEGRAL_TYPES( topk_ids.scalar_type(), "moe_align_block_size_global_mem_kernel", [&] { // calc needed amount of shared mem for `tokens_cnts` and `cumsum` // tensors const int32_t num_thread = max((int32_t)num_experts, WARP_SIZE); - const int32_t mem_tokens_cnts = - ((num_experts + 1) * num_experts) * sizeof(int32_t); - const int32_t mem_cumsum = (num_experts + 1) * sizeof(int32_t); - // allocate global memory - int32_t* tokens_cnts; - int32_t* cumsum; - cudaMalloc(&tokens_cnts, mem_tokens_cnts); - cudaMalloc(&cumsum, mem_cumsum); + auto options_int = torch::TensorOptions() + .dtype(torch::kInt) + .device(topk_ids.device()); + torch::Tensor token_cnts_buffer = + torch::empty({(num_experts + 1) * num_experts}, options_int); + torch::Tensor cumsum_buffer = + torch::empty({num_experts + 1}, options_int); auto kernel = vllm::moe::moe_align_block_size_global_mem_kernel<scalar_t>; @@ -252,25 +270,32 @@ void moe_align_block_size(torch::Tensor topk_ids, int64_t num_experts, sorted_token_ids.data_ptr<int32_t>(), experts_ids.data_ptr<int32_t>(), num_tokens_post_pad.data_ptr<int32_t>(), num_experts, block_size, - topk_ids.numel(), tokens_cnts, cumsum); - cudaFree(tokens_cnts); - cudaFree(cumsum); + topk_ids.numel(), token_cnts_buffer.data_ptr<int32_t>(), + cumsum_buffer.data_ptr<int32_t>()); }); - } else { + } else if (use_i16) { VLLM_DISPATCH_INTEGRAL_TYPES( topk_ids.scalar_type(), "moe_align_block_size_kernel", [&] { - // calc needed amount of shared mem for `tokens_cnts` and `cumsum` - // tensors - const int32_t num_thread = max((int32_t)num_experts, WARP_SIZE); - const int32_t shared_mem = - ((num_thread + 1) * num_experts + (num_experts + 1)) * - sizeof(int32_t); - // set dynamic shared mem - auto kernel = vllm::moe::moe_align_block_size_kernel<scalar_t>; + auto kernel = + vllm::moe::moe_align_block_size_kernel<scalar_t, uint16_t>; + AT_CUDA_CHECK(VLLM_DevFuncAttribute_SET_MaxDynamicSharedMemorySize( + (void*)kernel, shared_mem_i16)); + kernel<<<1, num_thread, shared_mem_i16, stream>>>( + topk_ids.data_ptr<scalar_t>(), + sorted_token_ids.data_ptr<int32_t>(), + experts_ids.data_ptr<int32_t>(), + num_tokens_post_pad.data_ptr<int32_t>(), num_experts, block_size, + topk_ids.numel()); + }); + } else { + VLLM_DISPATCH_INTEGRAL_TYPES( + topk_ids.scalar_type(), "moe_align_block_size_kernel", [&] { + auto kernel = + vllm::moe::moe_align_block_size_kernel<scalar_t, int32_t>; AT_CUDA_CHECK(VLLM_DevFuncAttribute_SET_MaxDynamicSharedMemorySize( - (void*)kernel, shared_mem)); - kernel<<<1, num_thread, shared_mem, stream>>>( + (void*)kernel, shared_mem_i32)); + kernel<<<1, num_thread, shared_mem_i32, stream>>>( topk_ids.data_ptr<scalar_t>(), sorted_token_ids.data_ptr<int32_t>(), experts_ids.data_ptr<int32_t>(), diff --git a/csrc/rocm/attention.cu b/csrc/rocm/attention.cu index ab8edd6d0f57..1e50e932268f 100644 --- a/csrc/rocm/attention.cu +++ b/csrc/rocm/attention.cu @@ -995,7 +995,7 @@ __global__ __launch_bounds__(NUM_THREADS) void paged_attention_ll4mi_QKV_kernel( scalar_t* __restrict__ out, // [num_seqs, num_heads, max_num_partitions, // head_size] OUTT* __restrict__ final_out, // [num_seqs, num_heads, head_size] - int max_ctx_blocks, const float* k_scale_ptr, const float* v_scale_ptr, + int max_ctx_blocks, const float* k_scale, const float* v_scale, const
float* __restrict__ fp8_out_scale_ptr) { UNREACHABLE_CODE } diff --git a/docs/source/api/multimodal/inputs.md b/docs/source/api/multimodal/inputs.md index 76b2fb95a500..21bd938be9e8 100644 --- a/docs/source/api/multimodal/inputs.md +++ b/docs/source/api/multimodal/inputs.md @@ -43,7 +43,7 @@ ``` ```{eval-rst} -.. autoclass:: vllm.multimodal.inputs.MultiModalInputsV2 +.. autoclass:: vllm.multimodal.inputs.MultiModalInputs :members: :show-inheritance: ``` diff --git a/docs/source/community/blog.md b/docs/source/community/blog.md new file mode 100644 index 000000000000..e8030edfa02e --- /dev/null +++ b/docs/source/community/blog.md @@ -0,0 +1,3 @@ +# vLLM Blog + +vLLM blog posts are published [here](https://blog.vllm.ai/). diff --git a/docs/source/community/meetups.md b/docs/source/community/meetups.md index 43fa9ee61609..ab5ea147f4c6 100644 --- a/docs/source/community/meetups.md +++ b/docs/source/community/meetups.md @@ -4,6 +4,7 @@ We host regular meetups in San Francisco Bay Area every 2 months. We will share the project updates from the vLLM team and have guest speakers from the industry to share their experience and insights. Please find the materials of our previous meetups below: +- [The eighth vLLM meetup](https://lu.ma/zep56hui), with Google Cloud, January 22nd 2025. [[Slides]](https://docs.google.com/presentation/d/1epVkt4Zu8Jz_S5OhEHPc798emsYh2BwYfRuDDVEF7u4/edit?usp=sharing) - [The seventh vLLM meetup](https://lu.ma/h0qvrajz), with Snowflake, November 14th 2024. [[Slides]](https://docs.google.com/presentation/d/1e3CxQBV3JsfGp30SwyvS3eM_tW-ghOhJ9PAJGK6KR54/edit?usp=sharing) - [The sixth vLLM meetup](https://lu.ma/87q3nvnh), with NVIDIA, September 9th 2024. [[Slides]](https://docs.google.com/presentation/d/1wrLGwytQfaOTd5wCGSPNhoaW3nq0E-9wqyP7ny93xRs/edit?usp=sharing) - [The fifth vLLM meetup](https://lu.ma/lp0gyjqr), with AWS, July 24th 2024. [[Slides]](https://docs.google.com/presentation/d/1RgUD8aCfcHocghoP3zmXzck9vX3RCI9yfUAB2Bbcl4Y/edit?usp=sharing) diff --git a/docs/source/contributing/vulnerability_management.md b/docs/source/contributing/vulnerability_management.md index 422dc13e6a64..a9bbfde2af77 100644 --- a/docs/source/contributing/vulnerability_management.md +++ b/docs/source/contributing/vulnerability_management.md @@ -41,3 +41,20 @@ You may use the `#security` channel in the [VLLM Slack](https://slack.vllm.ai) to discuss security-related topics. However, please do not disclose any vulnerabilities in this channel. If you need to report a vulnerability, please use the GitHub security advisory system or contact a VMT member privately. + +## Vulnerability Disclosure + +The process for disclosing vulnerabilities is the following: + +- The VMT will work with the project maintainers to develop a fix for the + vulnerability. +- The VMT will coordinate with the reporter and project maintainers to prepare a + security advisory that adequately describes the vulnerability and its impact. +- The VMT will coordinate with the project maintainers to publish a fix and + release an update that includes that fix. +- The VMT will publish the security advisory on GitHub. Release notes will be + updated to include a reference to the security advisory. + +The VMT and project maintainers will work to minimize the amount of time in +between disclosing any public information about the vulnerability and making a +release and advisory available. 
diff --git a/docs/source/features/compatibility_matrix.md b/docs/source/features/compatibility_matrix.md index 86a82eb36df3..47ab616b3068 100644 --- a/docs/source/features/compatibility_matrix.md +++ b/docs/source/features/compatibility_matrix.md @@ -307,7 +307,7 @@ Check the '✗' with links to see tracking issue for unsupported feature/hardwar - ✅ - ? - ? - - ✅ + - [✗](gh-issue:11484) - ✅ - ✗ - ? diff --git a/docs/source/features/quantization/fp8_e4m3_kvcache.md b/docs/source/features/quantization/fp8_e4m3_kvcache.md deleted file mode 100644 index bdc6d9da11ab..000000000000 --- a/docs/source/features/quantization/fp8_e4m3_kvcache.md +++ /dev/null @@ -1,41 +0,0 @@ -(fp8-e4m3-kvcache)= - -# FP8 E4M3 KV Cache - -Quantizing the KV cache to FP8 reduces its memory footprint. This increases the number of tokens that can be stored in the cache, -improving throughput. OCP (Open Compute Project www.opencompute.org) specifies two common 8-bit floating point data formats: E5M2 -(5 exponent bits and 2 mantissa bits) and E4M3FN (4 exponent bits and 3 mantissa bits), often shortened as E4M3. One benefit of -the E4M3 format over E5M2 is that floating point numbers are represented in higher precision. However, the small dynamic range of -FP8 E4M3 (±240.0 can be represented) typically necessitates the use of a higher-precision (typically FP32) scaling factor alongside -each quantized tensor. For now, only per-tensor (scalar) scaling factors are supported. Development is ongoing to support scaling -factors of a finer granularity (e.g. per-channel). - -These scaling factors can be specified by passing an optional quantization param JSON to the LLM engine at load time. If -this JSON is not specified, scaling factors default to 1.0. These scaling factors are typically obtained when running an -unquantized model through a quantizer tool (e.g. AMD quantizer or NVIDIA AMMO). - -To install AMMO (AlgorithMic Model Optimization): - -```console -pip install --no-cache-dir --extra-index-url https://pypi.nvidia.com nvidia-ammo -``` - -Studies have shown that FP8 E4M3 quantization typically only minimally degrades inference accuracy. The most recent silicon -offerings e.g. AMD MI300, NVIDIA Hopper or later support native hardware conversion to and from fp32, fp16, bf16, etc. -Thus, LLM inference is greatly accelerated with minimal accuracy loss. - -Here is an example of how to enable this feature: - -```python -# To calculate kv cache scales on the fly enable the calculate_kv_scales -# parameter - -from vllm import LLM, SamplingParams -sampling_params = SamplingParams(temperature=1.3, top_p=0.8) -llm = LLM(model="meta-llama/Llama-2-7b-chat-hf", - kv_cache_dtype="fp8", - calculate_kv_scales=True) -prompt = "London is the capital of" -out = llm.generate(prompt, sampling_params)[0].outputs[0].text -print(out) -``` diff --git a/docs/source/features/quantization/fp8_e5m2_kvcache.md b/docs/source/features/quantization/fp8_e5m2_kvcache.md deleted file mode 100644 index 3a81ab17f332..000000000000 --- a/docs/source/features/quantization/fp8_e5m2_kvcache.md +++ /dev/null @@ -1,31 +0,0 @@ -(fp8-kv-cache)= - -# FP8 E5M2 KV Cache - -The int8/int4 quantization scheme requires additional scale GPU memory storage, which reduces the expected GPU memory benefits. -The FP8 data format retains 2~3 mantissa bits and can convert float/fp16/bfloat16 and fp8 to each other. - -Here is an example of how to enable this feature: - -```python -from vllm import LLM, SamplingParams -# Sample prompts. 
-prompts = [ - "Hello, my name is", - "The president of the United States is", - "The capital of France is", - "The future of AI is", -] -# Create a sampling params object. -sampling_params = SamplingParams(temperature=0.8, top_p=0.95) -# Create an LLM. -llm = LLM(model="facebook/opt-125m", kv_cache_dtype="fp8") -# Generate texts from the prompts. The output is a list of RequestOutput objects -# that contain the prompt, generated text, and other information. -outputs = llm.generate(prompts, sampling_params) -# Print the outputs. -for output in outputs: - prompt = output.prompt - generated_text = output.outputs[0].text - print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") -``` diff --git a/docs/source/features/quantization/index.md b/docs/source/features/quantization/index.md index 861cb165c11c..56ccdb5f00c3 100644 --- a/docs/source/features/quantization/index.md +++ b/docs/source/features/quantization/index.md @@ -14,6 +14,5 @@ bnb gguf int8 fp8 -fp8_e5m2_kvcache -fp8_e4m3_kvcache +quantized_kvcache ``` diff --git a/docs/source/features/quantization/quantized_kvcache.md b/docs/source/features/quantization/quantized_kvcache.md new file mode 100644 index 000000000000..9f36c2949e0d --- /dev/null +++ b/docs/source/features/quantization/quantized_kvcache.md @@ -0,0 +1,147 @@ +(quantized-kvcache)= + +# Quantized KV Cache + +## FP8 KV Cache + +Quantizing the KV cache to FP8 reduces its memory footprint. This increases the number of tokens that can be stored in the cache, improving throughput. + +### FP8 Formats + +[OCP (Open Compute Project)](https://www.opencompute.org) specifies two common 8-bit floating point data formats: + +- E5M2 (5 exponent bits and 2 mantissa bits) +- E4M3FN (4 exponent bits and 3 mantissa bits, often shortened as E4M3) + +The E4M3 format offers higher precision compared to E5M2. However, due to its small dynamic range (±240.0), E4M3 typically requires a higher-precision (FP32) scaling factor alongside each quantized tensor. + +### Current Limitations + +For now, only per-tensor (scalar) scaling factors are supported. Development is ongoing to support scaling factors of a finer granularity (e.g. per-channel). + +### Performance Impact + +The current FP8 KV cache implementation primarily benefits throughput by allowing approximately double the amount of space for KV cache allocation. This enables either: + +- Processing longer context lengths for individual requests, or +- Handling more concurrent request batches + +However, there are currently no latency improvements as the implementation does not yet include fused dequantization and attention operations. Future releases will support quantized attention with hardware acceleration, which should provide additional performance benefits. While the most recent silicon offerings (e.g. AMD MI300, NVIDIA Hopper or later) support native hardware conversion between FP8 and other formats (fp32, fp16, bf16), this benefit is not yet fully realized. + +Studies have shown that FP8 E4M3 quantization typically only minimally degrades inference accuracy, making it a practical choice for throughput optimization. 
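As a rough, back-of-the-envelope illustration of the claim above (the numbers describe a hypothetical Llama-2-7B-like configuration with 32 layers, 32 KV heads, and head size 128, and are illustrative rather than measured), halving the bytes stored per cache element roughly doubles the number of tokens that fit in the same KV cache budget:

```python
# Rough KV cache sizing sketch (illustrative numbers, not measured).
num_layers = 32    # decoder layers in the hypothetical model
num_kv_heads = 32  # KV heads per layer
head_size = 128    # dimension per head

def kv_bytes_per_token(bytes_per_elem: int) -> int:
    # Factor of 2 accounts for storing both K and V for every layer.
    return 2 * num_layers * num_kv_heads * head_size * bytes_per_elem

fp16_per_token = kv_bytes_per_token(2)  # FP16/BF16 cache
fp8_per_token = kv_bytes_per_token(1)   # FP8 cache

budget = 40 * 1024**3  # suppose 40 GiB of GPU memory is left for the KV cache
print(f"FP16: {fp16_per_token // 1024} KiB/token, ~{budget // fp16_per_token} tokens fit")
print(f"FP8:  {fp8_per_token // 1024} KiB/token, ~{budget // fp8_per_token} tokens fit")
```

The absolute numbers depend entirely on the model and on the memory left over after weights and activations; the point is the ratio, which is where the roughly 2x KV cache capacity (and hence the throughput benefit) comes from.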
+ +## Usage Example + +Here is an example of how to enable FP8 quantization: + +```python +# To calculate kv cache scales on the fly enable the calculate_kv_scales +# parameter + +from vllm import LLM, SamplingParams + +sampling_params = SamplingParams(temperature=0.7, top_p=0.8) +llm = LLM(model="meta-llama/Llama-2-7b-chat-hf", + kv_cache_dtype="fp8", + calculate_kv_scales=True) +prompt = "London is the capital of" +out = llm.generate(prompt, sampling_params)[0].outputs[0].text +print(out) +``` + +The `kv_cache_dtype` argument specifies the data type for KV cache storage: +- `"auto"`: Uses the model's default "unquantized" data type +- `"fp8"` or `"fp8_e4m3"`: Supported on CUDA 11.8+ and ROCm (AMD GPU) +- `"fp8_e5m2"`: Supported on CUDA 11.8+ + +## Calibrated Scales for Better Accuracy + +For optimal model quality when using FP8 KV Cache, we recommend using calibrated scales tuned to representative inference data. [LLM Compressor](https://github.com/vllm-project/llm-compressor/) is the recommended tool for this process. + +### Installation + +First, install the required dependencies: + +```console +pip install llmcompressor +``` + +### Example Usage + +Here's a complete example using `meta-llama/Llama-3.1-8B-Instruct` (most models can use this same pattern): + +```python +from datasets import load_dataset +from transformers import AutoModelForCausalLM, AutoTokenizer +from llmcompressor.transformers import oneshot + +# Select model and load it +MODEL_ID = "meta-llama/Llama-3.1-8B-Instruct" +model = AutoModelForCausalLM.from_pretrained(MODEL_ID, device_map="auto", torch_dtype="auto") +tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) + +# Select calibration dataset +DATASET_ID = "HuggingFaceH4/ultrachat_200k" +DATASET_SPLIT = "train_sft" + +# Configure calibration parameters +NUM_CALIBRATION_SAMPLES = 512 # 512 samples is a good starting point +MAX_SEQUENCE_LENGTH = 2048 + +# Load and preprocess dataset +ds = load_dataset(DATASET_ID, split=DATASET_SPLIT) +ds = ds.shuffle(seed=42).select(range(NUM_CALIBRATION_SAMPLES)) + +def process_and_tokenize(example): + text = tokenizer.apply_chat_template(example["messages"], tokenize=False) + return tokenizer( + text, + padding=False, + max_length=MAX_SEQUENCE_LENGTH, + truncation=True, + add_special_tokens=False, + ) + +ds = ds.map(process_and_tokenize, remove_columns=ds.column_names) + +# Configure quantization settings +recipe = """ +quant_stage: + quant_modifiers: + QuantizationModifier: + kv_cache_scheme: + num_bits: 8 + type: float + strategy: tensor + dynamic: false + symmetric: true +""" + +# Apply quantization +oneshot( + model=model, + dataset=ds, + recipe=recipe, + max_seq_length=MAX_SEQUENCE_LENGTH, + num_calibration_samples=NUM_CALIBRATION_SAMPLES, +) + +# Save quantized model +SAVE_DIR = MODEL_ID.split("/")[1] + "-FP8-KV" +model.save_pretrained(SAVE_DIR, save_compressed=True) +tokenizer.save_pretrained(SAVE_DIR) +``` + +The above script will create a folder in your current directory containing your quantized model (e.g., `Llama-3.1-8B-Instruct-FP8-KV`) with calibrated scales. + +When running the model you must specify `kv_cache_dtype="fp8"` in order to enable the kv cache quantization and use the scales. 
+ +```python +from vllm import LLM, SamplingParams + +sampling_params = SamplingParams(temperature=0.7, top_p=0.8) +llm = LLM(model="Llama-3.1-8B-Instruct-FP8-KV", kv_cache_dtype="fp8") +prompt = "London is the capital of" +out = llm.generate(prompt, sampling_params)[0].outputs[0].text +print(out) +``` diff --git a/docs/source/getting_started/installation/ai_accelerator/hpu-gaudi.inc.md b/docs/source/getting_started/installation/ai_accelerator/hpu-gaudi.inc.md index b4695d504b60..ae42dd0c0d08 100644 --- a/docs/source/getting_started/installation/ai_accelerator/hpu-gaudi.inc.md +++ b/docs/source/getting_started/installation/ai_accelerator/hpu-gaudi.inc.md @@ -59,6 +59,7 @@ To build and install vLLM from source, run: ```console git clone https://github.com/vllm-project/vllm.git cd vllm +pip install -r requirements-hpu.txt python setup.py develop ``` @@ -68,6 +69,7 @@ Currently, the latest features and performance optimizations are developed in Ga git clone https://github.com/HabanaAI/vllm-fork.git cd vllm-fork git checkout habana_main +pip install -r requirements-hpu.txt python setup.py develop ``` diff --git a/docs/source/getting_started/installation/gpu/rocm.inc.md b/docs/source/getting_started/installation/gpu/rocm.inc.md index 4256027e6c40..69238f6e36fb 100644 --- a/docs/source/getting_started/installation/gpu/rocm.inc.md +++ b/docs/source/getting_started/installation/gpu/rocm.inc.md @@ -13,6 +13,14 @@ vLLM supports AMD GPUs with ROCm 6.2. Currently, there are no pre-built ROCm wheels. +However, the [AMD Infinity hub for vLLM](https://hub.docker.com/r/rocm/vllm/tags) offers a prebuilt, optimized +docker image designed for validating inference performance on the AMD Instinct™ MI300X accelerator. + +```{tip} +Please check [LLM inference performance validation on AMD Instinct MI300X](https://rocm.docs.amd.com/en/latest/how-to/performance-validation/mi300x/vllm-benchmark.html) +for instructions on how to use this prebuilt docker image. +``` + ### Build wheel from source 0. Install prerequisites (skip if you are already in an environment/docker with the following installed): @@ -123,11 +131,10 @@ It is important that the user kicks off the docker build using buildkit. Either uses ROCm 6.2 by default, but also supports ROCm 5.7, 6.0 and 6.1 in older vLLM branches. It provides flexibility to customize the build of docker image using the following arguments: -- `BASE_IMAGE`: specifies the base image used when running `docker build`, specifically the PyTorch on ROCm base image. -- `BUILD_FA`: specifies whether to build CK flash-attention. The default is 1. For [Radeon RX 7900 series (gfx1100)](https://rocm.docs.amd.com/projects/radeon/en/latest/index.html), this should be set to 0 before flash-attention supports this target. -- `FX_GFX_ARCHS`: specifies the GFX architecture that is used to build CK flash-attention, for example, `gfx90a;gfx942` for MI200 and MI300. The default is `gfx90a;gfx942` -- `FA_BRANCH`: specifies the branch used to build the CK flash-attention in [ROCm's flash-attention repo](https://github.com/ROCmSoftwarePlatform/flash-attention). The default is `ae7928c` -- `BUILD_TRITON`: specifies whether to build triton flash-attention. The default value is 1. +- `BASE_IMAGE`: specifies the base image used when running `docker build`. The default value `rocm/vllm-dev:base` is an image published and maintained by AMD. 
It is being built using + - `USE_CYTHON`: An option to run Cython compilation on a subset of Python files during the docker build + - `BUILD_RPD`: Includes the RocmProfileData profiling tool in the image + - `ARG_PYTORCH_ROCM_ARCH`: Allows overriding the gfx architecture values from the base docker image Their values can be passed in when running `docker build` with `--build-arg` options. @@ -137,10 +144,10 @@ To build vllm on ROCm 6.2 for MI200 and MI300 series, you can use the default: DOCKER_BUILDKIT=1 docker build -f Dockerfile.rocm -t vllm-rocm . ``` -To build vllm on ROCm 6.2 for Radeon RX7900 series (gfx1100), you should specify `BUILD_FA` as below: +To build vllm on ROCm 6.2 for Radeon RX7900 series (gfx1100), you should pick the alternative base image: ```console -DOCKER_BUILDKIT=1 docker build --build-arg BUILD_FA="0" -f Dockerfile.rocm -t vllm-rocm . +DOCKER_BUILDKIT=1 docker build --build-arg BASE_IMAGE="rocm/vllm-dev:navi_base" -f Dockerfile.rocm -t vllm-rocm . ``` To run the above docker image `vllm-rocm`, use the below command: diff --git a/docs/source/getting_started/troubleshooting.md b/docs/source/getting_started/troubleshooting.md index 1e290d2b4c0b..7bfe9b4036ad 100644 --- a/docs/source/getting_started/troubleshooting.md +++ b/docs/source/getting_started/troubleshooting.md @@ -22,9 +22,9 @@ It'd be better to store the model in a local disk. Additionally, have a look at To isolate the model downloading and loading issue, you can use the `--load-format dummy` argument to skip loading the model weights. This way, you can check if the model downloading and loading is the bottleneck. ``` -## Model is too large +## Out of memory -If the model is too large to fit in a single GPU, you might want to [consider tensor parallelism](#distributed-serving) to split the model across multiple GPUs. In that case, every process will read the whole model and split it into chunks, which makes the disk reading time even longer (proportional to the size of tensor parallelism). You can convert the model checkpoint to a sharded checkpoint using . The conversion process might take some time, but later you can load the sharded checkpoint much faster. The model loading time should remain constant regardless of the size of tensor parallelism. +If the model is too large to fit in a single GPU, you will get an out-of-memory (OOM) error. Consider [using tensor parallelism](#distributed-serving) to split the model across multiple GPUs. In that case, every process will read the whole model and split it into chunks, which makes the disk reading time even longer (proportional to the size of tensor parallelism). You can convert the model checkpoint to a sharded checkpoint using . The conversion process might take some time, but later you can load the sharded checkpoint much faster. The model loading time should remain constant regardless of the size of tensor parallelism. ## Enable more logging @@ -197,6 +197,63 @@ if __name__ == '__main__': llm = vllm.LLM(...) ``` +## `torch.compile` Error + +vLLM heavily depends on `torch.compile` to optimize the model for better performance, which introduces the dependency on the `torch.compile` functionality and the `triton` library. By default, we use `torch.compile` to [optimize some functions](https://github.com/vllm-project/vllm/pull/10406) in the model.
Before running vLLM, you can check if `torch.compile` is working as expected by running the following script: + +```python +import torch + +@torch.compile +def f(x): + # a simple function to test torch.compile + x = x + 1 + x = x * 2 + x = x.sin() + return x + +x = torch.randn(4, 4).cuda() +print(f(x)) +``` + +If it raises errors from `torch/_inductor` directory, usually it means you have a custom `triton` library that is not compatible with the version of PyTorch you are using. See [this issue](https://github.com/vllm-project/vllm/issues/12219) for example. + +## Model failed to be inspected + +If you see an error like: + +```text + File "vllm/model_executor/models/registry.py", line xxx, in _raise_for_unsupported + raise ValueError( +ValueError: Model architectures [''] failed to be inspected. Please check the logs for more details. +``` + +It means that vLLM failed to import the model file. +Usually, it is related to missing dependencies or outdated binaries in the vLLM build. +Please read the logs carefully to determine the root cause of the error. + +## Model not supported + +If you see an error like: + +```text +Traceback (most recent call last): +... + File "vllm/model_executor/models/registry.py", line xxx, in inspect_model_cls + for arch in architectures: +TypeError: 'NoneType' object is not iterable +``` + +or: + +```text + File "vllm/model_executor/models/registry.py", line xxx, in _raise_for_unsupported + raise ValueError( +ValueError: Model architectures [''] are not supported for now. Supported architectures: [...] +``` + +But you are sure that the model is in the [list of supported models](#supported-models), there may be some issue with vLLM's model resolution. In that case, please follow [these steps](#model-resolution) to explicitly specify the vLLM implementation for the model. + ## Known Issues - In `v0.5.2`, `v0.5.3`, and `v0.5.3.post1`, there is a bug caused by [zmq](https://github.com/zeromq/pyzmq/issues/2000) , which can occasionally cause vLLM to hang depending on the machine configuration. The solution is to upgrade to the latest version of `vllm` to include the [fix](gh-pr:6759). diff --git a/docs/source/index.md b/docs/source/index.md index d7a1117df9c2..2c302d3f3e86 100644 --- a/docs/source/index.md +++ b/docs/source/index.md @@ -184,6 +184,7 @@ api/model/index :caption: Community :maxdepth: 1 +community/blog community/meetups community/sponsors ``` diff --git a/docs/source/models/supported_models.md b/docs/source/models/supported_models.md index 3da5aaf713c1..8cdc663a0320 100644 --- a/docs/source/models/supported_models.md +++ b/docs/source/models/supported_models.md @@ -302,8 +302,8 @@ See [this page](#generative-models) for more information on how to use generativ - ✅︎ - ✅︎ * - `Phi3ForCausalLM` - - Phi-3 - - `microsoft/Phi-3-mini-4k-instruct`, `microsoft/Phi-3-mini-128k-instruct`, `microsoft/Phi-3-medium-128k-instruct`, etc. + - Phi-4, Phi-3 + - `microsoft/Phi-4`, `microsoft/Phi-3-mini-4k-instruct`, `microsoft/Phi-3-mini-128k-instruct`, `microsoft/Phi-3-medium-128k-instruct`, etc. - ✅︎ - ✅︎ * - `Phi3SmallForCausalLM` diff --git a/docs/source/serving/offline_inference.md b/docs/source/serving/offline_inference.md index 1f5a54f755f1..8a18598665a7 100644 --- a/docs/source/serving/offline_inference.md +++ b/docs/source/serving/offline_inference.md @@ -31,6 +31,8 @@ Please refer to the above pages for more details about each API. This section lists the most common options for running the vLLM engine. 
For a full list, refer to the [Engine Arguments](#engine-args) page. +(model-resolution)= + ### Model resolution vLLM loads HuggingFace-compatible models by inspecting the `architectures` field in `config.json` of the model repository @@ -41,37 +43,6 @@ Nevertheless, our model resolution may fail for the following reasons: - Unofficial repositories refer to a model using alternative names which are not recorded in vLLM. - The same architecture name is used for multiple models, creating ambiguity as to which model should be loaded. -In those cases, vLLM may throw an error like: - -```text -Traceback (most recent call last): -... - File "vllm/model_executor/models/registry.py", line xxx, in inspect_model_cls - for arch in architectures: -TypeError: 'NoneType' object is not iterable -``` - -or: - -```text - File "vllm/model_executor/models/registry.py", line xxx, in _raise_for_unsupported - raise ValueError( -ValueError: Model architectures [''] are not supported for now. Supported architectures: [...] -``` - -:::{note} -The above error is distinct from the following similar but different error: - -```text - File "vllm/model_executor/models/registry.py", line xxx, in _raise_for_unsupported - raise ValueError( -ValueError: Model architectures [''] failed to be inspected. Please check the logs for more details. -``` - -This error means that vLLM failed to import the model file. Usually, it is related to missing dependencies or outdated -binaries in the vLLM build. Please read the logs carefully to determine the real cause of the error. -::: - To fix this, explicitly specify the model architecture by passing `config.json` overrides to the `hf_overrides` option. For example: diff --git a/docs/source/serving/openai_compatible_server.md b/docs/source/serving/openai_compatible_server.md index e49bbb06695f..8bc234545bef 100644 --- a/docs/source/serving/openai_compatible_server.md +++ b/docs/source/serving/openai_compatible_server.md @@ -50,6 +50,11 @@ In addition, we have the following custom APIs: - Applicable to all [pooling models](../models/pooling_models.md). - [Score API](#score-api) (`/score`) - Only applicable to [cross-encoder models](../models/pooling_models.md) (`--task score`). +- [Re-rank API](#rerank-api) (`/rerank`, `/v1/rerank`, `/v2/rerank`) + - Implements [Jina AI's v1 re-rank API](https://jina.ai/reranker/) + - Also compatible with [Cohere's v1 & v2 re-rank APIs](https://docs.cohere.com/v2/reference/rerank) + - Jina and Cohere's APIs are very similar; Jina's includes extra information in the rerank endpoint's response. + - Only applicable to [cross-encoder models](../models/pooling_models.md) (`--task score`). (chat-template)= @@ -473,3 +478,90 @@ The following extra parameters are supported: :start-after: begin-score-extra-params :end-before: end-score-extra-params ``` + +(rerank-api)= + +### Re-rank API + +Our Re-rank API applies a cross-encoder model to predict relevance scores between a single query and +each document in a list. Usually, the score for a sentence pair refers to the similarity between two sentences, on +a scale of 0 to 1. + +You can find the documentation for these kinds of models at [sbert.net](https://www.sbert.net/docs/package_reference/cross_encoder/cross_encoder.html). + +The rerank endpoints support popular re-rank models such as `BAAI/bge-reranker-base` and other models supporting the +`score` task.
Additionally, `/rerank`, `/v1/rerank`, and `/v2/rerank` +endpoints are compatible with both [Jina AI's re-rank API interface](https://jina.ai/reranker/) and +[Cohere's re-rank API interface](https://docs.cohere.com/v2/reference/rerank) to ensure compatibility with +popular open-source tools. + +Code example: + +#### Example Request + +Note that the `top_n` request parameter is optional and will default to the length of the `documents` field. +Result documents will be sorted by relevance, and the `index` property can be used to determine original order. + +Request: + +```bash +curl -X 'POST' \ + 'http://127.0.0.1:8000/v1/rerank' \ + -H 'accept: application/json' \ + -H 'Content-Type: application/json' \ + -d '{ + "model": "BAAI/bge-reranker-base", + "query": "What is the capital of France?", + "documents": [ + "The capital of Brazil is Brasilia.", + "The capital of France is Paris.", + "Horses and cows are both animals" + ] +}' +``` + +Response: + +```bash +{ + "id": "rerank-fae51b2b664d4ed38f5969b612edff77", + "model": "BAAI/bge-reranker-base", + "usage": { + "total_tokens": 56 + }, + "results": [ + { + "index": 1, + "document": { + "text": "The capital of France is Paris." + }, + "relevance_score": 0.99853515625 + }, + { + "index": 0, + "document": { + "text": "The capital of Brazil is Brasilia." + }, + "relevance_score": 0.0005860328674316406 + } + ] +} +``` + +#### Extra parameters + +The following [pooling parameters](#pooling-params) are supported. + +```{literalinclude} ../../../vllm/entrypoints/openai/protocol.py +:language: python +:start-after: begin-rerank-pooling-params +:end-before: end-rerank-pooling-params +``` + +The following extra parameters are supported: + +```{literalinclude} ../../../vllm/entrypoints/openai/protocol.py +:language: python +:start-after: begin-rerank-extra-params +:end-before: end-rerank-extra-params +``` diff --git a/examples/offline_inference/openai/openai_batch.md b/examples/offline_inference/openai/openai_batch.md index a4774e57cd9a..953e6ef130f1 100644 --- a/examples/offline_inference/openai/openai_batch.md +++ b/examples/offline_inference/openai/openai_batch.md @@ -13,7 +13,7 @@ The OpenAI batch file format consists of a series of json objects on new lines. Each line represents a separate request. See the [OpenAI package reference](https://platform.openai.com/docs/api-reference/batch/requestInput) for more details. ```{note} -We currently only support `/v1/chat/completions` and `/v1/embeddings` endpoints (completions coming soon). +We currently support `/v1/chat/completions`, `/v1/embeddings`, and `/v1/score` endpoints (completions coming soon). ``` ## Pre-requisites @@ -203,3 +203,34 @@ $ cat results.jsonl {"id":"vllm-db0f71f7dec244e6bce530e0b4ef908b","custom_id":"request-1","response":{"status_code":200,"request_id":"vllm-batch-3580bf4d4ae54d52b67eee266a6eab20","body":{"id":"embd-33ac2efa7996430184461f2e38529746","object":"list","created":444647,"model":"intfloat/e5-mistral-7b-instruct","data":[{"index":0,"object":"embedding","embedding":[0.016204833984375,0.0092010498046875,0.0018358230590820312,-0.0028228759765625,0.001422882080078125,-0.0031147003173828125,...]}],"usage":{"prompt_tokens":8,"total_tokens":8,"completion_tokens":0}}},"error":null} ... ``` + +## Example 5: Using score endpoint + +### Additional prerequisites + +* Ensure you are using `vllm >= 0.7.0`. + +### Step 1: Create your batch file + +Add score requests to your batch file. 
The following is an example: + +``` +{"custom_id": "request-1", "method": "POST", "url": "/v1/score", "body": {"model": "BAAI/bge-reranker-v2-m3", "text_1": "What is the capital of France?", "text_2": ["The capital of Brazil is Brasilia.", "The capital of France is Paris."]}} +{"custom_id": "request-2", "method": "POST", "url": "/v1/score", "body": {"model": "BAAI/bge-reranker-v2-m3", "text_1": "What is the capital of France?", "text_2": ["The capital of Brazil is Brasilia.", "The capital of France is Paris."]}} +``` + +You can mix chat completion, embedding, and score requests in the batch file, as long as the model you are using supports them all (note that all requests must use the same model). + +### Step 2: Run the batch + +You can run the batch using the same command as in earlier examples. + +### Step 3: Check your results + +You can check your results by running `cat results.jsonl` + +``` +$ cat results.jsonl +{"id":"vllm-f87c5c4539184f618e555744a2965987","custom_id":"request-1","response":{"status_code":200,"request_id":"vllm-batch-806ab64512e44071b37d3f7ccd291413","body":{"id":"score-4ee45236897b4d29907d49b01298cdb1","object":"list","created":1737847944,"model":"BAAI/bge-reranker-v2-m3","data":[{"index":0,"object":"score","score":0.0010900497436523438},{"index":1,"object":"score","score":1.0}],"usage":{"prompt_tokens":37,"total_tokens":37,"completion_tokens":0,"prompt_tokens_details":null}}},"error":null} +{"id":"vllm-41990c51a26d4fac8419077f12871099","custom_id":"request-2","response":{"status_code":200,"request_id":"vllm-batch-73ce66379026482699f81974e14e1e99","body":{"id":"score-13f2ffe6ba40460fbf9f7f00ad667d75","object":"list","created":1737847944,"model":"BAAI/bge-reranker-v2-m3","data":[{"index":0,"object":"score","score":0.001094818115234375},{"index":1,"object":"score","score":1.0}],"usage":{"prompt_tokens":37,"total_tokens":37,"completion_tokens":0,"prompt_tokens_details":null}}},"error":null} +``` diff --git a/examples/offline_inference/vision_language.py b/examples/offline_inference/vision_language.py index f9048c7735eb..415439e88ed5 100644 --- a/examples/offline_inference/vision_language.py +++ b/examples/offline_inference/vision_language.py @@ -28,9 +28,10 @@ def run_aria(question: str, modality: str): llm = LLM(model=model_name, max_model_len=4096, max_num_seqs=2, + dtype="bfloat16", disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache) - prompt = (f"<|im_start|>user\n<|img|>\n{question}" + prompt = (f"<|im_start|>user\n<|img|>{question}" "<|im_end|>\n<|im_start|>assistant\n") stop_token_ids = [93532, 93653, 944, 93421, 1019, 93653, 93519] diff --git a/examples/online_serving/cohere_rerank_client.py b/examples/online_serving/cohere_rerank_client.py new file mode 100644 index 000000000000..a07affe3351c --- /dev/null +++ b/examples/online_serving/cohere_rerank_client.py @@ -0,0 +1,32 @@ +""" +Example of using the OpenAI entrypoint's rerank API which is compatible with +the Cohere SDK: https://github.com/cohere-ai/cohere-python + +run: vllm serve BAAI/bge-reranker-base +""" +import cohere + +# cohere v1 client +co = cohere.Client(base_url="http://localhost:8000", api_key="sk-fake-key") +rerank_v1_result = co.rerank( + model="BAAI/bge-reranker-base", + query="What is the capital of France?", + documents=[ + "The capital of France is Paris", "Reranking is fun!", + "vLLM is an open-source framework for fast AI serving" + ]) + +print(rerank_v1_result) + +# or the v2 +co2 = cohere.ClientV2("sk-fake-key", base_url="http://localhost:8000") + 
+v2_rerank_result = co2.rerank( + model="BAAI/bge-reranker-base", + query="What is the capital of France?", + documents=[ + "The capital of France is Paris", "Reranking is fun!", + "vLLM is an open-source framework for fast AI serving" + ]) + +print(v2_rerank_result) diff --git a/examples/online_serving/jinaai_rerank_client.py b/examples/online_serving/jinaai_rerank_client.py new file mode 100644 index 000000000000..bf4de76ddf36 --- /dev/null +++ b/examples/online_serving/jinaai_rerank_client.py @@ -0,0 +1,33 @@ +""" +Example of using the OpenAI entrypoint's rerank API which is compatible with +Jina and Cohere https://jina.ai/reranker + +run: vllm serve BAAI/bge-reranker-base +""" +import json + +import requests + +url = "http://127.0.0.1:8000/rerank" + +headers = {"accept": "application/json", "Content-Type": "application/json"} + +data = { + "model": + "BAAI/bge-reranker-base", + "query": + "What is the capital of France?", + "documents": [ + "The capital of Brazil is Brasilia.", + "The capital of France is Paris.", "Horses and cows are both animals" + ] +} +response = requests.post(url, headers=headers, json=data) + +# Check the response +if response.status_code == 200: + print("Request successful!") + print(json.dumps(response.json(), indent=2)) +else: + print(f"Request failed with status code: {response.status_code}") + print(response.text) diff --git a/requirements-common.txt b/requirements-common.txt index 6c390bcfd18e..7051ca8cb50c 100644 --- a/requirements-common.txt +++ b/requirements-common.txt @@ -19,7 +19,7 @@ pillow # Required for image processing prometheus-fastapi-instrumentator >= 7.0.0 tiktoken >= 0.6.0 # Required for DBRX tokenizer lm-format-enforcer >= 0.10.9, < 0.11 -outlines == 0.1.11 # Requires pytorch +outlines == 0.1.11 lark == 1.2.2 xgrammar >= 0.1.6; platform_machine == "x86_64" typing_extensions >= 4.10 @@ -34,6 +34,6 @@ pyyaml six>=1.16.0; python_version > '3.11' # transitive dependency of pandas that needs to be the latest version for python 3.12 setuptools>=74.1.1; python_version > '3.11' # Setuptools is used by triton, we need to ensure a modern version is installed for 3.12+ so that it does not try to import distutils, which was removed in 3.12 einops # Required for Qwen2-VL. 
-compressed-tensors == 0.8.1 # required for compressed-tensors, requires pytorch +compressed-tensors == 0.9.0 # required for compressed-tensors depyf==0.18.0 # required for profiling and debugging with compilation config cloudpickle # allows pickling lambda functions in model_executor/models/registry.py diff --git a/requirements-hpu.txt b/requirements-hpu.txt index f4fb89ef4283..63a5f8b18f6b 100644 --- a/requirements-hpu.txt +++ b/requirements-hpu.txt @@ -3,7 +3,7 @@ # Dependencies for HPU code ray -triton +triton==3.1.0 pandas tabulate setuptools>=61 diff --git a/requirements-tpu.txt b/requirements-tpu.txt index 8ab18b3770ae..51a0c65eac5a 100644 --- a/requirements-tpu.txt +++ b/requirements-tpu.txt @@ -10,16 +10,17 @@ wheel jinja2 ray[default] -# Install torch_xla ---pre ---extra-index-url https://download.pytorch.org/whl/nightly/cpu +# Install torch, torch_xla --find-links https://storage.googleapis.com/libtpu-releases/index.html --find-links https://storage.googleapis.com/jax-releases/jax_nightly_releases.html --find-links https://storage.googleapis.com/jax-releases/jaxlib_nightly_releases.html -torch==2.6.0.dev20241126+cpu -torchvision==0.20.0.dev20241126+cpu -torch_xla[tpu] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.6.0.dev20241126-cp39-cp39-linux_x86_64.whl ; python_version == "3.9" -torch_xla[tpu] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.6.0.dev20241126-cp310-cp310-linux_x86_64.whl ; python_version == "3.10" -torch_xla[tpu] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.6.0.dev20241126-cp311-cp311-linux_x86_64.whl ; python_version == "3.11" -jaxlib==0.4.36.dev20241122 -jax==0.4.36.dev20241122 +# Note: This torch whl can be slightly different from the official torch nightly whl +# since they are not built on the same commit (but on the same day). This difference may cause C++ undefined symbol issue +# if some change between the 2 commits introduce some C++ API change. +# Here we install the exact torch whl from which torch_xla is built from, to avoid potential C++ undefined symbol issue. +torch @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch-2.7.0.dev20250124-cp39-cp39-linux_x86_64.whl ; python_version == "3.9" +torch @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch-2.7.0.dev20250124-cp310-cp310-linux_x86_64.whl ; python_version == "3.10" +torch @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch-2.7.0.dev20250124-cp311-cp311-linux_x86_64.whl ; python_version == "3.11" +torch_xla[pallas] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.7.0.dev20250124-cp39-cp39-linux_x86_64.whl ; python_version == "3.9" +torch_xla[pallas] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.7.0.dev20250124-cp310-cp310-linux_x86_64.whl ; python_version == "3.10" +torch_xla[pallas] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.7.0.dev20250124-cp311-cp311-linux_x86_64.whl ; python_version == "3.11" diff --git a/setup.py b/setup.py old mode 100644 new mode 100755 index 4047e34aff3a..16c9838802b0 --- a/setup.py +++ b/setup.py @@ -228,8 +228,11 @@ def target_name(s: str) -> str: # CMake appends the extension prefix to the install path, # and outdir already contains that prefix, so we need to remove it. 
+ # We assume only the final component of extension prefix is added by + # CMake, this is currently true for current extensions but may not + # always be the case. prefix = outdir - for i in range(ext.name.count('.')): + if '.' in ext.name: prefix = prefix.parent # prefix here should actually be the same for all components @@ -298,9 +301,11 @@ def run(self) -> None: files_to_copy = [ "vllm/_C.abi3.so", "vllm/_moe_C.abi3.so", - "vllm/vllm_flash_attn/vllm_flash_attn_c.abi3.so", + "vllm/vllm_flash_attn/_vllm_fa2_C.abi3.so", + "vllm/vllm_flash_attn/_vllm_fa3_C.abi3.so", "vllm/vllm_flash_attn/flash_attn_interface.py", "vllm/vllm_flash_attn/__init__.py", + "vllm/cumem_allocator.abi3.so", # "vllm/_version.py", # not available in nightly wheels yet ] file_members = filter(lambda x: x.filename in files_to_copy, @@ -549,7 +554,7 @@ def _read_requirements(filename: str) -> List[str]: return resolved_requirements if _no_device(): - requirements = _read_requirements("requirements-cuda.txt") + requirements = _read_requirements("requirements-cpu.txt") elif _is_cuda(): requirements = _read_requirements("requirements-cuda.txt") cuda_major, cuda_minor = torch.version.cuda.split(".") @@ -593,8 +598,12 @@ def _read_requirements(filename: str) -> List[str]: ext_modules.append(CMakeExtension(name="vllm._gradlib_C")) if _is_cuda(): - ext_modules.append( - CMakeExtension(name="vllm.vllm_flash_attn.vllm_flash_attn_c")) + ext_modules.append(CMakeExtension(name="vllm.vllm_flash_attn._vllm_fa2_C")) + if envs.VLLM_USE_PRECOMPILED or get_nvcc_cuda_version() >= Version("12.0"): + # FA3 requires CUDA 12.0 or later + ext_modules.append( + CMakeExtension(name="vllm.vllm_flash_attn._vllm_fa3_C")) + ext_modules.append(CMakeExtension(name="vllm.cumem_allocator")) if _build_custom_ops(): ext_modules.append(CMakeExtension(name="vllm._C")) diff --git a/tests/async_engine/test_api_server.py b/tests/async_engine/test_api_server.py index 83c71b5cf6eb..91ac35dd67bb 100644 --- a/tests/async_engine/test_api_server.py +++ b/tests/async_engine/test_api_server.py @@ -25,27 +25,32 @@ def _query_server_long(prompt: str) -> dict: @pytest.fixture -def api_server(tokenizer_pool_size: int, worker_use_ray: bool): +def api_server(tokenizer_pool_size: int, distributed_executor_backend: str): script_path = Path(__file__).parent.joinpath( "api_server_async_engine.py").absolute() commands = [ - sys.executable, "-u", - str(script_path), "--model", "facebook/opt-125m", "--host", - "127.0.0.1", "--tokenizer-pool-size", - str(tokenizer_pool_size) + sys.executable, + "-u", + str(script_path), + "--model", + "facebook/opt-125m", + "--host", + "127.0.0.1", + "--tokenizer-pool-size", + str(tokenizer_pool_size), + "--distributed-executor-backend", + distributed_executor_backend, ] - if worker_use_ray: - commands.append("--worker-use-ray") uvicorn_process = subprocess.Popen(commands) yield uvicorn_process.terminate() @pytest.mark.parametrize("tokenizer_pool_size", [0, 2]) -@pytest.mark.parametrize("worker_use_ray", [False, True]) +@pytest.mark.parametrize("distributed_executor_backend", ["mp", "ray"]) def test_api_server(api_server, tokenizer_pool_size: int, - worker_use_ray: bool): + distributed_executor_backend: str): """ Run the API server and test it. 
diff --git a/tests/basic_correctness/test_basic_correctness.py b/tests/basic_correctness/test_basic_correctness.py index 31a101e48e02..23285040642a 100644 --- a/tests/basic_correctness/test_basic_correctness.py +++ b/tests/basic_correctness/test_basic_correctness.py @@ -61,9 +61,10 @@ def test_models( if backend == "FLASHINFER" and current_platform.is_rocm(): pytest.skip("Flashinfer does not support ROCm/HIP.") - if backend == "XFORMERS" and model == "google/gemma-2-2b-it": + if backend in ("XFORMERS", + "FLASHINFER") and model == "google/gemma-2-2b-it": pytest.skip( - "XFORMERS does not support gemma2 with full context length.") + f"{backend} does not support gemma2 with full context length.") os.environ["VLLM_ATTENTION_BACKEND"] = backend diff --git a/tests/basic_correctness/test_cumem.py b/tests/basic_correctness/test_cumem.py new file mode 100644 index 000000000000..53f4ef08f36a --- /dev/null +++ b/tests/basic_correctness/test_cumem.py @@ -0,0 +1,112 @@ +import torch + +from vllm import LLM, SamplingParams +from vllm.device_allocator.cumem import CuMemAllocator +from vllm.utils import GiB_bytes + +from ..utils import fork_new_process_for_each_test + + +@fork_new_process_for_each_test +def test_basic_cumem(): + # some tensors from default memory pool + shape = (1024, 1024) + x = torch.empty(shape, device='cuda') + x.zero_() + + # some tensors from custom memory pool + allocator = CuMemAllocator.get_instance() + with allocator.use_memory_pool(): + # custom memory pool + y = torch.empty(shape, device='cuda') + y.zero_() + y += 1 + z = torch.empty(shape, device='cuda') + z.zero_() + z += 2 + + # they can be used together + output = x + y + z + assert torch.allclose(output, torch.ones_like(output) * 3) + + free_bytes = torch.cuda.mem_get_info()[0] + allocator.sleep() + free_bytes_after_sleep = torch.cuda.mem_get_info()[0] + assert free_bytes_after_sleep > free_bytes + allocator.wake_up() + + # they can be used together + output = x + y + z + assert torch.allclose(output, torch.ones_like(output) * 3) + + +@fork_new_process_for_each_test +def test_cumem_with_cudagraph(): + allocator = CuMemAllocator.get_instance() + with allocator.use_memory_pool(): + weight = torch.eye(1024, device='cuda') + with allocator.use_memory_pool(tag="discard"): + cache = torch.empty(1024, 1024, device='cuda') + + def model(x): + out = x @ weight + cache[:out.size(0)].copy_(out) + return out + 1 + + x = torch.empty(128, 1024, device='cuda') + + # warmup + model(x) + + # capture cudagraph + model_graph = torch.cuda.CUDAGraph() + with torch.cuda.graph(model_graph): + y = model(x) + + free_bytes = torch.cuda.mem_get_info()[0] + allocator.sleep() + free_bytes_after_sleep = torch.cuda.mem_get_info()[0] + assert free_bytes_after_sleep > free_bytes + allocator.wake_up() + + # after waking up, the content in the weight tensor + # should be restored, but the content in the cache tensor + # should be discarded + + # this operation is also compatible with cudagraph + + x.random_() + model_graph.replay() + + # cache content is as expected + assert torch.allclose(x, cache[:x.size(0)]) + + # output content is as expected + assert torch.allclose(y, x + 1) + + +@fork_new_process_for_each_test +def test_end_to_end(): + free, total = torch.cuda.mem_get_info() + used_bytes_baseline = total - free # in case other process is running + llm = LLM("meta-llama/Llama-3.2-1B", enable_sleep_mode=True) + prompt = "How are you?" 
+ sampling_params = SamplingParams(temperature=0, max_tokens=10) + output = llm.generate(prompt, sampling_params) + + # the benefit of `llm.sleep(level=2)` is mainly CPU memory usage, + # which is difficult to measure in the test. therefore, we only + # test sleep level 1 here. + llm.sleep(level=1) + + free_gpu_bytes_after_sleep, total = torch.cuda.mem_get_info() + used_bytes = total - free_gpu_bytes_after_sleep - used_bytes_baseline + # now the memory usage is mostly cudagraph memory pool, + # and it should be less than the model weights (1B model, 2GiB weights) + assert used_bytes < 2 * GiB_bytes + + llm.wake_up() + output2 = llm.generate(prompt, sampling_params) + + # cmp output + assert output[0].outputs[0].text == output2[0].outputs[0].text diff --git a/tests/basic_correctness/test_preemption.py b/tests/basic_correctness/test_preemption.py index 4e502cfb5f4f..4b27dcbc8609 100644 --- a/tests/basic_correctness/test_preemption.py +++ b/tests/basic_correctness/test_preemption.py @@ -29,10 +29,10 @@ def check_settings(): @pytest.fixture -def worker_use_ray() -> bool: - # When SPMD worker is used, use ray_use_worker=True +def distributed_executor_backend() -> str: + # When SPMD worker is used, use distributed_executor_backend="ray" # to test delta input optimization works with preemption. - return envs.VLLM_USE_RAY_SPMD_WORKER + return "ray" if envs.VLLM_USE_RAY_SPMD_WORKER else "mp" @pytest.mark.parametrize("model", MODELS) @@ -47,7 +47,7 @@ def test_chunked_prefill_recompute( dtype: str, max_tokens: int, chunked_prefill_token_size: int, - worker_use_ray: bool, + distributed_executor_backend: str, ) -> None: """Ensure that chunked prefill works with preemption.""" max_num_seqs = min(chunked_prefill_token_size, 256) @@ -66,7 +66,7 @@ def test_chunked_prefill_recompute( max_num_batched_tokens=max_num_batched_tokens, enable_chunked_prefill=enable_chunked_prefill, max_num_seqs=max_num_seqs, - worker_use_ray=worker_use_ray, + distributed_executor_backend=distributed_executor_backend, disable_log_stats=False, ) as vllm_model: vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens) @@ -93,7 +93,7 @@ def test_preemption( model: str, dtype: str, max_tokens: int, - worker_use_ray: bool, + distributed_executor_backend: str, ) -> None: """By default, recompute preemption is enabled""" @@ -104,7 +104,7 @@ def test_preemption( model, dtype=dtype, disable_log_stats=False, - worker_use_ray=worker_use_ray, + distributed_executor_backend=distributed_executor_backend, ) as vllm_model: vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens) assert (vllm_model.model.llm_engine.scheduler[0].artificial_preempt_cnt @@ -144,7 +144,7 @@ def test_preemption_infeasible( model: str, dtype: str, max_tokens: int, - worker_use_ray: bool, + distributed_executor_backend: str, ) -> None: """Verify infeasible preemption request will be ignored.""" BLOCK_SIZE = 16 @@ -159,7 +159,7 @@ def test_preemption_infeasible( # ignored instead of hanging forever. 
num_gpu_blocks_override=prefill_blocks + decode_blocks // 2, max_model_len=((prefill_blocks + decode_blocks // 2) * BLOCK_SIZE), - worker_use_ray=worker_use_ray, + distributed_executor_backend=distributed_executor_backend, ) as vllm_model: sampling_params = SamplingParams(max_tokens=max_tokens, ignore_eos=True) diff --git a/tests/compile/test_basic_correctness.py b/tests/compile/test_basic_correctness.py index 87d5aefea6cb..1945479fc303 100644 --- a/tests/compile/test_basic_correctness.py +++ b/tests/compile/test_basic_correctness.py @@ -58,7 +58,7 @@ class TestSetting: model_args=["--task", "embed"], pp_size=1, tp_size=1, - attn_backend="FLASHINFER", + attn_backend="FLASH_ATTN", method="encode", fullgraph=True, ), diff --git a/tests/core/block/test_prefix_caching_block.py b/tests/core/block/test_prefix_caching_block.py index 29ac3a3c86cb..6642174c17d8 100644 --- a/tests/core/block/test_prefix_caching_block.py +++ b/tests/core/block/test_prefix_caching_block.py @@ -796,6 +796,44 @@ def test_find_cached_blocks_prefix(): block_hashes=block_hashes_seq1) assert len(cached_blocks) == len(blocks_seq1) - num_evicted_blocks + # Test reset prefix cache + @staticmethod + @pytest.mark.parametrize("num_blocks", [10]) + @pytest.mark.parametrize("block_size", [16]) + def test_reset_prefix_cache(num_blocks: int, block_size: int): + """This test case simulates the case of resetting the prefix cache.""" + + allocator = PrefixCachingBlockAllocator(num_blocks=num_blocks, + block_size=block_size) + token_ids = list(range(3 * block_size)) + + first_chain = TestPrefixCachingBlockAllocator.create_immutable_chain( + block_size=block_size, + token_ids=token_ids, + allocator=allocator, + ) + second_chain = TestPrefixCachingBlockAllocator.create_immutable_chain( + block_size=block_size, + token_ids=token_ids, + allocator=allocator, + ) + + # Free each block in the first chain. + for block in first_chain: + allocator.free(block) + + # Failed to reset prefix cache because some blocks are not freed yet. + assert not allocator.reset_prefix_cache() + assert allocator.get_prefix_cache_hit_rate() > 0.0 + + # Free each block in the second chain. + for block in second_chain: + allocator.free(block) + + # Reset prefix cache. + assert allocator.reset_prefix_cache() + assert allocator.get_prefix_cache_hit_rate() == 0.0 + @staticmethod def create_immutable_chain( block_size: int, diff --git a/tests/entrypoints/openai/test_metrics.py b/tests/entrypoints/openai/test_metrics.py index 6523c8b6297c..469a5fb039fb 100644 --- a/tests/entrypoints/openai/test_metrics.py +++ b/tests/entrypoints/openai/test_metrics.py @@ -16,6 +16,24 @@ MODEL_NAME = "TinyLlama/TinyLlama-1.1B-Chat-v1.0" +@pytest.fixture(scope="module", params=[True, False]) +def use_v1(request): + # Module-scoped variant of run_with_both_engines + # + # Use this fixture to run a test with both v0 and v1, and + # also to conditionalize the test logic e.g. + # + # def test_metrics_exist(use_v1, server, client): + # ... 
+ # expected = EXPECTED_V1_METRICS if use_v1 else EXPECTED_METRICS + # for metric in expected: + # assert metric in response.text + # + # @skip_v1 wouldn't work here because this is a module-level + # fixture - per-function decorators would have no effect + yield request.param + + @pytest.fixture(scope="module") def default_server_args(): return [ @@ -36,10 +54,12 @@ def default_server_args(): "--enable-chunked-prefill", "--disable-frontend-multiprocessing", ]) -def server(default_server_args, request): +def server(use_v1, default_server_args, request): if request.param: default_server_args.append(request.param) - with RemoteOpenAIServer(MODEL_NAME, default_server_args) as remote_server: + env_dict = dict(VLLM_USE_V1='1' if use_v1 else '0') + with RemoteOpenAIServer(MODEL_NAME, default_server_args, + env_dict=env_dict) as remote_server: yield remote_server @@ -84,7 +104,9 @@ async def client(server): @pytest.mark.asyncio async def test_metrics_counts(server: RemoteOpenAIServer, - client: openai.AsyncClient): + client: openai.AsyncClient, use_v1: bool): + if use_v1: + pytest.skip("Skipping test on vllm V1") for _ in range(_NUM_REQUESTS): # sending a request triggers the metrics to be logged. await client.completions.create( @@ -174,10 +196,15 @@ async def test_metrics_counts(server: RemoteOpenAIServer, "swap_space_bytes", ] +EXPECTED_METRICS_V1 = [ + "vllm:num_requests_running", + "vllm:num_requests_waiting", +] + @pytest.mark.asyncio async def test_metrics_exist(server: RemoteOpenAIServer, - client: openai.AsyncClient): + client: openai.AsyncClient, use_v1: bool): # sending a request triggers the metrics to be logged. await client.completions.create(model=MODEL_NAME, prompt="Hello, my name is", @@ -187,11 +214,13 @@ async def test_metrics_exist(server: RemoteOpenAIServer, response = requests.get(server.url_for("metrics")) assert response.status_code == HTTPStatus.OK - for metric in EXPECTED_METRICS: + for metric in (EXPECTED_METRICS_V1 if use_v1 else EXPECTED_METRICS): assert metric in response.text -def test_metrics_exist_run_batch(): +def test_metrics_exist_run_batch(use_v1: bool): + if use_v1: + pytest.skip("Skipping test on vllm V1") input_batch = """{"custom_id": "request-0", "method": "POST", "url": "/v1/embeddings", "body": {"model": "intfloat/e5-mistral-7b-instruct", "input": "You are a helpful assistant."}}""" # noqa: E501 base_url = "0.0.0.0" diff --git a/tests/entrypoints/openai/test_rerank.py b/tests/entrypoints/openai/test_rerank.py new file mode 100644 index 000000000000..cfd8f3313396 --- /dev/null +++ b/tests/entrypoints/openai/test_rerank.py @@ -0,0 +1,87 @@ +import pytest +import requests + +from vllm.entrypoints.openai.protocol import RerankResponse + +from ...utils import RemoteOpenAIServer + +MODEL_NAME = "BAAI/bge-reranker-base" + + +@pytest.fixture(scope="module") +def server(): + args = ["--enforce-eager", "--max-model-len", "100"] + + with RemoteOpenAIServer(MODEL_NAME, args) as remote_server: + yield remote_server + + +@pytest.mark.asyncio +@pytest.mark.parametrize("model_name", [MODEL_NAME]) +def test_rerank_texts(server: RemoteOpenAIServer, model_name: str): + query = "What is the capital of France?" + documents = [ + "The capital of Brazil is Brasilia.", "The capital of France is Paris." 
+ ] + + rerank_response = requests.post(server.url_for("rerank"), + json={ + "model": model_name, + "query": query, + "documents": documents, + }) + rerank_response.raise_for_status() + rerank = RerankResponse.model_validate(rerank_response.json()) + + assert rerank.id is not None + assert rerank.results is not None + assert len(rerank.results) == 2 + assert rerank.results[0].relevance_score >= 0.9 + assert rerank.results[1].relevance_score <= 0.01 + + +@pytest.mark.asyncio +@pytest.mark.parametrize("model_name", [MODEL_NAME]) +def test_top_n(server: RemoteOpenAIServer, model_name: str): + query = "What is the capital of France?" + documents = [ + "The capital of Brazil is Brasilia.", + "The capital of France is Paris.", "Cross-encoder models are neat" + ] + + rerank_response = requests.post(server.url_for("rerank"), + json={ + "model": model_name, + "query": query, + "documents": documents, + "top_n": 2 + }) + rerank_response.raise_for_status() + rerank = RerankResponse.model_validate(rerank_response.json()) + + assert rerank.id is not None + assert rerank.results is not None + assert len(rerank.results) == 2 + assert rerank.results[0].relevance_score >= 0.9 + assert rerank.results[1].relevance_score <= 0.01 + + +@pytest.mark.asyncio +@pytest.mark.parametrize("model_name", [MODEL_NAME]) +def test_rerank_max_model_len(server: RemoteOpenAIServer, model_name: str): + + query = "What is the capital of France?" * 100 + documents = [ + "The capital of Brazil is Brasilia.", "The capital of France is Paris." + ] + + rerank_response = requests.post(server.url_for("rerank"), + json={ + "model": model_name, + "query": query, + "documents": documents + }) + assert rerank_response.status_code == 400 + # Assert just a small fragments of the response + assert "Please reduce the length of the input." in \ + rerank_response.text \ No newline at end of file diff --git a/tests/entrypoints/openai/test_run_batch.py b/tests/entrypoints/openai/test_run_batch.py index 097d6b1a3234..1f8a56bb43ac 100644 --- a/tests/entrypoints/openai/test_run_batch.py +++ b/tests/entrypoints/openai/test_run_batch.py @@ -1,3 +1,4 @@ +import json import subprocess import sys import tempfile @@ -21,6 +22,9 @@ {"custom_id": "request-3", "method": "POST", "url": "/v1/embeddings", "body": {"model": "intfloat/e5-mistral-7b-instruct", "input": "Hello world!"}} {"custom_id": "request-4", "method": "POST", "url": "/v1/embeddings", "body": {"model": "NonExistModel", "input": "Hello world!"}}""" +INPUT_SCORE_BATCH = """{"custom_id": "request-1", "method": "POST", "url": "/v1/score", "body": {"model": "BAAI/bge-reranker-v2-m3", "text_1": "What is the capital of France?", "text_2": ["The capital of Brazil is Brasilia.", "The capital of France is Paris."]}} +{"custom_id": "request-2", "method": "POST", "url": "/v1/score", "body": {"model": "BAAI/bge-reranker-v2-m3", "text_1": "What is the capital of France?", "text_2": ["The capital of Brazil is Brasilia.", "The capital of France is Paris."]}}""" + def test_empty_file(): with tempfile.NamedTemporaryFile( @@ -102,3 +106,36 @@ def test_embeddings(): # Ensure that the output format conforms to the openai api. # Validation should throw if the schema is wrong. 
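For readers who have not used the new route, a minimal standalone client call against the rerank endpoint exercised above could look like the sketch below. The host and port are placeholders, and the shape of each result beyond relevance_score is an assumption rather than something taken from this patch.

import requests

BASE_URL = "http://localhost:8000"  # hypothetical server address

def rerank(query: str, documents: list) -> list:
    resp = requests.post(f"{BASE_URL}/rerank",
                         json={
                             "model": "BAAI/bge-reranker-base",
                             "query": query,
                             "documents": documents,
                             "top_n": 1,  # optional: keep only the best match
                         })
    resp.raise_for_status()
    # Each entry carries a relevance_score, as asserted in the tests above.
    return resp.json()["results"]

if __name__ == "__main__":
    results = rerank("What is the capital of France?",
                     ["The capital of Brazil is Brasilia.",
                      "The capital of France is Paris."])
    print([r["relevance_score"] for r in results])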
BatchRequestOutput.model_validate_json(line) + + +def test_score(): + with tempfile.NamedTemporaryFile( + "w") as input_file, tempfile.NamedTemporaryFile( + "r") as output_file: + input_file.write(INPUT_SCORE_BATCH) + input_file.flush() + proc = subprocess.Popen([ + sys.executable, + "-m", + "vllm.entrypoints.openai.run_batch", + "-i", + input_file.name, + "-o", + output_file.name, + "--model", + "BAAI/bge-reranker-v2-m3", + ], ) + proc.communicate() + proc.wait() + assert proc.returncode == 0, f"{proc=}" + + contents = output_file.read() + for line in contents.strip().split("\n"): + # Ensure that the output format conforms to the openai api. + # Validation should throw if the schema is wrong. + BatchRequestOutput.model_validate_json(line) + + # Ensure that there is no error in the response. + line_dict = json.loads(line) + assert isinstance(line_dict, dict) + assert line_dict["error"] is None diff --git a/tests/entrypoints/openai/test_score.py b/tests/entrypoints/openai/test_score.py index 06e0f93dbe26..0d19615bc0d9 100644 --- a/tests/entrypoints/openai/test_score.py +++ b/tests/entrypoints/openai/test_score.py @@ -10,12 +10,7 @@ @pytest.fixture(scope="module") def server(): - args = [ - "--enforce-eager", - # Will be used on tests to compare prompt input length - "--max-model-len", - "100" - ] + args = ["--enforce-eager", "--max-model-len", "100"] with RemoteOpenAIServer(MODEL_NAME, args) as remote_server: yield remote_server diff --git a/tests/entrypoints/openai/test_serving_chat.py b/tests/entrypoints/openai/test_serving_chat.py index 85f485364a41..e88d6c3c6782 100644 --- a/tests/entrypoints/openai/test_serving_chat.py +++ b/tests/entrypoints/openai/test_serving_chat.py @@ -103,6 +103,116 @@ def test_serving_chat_should_set_correct_max_tokens(): assert mock_engine.generate.call_args.args[1].max_tokens == 10 + # Setting server's max_tokens in the generation_config.json + # lower than context_window - prompt_tokens + mock_model_config = MockModelConfig() + mock_model_config.diff_sampling_param = { + "max_tokens": 10 # Setting server-side max_tokens limit + } + + # Reinitialize the engine with new settings + mock_engine = MagicMock(spec=MQLLMEngineClient) + mock_engine.get_tokenizer.return_value = get_tokenizer(MODEL_NAME) + mock_engine.errored = False + + # Initialize the serving chat + models = OpenAIServingModels(engine_client=mock_engine, + base_model_paths=BASE_MODEL_PATHS, + model_config=mock_model_config) + serving_chat = OpenAIServingChat(mock_engine, + mock_model_config, + models, + response_role="assistant", + chat_template=CHAT_TEMPLATE, + chat_template_content_format="auto", + request_logger=None) + + # Test Case 1: No max_tokens specified in request + req = ChatCompletionRequest( + model=MODEL_NAME, + messages=[{ + "role": "user", + "content": "what is 1+1?" 
+ }], + guided_decoding_backend="outlines", + ) + + with suppress(Exception): + asyncio.run(serving_chat.create_chat_completion(req)) + + assert mock_engine.generate.call_args.args[1].max_tokens == 10 + + # Test Case 2: Request's max_tokens set higher than server accepts + req.max_tokens = 15 + + with suppress(Exception): + asyncio.run(serving_chat.create_chat_completion(req)) + + assert mock_engine.generate.call_args.args[1].max_tokens == 10 + + # Test Case 3: Request's max_tokens set lower than server accepts + req.max_tokens = 5 + + with suppress(Exception): + asyncio.run(serving_chat.create_chat_completion(req)) + + assert mock_engine.generate.call_args.args[1].max_tokens == 5 + + # Setting server's max_tokens in the generation_config.json + # higher than context_window - prompt_tokens + mock_model_config = MockModelConfig() + mock_model_config.diff_sampling_param = { + "max_tokens": 200 # Setting server-side max_tokens limit + } + + # Reinitialize the engine with new settings + mock_engine = MagicMock(spec=MQLLMEngineClient) + mock_engine.get_tokenizer.return_value = get_tokenizer(MODEL_NAME) + mock_engine.errored = False + + # Initialize the serving chat + models = OpenAIServingModels(engine_client=mock_engine, + base_model_paths=BASE_MODEL_PATHS, + model_config=mock_model_config) + serving_chat = OpenAIServingChat(mock_engine, + mock_model_config, + models, + response_role="assistant", + chat_template=CHAT_TEMPLATE, + chat_template_content_format="auto", + request_logger=None) + + # Test case 1: No max_tokens specified, defaults to context_window + req = ChatCompletionRequest( + model=MODEL_NAME, + messages=[{ + "role": "user", + "content": "what is 1+1?" + }], + guided_decoding_backend="outlines", + ) + + with suppress(Exception): + asyncio.run(serving_chat.create_chat_completion(req)) + + assert mock_engine.generate.call_args.args[1].max_tokens == 93 + + # Test Case 2: Request's max_tokens set higher than server accepts + req.max_tokens = 100 + + with suppress(Exception): + asyncio.run(serving_chat.create_chat_completion(req)) + + assert mock_engine.generate.call_args.args[1].max_tokens == 93 + + # Test Case 3: Request's max_tokens set lower than server accepts + req.max_tokens = 5 + + with suppress(Exception): + asyncio.run(serving_chat.create_chat_completion(req)) + + assert mock_engine.generate.call_args.args[1].max_tokens == 5 + def test_serving_chat_could_load_correct_generation_config(): diff --git a/tests/kernels/test_attention.py b/tests/kernels/test_attention.py index 014d509c532a..effb59fbaf22 100644 --- a/tests/kernels/test_attention.py +++ b/tests/kernels/test_attention.py @@ -182,7 +182,7 @@ def test_paged_attention( key_cache, value_cache = key_caches[0], value_caches[0] # Using default kv_scale - k_scale = v_scale = 1.0 + k_scale = v_scale = torch.tensor(1.0, dtype=torch.float32, device=device) # Call the paged attention kernel. output = torch.empty_like(query) diff --git a/tests/kernels/test_blocksparse_attention.py b/tests/kernels/test_blocksparse_attention.py index 1667e53c0749..2c08e928f9c2 100644 --- a/tests/kernels/test_blocksparse_attention.py +++ b/tests/kernels/test_blocksparse_attention.py @@ -210,7 +210,7 @@ def test_paged_attention( key_cache, value_cache = key_caches[0], value_caches[0] # Using default kv_scale - k_scale = v_scale = 1.0 + k_scale = v_scale = torch.tensor(1.0, dtype=torch.float32, device=device) tp_rank = 0 # Call the paged attention kernel. 
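The serving-chat max_tokens assertions above all follow a single clamping rule. The helper below is an inference from those test values rather than code from vLLM, and it assumes a 100-token context window with a 7-token prompt, which is what would make 93 the remaining budget.

from typing import Optional

def effective_max_tokens(request_max_tokens: Optional[int],
                         server_default_max_tokens: Optional[int],
                         context_window: int,
                         prompt_tokens: int) -> int:
    # The effective limit is the smallest of: the request value, the
    # server-side default from generation_config.json, and the remaining
    # context budget after the prompt.
    budget = context_window - prompt_tokens
    candidates = [c for c in (request_max_tokens,
                              server_default_max_tokens,
                              budget) if c is not None]
    return min(candidates)

# Matches the assertions above under the assumed 100-token context window
# and 7-token prompt (a 93-token budget):
assert effective_max_tokens(None, 10, 100, 7) == 10
assert effective_max_tokens(15, 10, 100, 7) == 10
assert effective_max_tokens(5, 10, 100, 7) == 5
assert effective_max_tokens(None, 200, 100, 7) == 93
assert effective_max_tokens(100, 200, 100, 7) == 93
assert effective_max_tokens(5, 200, 100, 7) == 5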
diff --git a/tests/kernels/test_cache.py b/tests/kernels/test_cache.py index 40550ed51e2c..c848be4f9d80 100644 --- a/tests/kernels/test_cache.py +++ b/tests/kernels/test_cache.py @@ -160,7 +160,7 @@ def test_reshape_and_cache( cloned_value_cache = value_cache.clone() # Using default kv_scale - k_scale = v_scale = 1.0 + k_scale = v_scale = torch.tensor(1.0, dtype=torch.float32, device=device) # Call the reshape_and_cache kernel. opcheck(torch.ops._C_cache_ops.reshape_and_cache, @@ -258,8 +258,8 @@ def test_reshape_and_cache_flash( del key_caches del value_caches - k_scale = key.amax().item() / 256 - v_scale = value.amax().item() / 256 + k_scale = (key.amax() / 256.0).to(torch.float32) + v_scale = (value.amax() / 256.0).to(torch.float32) # Clone the KV caches. if kv_cache_dtype == "fp8": @@ -284,12 +284,12 @@ def test_reshape_and_cache_flash( result_key_cache = torch.empty_like(key_cache, dtype=torch.float16) ops.convert_fp8(result_key_cache, key_cache, - k_scale, + k_scale.item(), kv_dtype=kv_cache_dtype) result_value_cache = torch.empty_like(value_cache, dtype=torch.float16) ops.convert_fp8(result_value_cache, value_cache, - v_scale, + v_scale.item(), kv_dtype=kv_cache_dtype) # Run the reference implementation. diff --git a/tests/kernels/test_cascade_flash_attn.py b/tests/kernels/test_cascade_flash_attn.py old mode 100644 new mode 100755 index 45ec6df4e711..8edfde42ede7 --- a/tests/kernels/test_cascade_flash_attn.py +++ b/tests/kernels/test_cascade_flash_attn.py @@ -6,7 +6,9 @@ from vllm.platforms import current_platform from vllm.v1.attention.backends.flash_attn import (cascade_attention, merge_attn_states) -from vllm.vllm_flash_attn import flash_attn_varlen_func +from vllm.vllm_flash_attn import (fa_version_unsupported_reason, + flash_attn_varlen_func, + is_fa_version_supported) NUM_HEADS = [(4, 4), (8, 2), (16, 2)] HEAD_SIZES = [128, 192, 256] @@ -78,6 +80,7 @@ def test_merge_kernel( @pytest.mark.parametrize("block_size", BLOCK_SIZES) @pytest.mark.parametrize("soft_cap", [None, 50]) @pytest.mark.parametrize("num_blocks", [2048]) +@pytest.mark.parametrize("fa_version", [2, 3]) @torch.inference_mode() def test_cascade( seq_lens_and_common_prefix: Tuple[List[Tuple[int, int]], int], @@ -87,8 +90,13 @@ def test_cascade( block_size: int, soft_cap: Optional[float], num_blocks: int, + fa_version: int, ) -> None: torch.set_default_device("cuda") + if not is_fa_version_supported(fa_version): + pytest.skip(f"Flash attention version {fa_version} not supported due " + f"to: \"{fa_version_unsupported_reason(fa_version)}\"") + current_platform.seed_everything(0) window_size = (-1, -1) @@ -118,9 +126,7 @@ def test_cascade( cu_query_lens = torch.tensor([0] + query_lens, dtype=torch.int32).cumsum(dim=0, dtype=torch.int32) - cu_kv_lens = torch.tensor([0] + kv_lens, - dtype=torch.int32).cumsum(dim=0, - dtype=torch.int32) + kv_lens_tensor = torch.tensor(kv_lens, dtype=torch.int32) max_num_blocks_per_seq = (max_kv_len + block_size - 1) // block_size block_tables = torch.randint(0, num_blocks, @@ -140,7 +146,7 @@ def test_cascade( k=key_cache, v=value_cache, cu_seqlens_q=cu_query_lens, - cu_seqlens_k=cu_kv_lens, + seqused_k=kv_lens_tensor, max_seqlen_q=max_query_len, max_seqlen_k=max_kv_len, softmax_scale=scale, @@ -154,10 +160,8 @@ def test_cascade( assert all(common_prefix_len < kv_len for kv_len in kv_lens) cu_prefix_query_lens = torch.tensor([0, total_num_query_tokens], dtype=torch.int32) - cu_prefix_kv_lens = torch.tensor([0, common_prefix_len], dtype=torch.int32) - cu_suffix_kv_lens = ( - cu_kv_lens - - 
torch.arange(num_seqs + 1, dtype=torch.int32) * common_prefix_len) + prefix_kv_lens = torch.tensor([common_prefix_len], dtype=torch.int32) + suffix_kv_lens = kv_lens_tensor - common_prefix_len output = torch.empty_like(query) cascade_attention( output=output, @@ -167,8 +171,8 @@ def test_cascade( cu_query_lens=cu_query_lens, max_query_len=max_query_len, cu_prefix_query_lens=cu_prefix_query_lens, - cu_prefix_kv_lens=cu_prefix_kv_lens, - cu_suffix_kv_lens=cu_suffix_kv_lens, + prefix_kv_lens=prefix_kv_lens, + suffix_kv_lens=suffix_kv_lens, max_kv_len=max_kv_len, softmax_scale=scale, alibi_slopes=None, @@ -176,6 +180,7 @@ def test_cascade( logits_soft_cap=soft_cap if soft_cap is not None else 0, block_table=block_tables, common_prefix_len=common_prefix_len, + fa_version=fa_version, ) # Compare the results. diff --git a/tests/kernels/test_cutlass.py b/tests/kernels/test_cutlass.py index afe53797322f..c3eddacec272 100644 --- a/tests/kernels/test_cutlass.py +++ b/tests/kernels/test_cutlass.py @@ -2,7 +2,7 @@ Run `pytest tests/kernels/test_cutlass.py`. """ -from typing import Optional, Type +from typing import Type import pytest import torch @@ -11,6 +11,8 @@ from vllm import _custom_ops as ops from vllm.platforms import current_platform +from .utils import baseline_scaled_mm, to_fp8, to_int8 + MNK_FACTORS = [ (1, 256, 128), (1, 16384, 1024), @@ -41,34 +43,10 @@ capability = capability[0] * 10 + capability[1] -def to_fp8(tensor: torch.Tensor): - finfo = torch.finfo(torch.float8_e4m3fn) - return torch.round(tensor.clamp( - min=finfo.min, max=finfo.max)).to(dtype=torch.float8_e4m3fn) - - -def to_int8(tensor: torch.Tensor): - return torch.round(tensor.clamp(min=-128, max=127)).to(dtype=torch.int8) - - def rand_int8(shape: tuple, device: str = "cuda"): return to_int8(torch.rand(shape, device=device) * 255 - 128) -def baseline_scaled_mm(a: torch.Tensor, - b: torch.Tensor, - scale_a: torch.Tensor, - scale_b: torch.Tensor, - out_dtype: Type[torch.dtype], - bias: Optional[torch.Tensor] = None) -> torch.Tensor: - output = (scale_a * (scale_b * (torch.mm( - a.to(dtype=torch.float32), b.to(dtype=torch.float32))))).to(out_dtype) - if bias is not None: - output = output + bias - - return output - - def cutlass_fp8_gemm_helper(m: int, n: int, k: int, diff --git a/tests/kernels/test_cutlass_2of4_sparse.py b/tests/kernels/test_cutlass_2of4_sparse.py new file mode 100644 index 000000000000..56495df34aa6 --- /dev/null +++ b/tests/kernels/test_cutlass_2of4_sparse.py @@ -0,0 +1,214 @@ +"""Tests for sparse cutlass kernels + +Run `pytest tests/kernels/test_cutlass_2of4_sparse.py`.
+""" +from typing import Tuple, Type + +import pytest +import torch +import torch.nn.functional as F + +from vllm import _custom_ops as ops +from vllm.model_executor.layers.quantization.utils.w8a8_utils import ( + sparse_cutlass_supported) +from vllm.platforms import current_platform + +from .utils import baseline_scaled_mm, to_fp8, to_int8 + +CUDA_DEVICES = [ + f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2) +] + +capability = current_platform.get_device_capability() +capability = capability[0] * 10 + capability[1] + + +def to_bf16(tensor: torch.Tensor) -> torch.Tensor: + return tensor.to(dtype=torch.bfloat16) + + +def to_fp16(tensor: torch.Tensor) -> torch.Tensor: + return tensor.to(dtype=torch.float16) + + +def prune_to_2_4(tensor): + # Reshape tensor to [N, 4] where N is number of groups of 4 + original_shape = tensor.shape + reshaped = tensor.reshape(-1, 4) + + # Get indices of top 2 absolute values in each group of 4 + _, indices = torch.topk(torch.abs(reshaped), k=2, dim=1) + + # Create binary mask + mask = torch.zeros_like(reshaped) + mask.scatter_(dim=1, + index=indices, + src=torch.ones_like(indices, dtype=mask.dtype)) + + # Apply mask and reshape back + pruned = reshaped * mask + + # Turn all -0.0 to 0.0 + pruned[pruned == -0.0] = 0.0 + + return pruned.reshape(original_shape) + + +def make_rand_sparse_tensors( + dtype: torch.dtype, m: int, n: int, k: int +) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: + a = torch.randn((m, k), device='cuda') * 5 + b = torch.randn((n, k), device='cuda').t() * 5 + + b = prune_to_2_4(b.t()).t() + + if dtype == torch.int8: + a, b = to_int8(a), to_int8(b) + elif dtype == torch.float8_e4m3fn: + a, b = to_fp8(a), to_fp8(b) + elif dtype == torch.float16: + a, b = to_fp16(a), to_fp16(b) + elif dtype == torch.bfloat16: + a, b = to_bf16(a), to_bf16(b) + else: + raise ValueError("unsupported dtype") + + b_compressed, e = ops.cutlass_sparse_compress(b.t()) + + # Compressed B, Metadata, Original A, B + return b_compressed, e, a, b + + +@pytest.mark.skipif(not sparse_cutlass_supported(), + reason="Sparse CUTLASS is not supported on this GPU type.") +# Test working with a subset of A and B for sparse matmul +def test_cutlass_sparse_subset(): + + big_m = 1024 + m, n, k = 512, 512, 512 + + # Create tensors + b_comp, e, whole_a, b = make_rand_sparse_tensors(torch.float8_e4m3fn, + big_m, n, k) + a = whole_a[0:m, 0:k] + scale_a = torch.randn((1, 1), device="cuda", dtype=torch.float32) / 10 + scale_b = torch.randn((1, 1), device="cuda", dtype=torch.float32) / 10 + + out = ops.cutlass_scaled_sparse_mm(a, + b_comp, + e, + scale_a, + scale_b, + out_dtype=torch.bfloat16) + baseline = baseline_scaled_mm(a, + b, + scale_a, + scale_b, + out_dtype=torch.bfloat16) + + torch.testing.assert_close(out, baseline, rtol=1e-1, atol=1e0) + + +MNK_FACTORS = [ + (1, 256, 128), + (1, 16384, 1024), + (1, 24576, 512), + (16, 256, 512), + (16, 16384, 128), + (16, 24576, 4096), + (32, 8192, 4096), + (32, 16384, 4096), + (33, 1024, 1024), + (33, 8192, 128), + (64, 2048, 512), + (64, 16384, 1024), + (100, 8192, 512), + (128, 32768, 4096), + (256, 4096, 4096), + (512, 256, 1024), + (512, 8192, 4096), + (512, 16384, 128), + (512, 24576, 128), +] + + +# Test working with a subset of A and B for sparse matmul +@pytest.mark.skip(reason="2of4 sparse w16a16 CUTLASS produces bad output.") +@pytest.mark.skipif(not sparse_cutlass_supported(), + reason="Sparse CUTLASS is not supported on this GPU type.") +@pytest.mark.parametrize("m, k, n", MNK_FACTORS) 
+@pytest.mark.parametrize("dtype", [torch.bfloat16, torch.float16]) +def test_cutlass_sparse_gemm(m: int, k: int, n: int, dtype: Type[torch.dtype]): + + # Create tensors + b_comp, e, a, b = make_rand_sparse_tensors(dtype, m, n, k) + scale_a = torch.ones((1, 1), device="cuda", dtype=torch.float32) + scale_b = torch.ones((1, 1), device="cuda", dtype=torch.float32) + + out = ops.cutlass_scaled_sparse_mm(a, + b_comp, + e, + scale_a, + scale_b, + out_dtype=dtype) + baseline = F.linear(a, b.T) + + torch.testing.assert_close(out, baseline, rtol=1e-2, atol=1e-2) + + +@pytest.mark.skipif(not sparse_cutlass_supported(), + reason="Sparse CUTLASS is not supported on this GPU type.") +@pytest.mark.parametrize("m, k, n", MNK_FACTORS) +@pytest.mark.skipif(not current_platform.has_device_capability(89), + reason="FP8 is not supported on this GPU type.") +def test_cutlass_sparse_fp8_gemm(m: int, n: int, k: int): + + # Create tensors + b_comp, e, a, b = make_rand_sparse_tensors(torch.float8_e4m3fn, m, n, k) + scale_a = (torch.randn((1, 1), device="cuda", dtype=torch.float32)) + scale_b = (torch.randn((1, 1), device="cuda", dtype=torch.float32)) + + out = ops.cutlass_scaled_sparse_mm(a, + b_comp, + e, + scale_a, + scale_b, + out_dtype=torch.bfloat16) + + baseline = baseline_scaled_mm(a, + b, + scale_a, + scale_b, + out_dtype=torch.bfloat16) + + torch.testing.assert_close(out, baseline, rtol=1e0, atol=2e0) + + +@pytest.mark.skipif(not sparse_cutlass_supported(), + reason="Sparse CUTLASS is not supported on this GPU type.") +@pytest.mark.parametrize("m,k,n", MNK_FACTORS) +@pytest.mark.parametrize("per_act_token", [True, False]) +@pytest.mark.parametrize("per_out_ch", [True, False]) +@pytest.mark.parametrize("use_bias", [True, False]) +def test_cutlass_sparse_int8_gemm(m: int, n: int, k: int, per_act_token: bool, + per_out_ch: bool, use_bias: bool): + + # Create tensors + b_comp, e, a, b = make_rand_sparse_tensors(torch.int8, m, n, k) + scale_a = (torch.randn((1, 1), device="cuda", dtype=torch.float32)) + scale_b = (torch.randn((1, 1), device="cuda", dtype=torch.float32)) + + out = ops.cutlass_scaled_sparse_mm(a, + b_comp, + e, + scale_a, + scale_b, + out_dtype=torch.bfloat16) + + baseline = baseline_scaled_mm(a, + b, + scale_a, + scale_b, + out_dtype=torch.bfloat16) + + torch.testing.assert_close(out, baseline, rtol=1e0, atol=2e0) diff --git a/tests/kernels/test_flash_attn.py b/tests/kernels/test_flash_attn.py index 1ae78d7b46c5..0ee0bf6c6a37 100644 --- a/tests/kernels/test_flash_attn.py +++ b/tests/kernels/test_flash_attn.py @@ -4,8 +4,10 @@ import torch from vllm.platforms import current_platform -from vllm.vllm_flash_attn import (flash_attn_varlen_func, - flash_attn_with_kvcache) +from vllm.vllm_flash_attn import (fa_version_unsupported_reason, + flash_attn_varlen_func, + flash_attn_with_kvcache, + is_fa_version_supported) NUM_HEADS = [(4, 4), (8, 2), (16, 2)] HEAD_SIZES = [128, 256] @@ -80,6 +82,7 @@ def ref_paged_attn( @pytest.mark.parametrize("soft_cap", [None, 10.0, 50.0]) @pytest.mark.parametrize("num_blocks", NUM_BLOCKS) @pytest.mark.parametrize("sliding_window", [None, 256]) +@pytest.mark.parametrize("fa_version", [2, 3]) @torch.inference_mode() def test_flash_attn_with_paged_kv( use_out: bool, @@ -91,8 +94,13 @@ def test_flash_attn_with_paged_kv( soft_cap: Optional[float], num_blocks: int, sliding_window: Optional[int], + fa_version: int, ) -> None: torch.set_default_device("cuda") + if not is_fa_version_supported(fa_version): + pytest.skip(f"Flash attention version {fa_version} not supported due " 
+ f"to: \"{fa_version_unsupported_reason(fa_version)}\"") + current_platform.seed_everything(0) num_seqs = len(kv_lens) num_query_heads = num_heads[0] @@ -131,6 +139,7 @@ def test_flash_attn_with_paged_kv( cache_seqlens=kv_lens_tensor, softcap=soft_cap if soft_cap is not None else 0, window_size=window_size, + fa_version=fa_version, ) output = output if not use_out else out output = output.squeeze(1) @@ -159,6 +168,7 @@ def test_flash_attn_with_paged_kv( @pytest.mark.parametrize("dtype", DTYPES) @pytest.mark.parametrize("soft_cap", [None, 10.0, 50.0]) @pytest.mark.parametrize("num_blocks", NUM_BLOCKS) +@pytest.mark.parametrize("fa_version", [2, 3]) @torch.inference_mode() def test_varlen_with_paged_kv( use_out: bool, @@ -170,8 +180,12 @@ def test_varlen_with_paged_kv( block_size: int, soft_cap: Optional[float], num_blocks: int, + fa_version: int, ) -> None: torch.set_default_device("cuda") + if not is_fa_version_supported(fa_version): + pytest.skip(f"Flash attention version {fa_version} not supported due " + f"to: \"{fa_version_unsupported_reason(fa_version)}\"") current_platform.seed_everything(0) num_seqs = len(seq_lens) query_lens = [x[0] for x in seq_lens] @@ -198,9 +212,7 @@ def test_varlen_with_paged_kv( cu_query_lens = torch.tensor([0] + query_lens, dtype=torch.int32).cumsum(dim=0, dtype=torch.int32) - cu_kv_lens = torch.tensor([0] + kv_lens, - dtype=torch.int32).cumsum(dim=0, - dtype=torch.int32) + kv_lens = torch.tensor(kv_lens, dtype=torch.int32) max_num_blocks_per_seq = (max_kv_len + block_size - 1) // block_size block_tables = torch.randint(0, @@ -215,7 +227,7 @@ def test_varlen_with_paged_kv( v=value_cache, out=out, cu_seqlens_q=cu_query_lens, - cu_seqlens_k=cu_kv_lens, + seqused_k=kv_lens, max_seqlen_q=max_query_len, max_seqlen_k=max_kv_len, softmax_scale=scale, @@ -223,6 +235,7 @@ def test_varlen_with_paged_kv( window_size=window_size, block_table=block_tables, softcap=soft_cap if soft_cap is not None else 0, + fa_version=fa_version, ) output = output if not use_out else out diff --git a/tests/kernels/test_flashinfer.py b/tests/kernels/test_flashinfer.py index a2c8f7166573..1645ef911d69 100644 --- a/tests/kernels/test_flashinfer.py +++ b/tests/kernels/test_flashinfer.py @@ -133,17 +133,19 @@ def test_flashinfer_decode_with_paged_kv( use_tensor_cores=( (num_query_heads//num_kv_heads) > 4) ) - wrapper.begin_forward(kv_indptr, - kv_indices, - kv_last_page_lens, - num_query_heads, - num_kv_heads, - head_size, - block_size, - "NONE", - data_type=dtype) - - output = wrapper.forward(query, key_value_cache, logits_soft_cap=soft_cap) + wrapper.plan(kv_indptr, + kv_indices, + kv_last_page_lens, + num_query_heads, + num_kv_heads, + head_size, + block_size, + "NONE", + q_data_type=dtype, + kv_data_type=dtype, + logits_soft_cap=soft_cap) + + output = wrapper.run(query, key_value_cache) ref_output = ref_paged_attn(query=query, key_cache=key_cache, @@ -228,7 +230,7 @@ def test_flashinfer_prefill_with_paged_kv(seq_lens: List[Tuple[int, int]], workspace_buffer = torch.empty(128 * 1024 * 1024, dtype=torch.int8) wrapper = flashinfer.BatchPrefillWithPagedKVCacheWrapper( workspace_buffer, "NHD") - wrapper.begin_forward( + wrapper.plan( qo_indptr, kv_indptr, kv_indices, @@ -237,12 +239,14 @@ def test_flashinfer_prefill_with_paged_kv(seq_lens: List[Tuple[int, int]], num_kv_heads, head_size, block_size, + q_data_type=dtype, + kv_data_type=dtype, + logits_soft_cap=soft_cap, ) - output = wrapper.forward( + output = wrapper.run( query, key_value_cache, - logits_soft_cap=soft_cap, ) ref_output = 
ref_paged_attn(query=query, @@ -253,7 +257,7 @@ def test_flashinfer_prefill_with_paged_kv(seq_lens: List[Tuple[int, int]], block_tables=block_tables, scale=scale, soft_cap=soft_cap) - torch.testing.assert_close(output, ref_output, atol=1e-2, rtol=1e-2), \ + torch.testing.assert_close(output, ref_output, atol=5e-2, rtol=1e-2), \ f"{torch.max(torch.abs(output - ref_output))}" @@ -332,7 +336,7 @@ def test_flashinfer_prefill_with_paged_fp8_kv( workspace_buffer = torch.empty(128 * 1024 * 1024, dtype=torch.int8) wrapper = flashinfer.BatchPrefillWithPagedKVCacheWrapper( workspace_buffer, "NHD") - wrapper.begin_forward( + wrapper.plan( qo_indptr, kv_indptr, kv_indices, @@ -341,13 +345,12 @@ def test_flashinfer_prefill_with_paged_fp8_kv( num_kv_heads, head_size, block_size, + q_data_type=dtype, + kv_data_type=kv_cache_dtype, + logits_soft_cap=soft_cap, ) - output = wrapper.forward(query, - kv_cache_fp8, - logits_soft_cap=soft_cap, - k_scale=k_scale, - v_scale=v_scale) + output = wrapper.run(query, kv_cache_fp8, k_scale=k_scale, v_scale=v_scale) ref_output = ref_paged_attn(query=query, key_cache=key_cache.squeeze(1), @@ -360,7 +363,7 @@ def test_flashinfer_prefill_with_paged_fp8_kv( del query del block_tables # verify prefill fp8 - torch.testing.assert_close(output, ref_output, atol=1e-2, rtol=1e-2), \ + torch.testing.assert_close(output, ref_output, atol=5e-2, rtol=1e-2), \ f"{torch.max(torch.abs(output - ref_output))}" @@ -439,21 +442,18 @@ def test_flashinfer_decode_with_paged_fp8_kv( wrapper = flashinfer.\ BatchDecodeWithPagedKVCacheWrapper(workspace_buffer, "NHD", use_tensor_cores=use_tensor_cores) - wrapper.begin_forward(kv_indptr, - kv_indices, - kv_last_page_lens, - num_query_heads, - num_kv_heads, - head_size, - block_size, - "NONE", - data_type=dtype, - q_data_type=dtype) - output = wrapper.forward(query, - kv_cache_fp8, - logits_soft_cap=soft_cap, - k_scale=k_scale, - v_scale=v_scale) + wrapper.plan(kv_indptr, + kv_indices, + kv_last_page_lens, + num_query_heads, + num_kv_heads, + head_size, + block_size, + "NONE", + q_data_type=dtype, + kv_data_type=kv_cache_dtype, + logits_soft_cap=soft_cap) + output = wrapper.run(query, kv_cache_fp8, k_scale=k_scale, v_scale=v_scale) key_cache = key_value_cache[:, 0, :, :, :].squeeze(1) value_cache = key_value_cache[:, 1, :, :, :].squeeze(1) diff --git a/tests/kernels/test_mha_attn.py b/tests/kernels/test_mha_attn.py new file mode 100644 index 000000000000..eab874e9e02b --- /dev/null +++ b/tests/kernels/test_mha_attn.py @@ -0,0 +1,126 @@ +""" +Test: + +* Tests for MultiHeadAttention layer +""" +from unittest.mock import patch + +import pytest +import torch + +from vllm.attention.layer import MultiHeadAttention +from vllm.attention.selector import _Backend, _cached_get_attn_backend +from vllm.platforms import current_platform +from vllm.platforms.cpu import CpuPlatform +from vllm.platforms.cuda import CudaPlatform +from vllm.platforms.rocm import RocmPlatform + + +@pytest.fixture(autouse=True) +def clear_cache(): + """Clear lru cache to ensure each test case runs without caching. + """ + _cached_get_attn_backend.cache_clear() + + +@pytest.mark.parametrize("device", ["cpu", "hip", "cuda"]) +def test_mha_attn_platform(device: str): + """ + Test the attention selector between different platform and device. 
+ """ + torch.set_default_dtype(torch.float16) + + if device == "cpu": + with patch("vllm.attention.selector.current_platform", CpuPlatform()): + attn = MultiHeadAttention(16, 64, scale=1) + assert attn.attn_backend == _Backend.TORCH_SDPA + elif device == "hip": + with patch("vllm.attention.selector.current_platform", RocmPlatform()): + attn = MultiHeadAttention(16, 64, scale=1) + assert attn.attn_backend == _Backend.TORCH_SDPA + else: + with patch("vllm.attention.selector.current_platform", CudaPlatform()): + attn = MultiHeadAttention(16, 64, scale=1) + assert attn.attn_backend == _Backend.XFORMERS + + with patch("vllm.attention.selector.current_platform", CudaPlatform()): + attn = MultiHeadAttention(16, 72, scale=1) + assert attn.attn_backend == _Backend.XFORMERS + + +def ref_attention( + query: torch.Tensor, + key: torch.Tensor, + value: torch.Tensor, + scale: float, +) -> torch.Tensor: + """ + Native implementation of scaled dot product attention without mask: + - query, key, value: [batch_size, seq_len, num_heads, head_size] + - attn_mask: [batch_size, seq_len, seq_len] + """ + query, key, value = (x.transpose(1, 2) for x in (query, key, value)) + attn_weights = scale * torch.matmul(query, key.transpose(2, 3)) + attn_weights = torch.softmax(attn_weights, dim=-1).to(value.dtype) + out = torch.matmul(attn_weights, value).transpose(1, 2) + return out + + +BATCH_SIZES = [1, 16] +SEQ_LENS = [1] +NUM_HEADS = [1, 16] +NUM_KV_HEADS = [1] +HEAD_SIZES = [64, 80] +# flshattF and tritonflashattF supported: {torch.float16, torch.bfloat16} +DTYPES = [ + torch.half, torch.bfloat16, torch.float +] if not current_platform.is_rocm() else [torch.half, torch.bfloat16] +CUDA_DEVICES = ["cuda"] + + +@pytest.mark.parametrize("batch_size", BATCH_SIZES) +@pytest.mark.parametrize("seq_len", SEQ_LENS) +@pytest.mark.parametrize("num_heads", NUM_HEADS) +@pytest.mark.parametrize("num_kv_heads", NUM_KV_HEADS) +@pytest.mark.parametrize("head_size", HEAD_SIZES) +@pytest.mark.parametrize("dtype", DTYPES) +@pytest.mark.parametrize("device", CUDA_DEVICES) +def test_mha_attn_forward( + batch_size: int, + seq_len: int, + num_heads: int, + num_kv_heads: int, + head_size: int, + dtype: torch.dtype, + device: str, +): + current_platform.seed_everything(0) + torch.set_default_device(device) + torch.set_default_dtype(dtype) + + q = torch.randn(batch_size, seq_len, num_heads * head_size) + k = torch.randn(batch_size, seq_len, num_kv_heads * head_size) + v = torch.randn(batch_size, seq_len, num_kv_heads * head_size) + scale = 1.0 / head_size**0.5 + attn = MultiHeadAttention(num_heads, + head_size, + scale=scale, + num_kv_heads=num_kv_heads) + output = attn(q, k, v) + + assert num_heads % num_kv_heads == 0 + num_queries_per_kv = num_heads // num_kv_heads + q = q.reshape(batch_size, seq_len, num_heads, head_size) + k = k.reshape(batch_size, seq_len, num_kv_heads, head_size) + v = v.reshape(batch_size, seq_len, num_kv_heads, head_size) + if num_queries_per_kv > 1: + k = torch.repeat_interleave(k, num_queries_per_kv, dim=2) + v = torch.repeat_interleave(v, num_queries_per_kv, dim=2) + + ref_output = ref_attention( + q, + k, + v, + scale=scale, + ).reshape(batch_size, seq_len, num_heads * head_size) + torch.testing.assert_close(output, ref_output) diff --git a/tests/kernels/test_prefix_prefill.py b/tests/kernels/test_prefix_prefill.py index 3fdb7996ba4e..10e73ab950b0 100644 --- a/tests/kernels/test_prefix_prefill.py +++ b/tests/kernels/test_prefix_prefill.py @@ -138,6 +138,7 @@ def test_contexted_kv_attention( # to 
V_cache[num_blocks, num_kv_heads, head_size, block_size] v_cache = v_cache.view(-1, block_size, num_kv_heads, head_size).permute(0, 2, 3, 1).contiguous() + k_scale = v_scale = torch.tensor(1.0, dtype=torch.float32, device=device) # Warm up the Triton kernel by calling it once before actually measuring # generation time @@ -153,6 +154,8 @@ def test_contexted_kv_attention( b_seq_len, b_ctx_len, max_input_len, + k_scale, + v_scale, sliding_window=sliding_window) torch.cuda.synchronize() start_time = time.time() @@ -168,6 +171,8 @@ def test_contexted_kv_attention( b_seq_len, b_ctx_len, max_input_len, + k_scale, + v_scale, sliding_window=sliding_window) torch.cuda.synchronize() end_time = time.time() @@ -366,6 +371,7 @@ def _get_alibi_slopes(total_num_heads: int) -> torch.Tensor: # to V_cache[num_blocks, num_kv_heads, head_size, block_size] v_cache = v_cache.view(-1, block_size, num_kv_heads, head_size).permute(0, 2, 3, 1).contiguous() + k_scale = v_scale = torch.tensor(1.0, dtype=torch.float32, device=device) # Warm up the Triton kernel by calling it once before actually measuring # generation time @@ -381,6 +387,8 @@ def _get_alibi_slopes(total_num_heads: int) -> torch.Tensor: b_seq_len, b_ctx_len, max_input_len, + k_scale, + v_scale, alibi_slopes=alibi_slopes) torch.cuda.synchronize() start_time = time.time() @@ -396,6 +404,8 @@ def _get_alibi_slopes(total_num_heads: int) -> torch.Tensor: b_seq_len, b_ctx_len, max_input_len, + k_scale, + v_scale, alibi_slopes=alibi_slopes) torch.cuda.synchronize() end_time = time.time() diff --git a/tests/kernels/test_semi_structured.py b/tests/kernels/test_semi_structured.py deleted file mode 100644 index 4316d6ab30e3..000000000000 --- a/tests/kernels/test_semi_structured.py +++ /dev/null @@ -1,134 +0,0 @@ -"""Tests for sparse cutlass kernels - -Run `pytest tests/kernels/test_semi_structured.py`. 
-""" -from typing import Optional, Tuple, Type - -import pytest -import torch - -from vllm import _custom_ops as ops -from vllm.model_executor.layers.quantization.utils.w8a8_utils import ( - sparse_cutlass_supported) -from vllm.platforms import current_platform - -CUDA_DEVICES = [ - f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2) -] - -capability = current_platform.get_device_capability() -capability = capability[0] * 10 + capability[1] - - -def to_fp8(tensor: torch.Tensor): - finfo = torch.finfo(torch.float8_e4m3fn) - return torch.round(tensor.clamp( - min=finfo.min, max=finfo.max)).to(dtype=torch.float8_e4m3fn) - - -def to_int8(tensor: torch.Tensor): - return torch.round(tensor.clamp(min=-128, max=127)).to(dtype=torch.int8) - - -def rand_int8(shape: tuple, device: str = "cuda"): - return to_int8(torch.rand(shape, device=device) * 255 - 128) - - -def to_bf16(tensor: torch.Tensor) -> torch.Tensor: - return tensor.to(dtype=torch.bfloat16) - - -def to_fp16(tensor: torch.Tensor) -> torch.Tensor: - return tensor.to(dtype=torch.float16) - - -def prune_to_2_4(tensor): - # Reshape tensor to [N, 4] where N is number of groups of 4 - original_shape = tensor.shape - reshaped = tensor.reshape(-1, 4) - - # Get indices of top 2 absolute values in each group of 4 - _, indices = torch.topk(torch.abs(reshaped), k=2, dim=1) - - # Create binary mask - mask = torch.zeros_like(reshaped) - mask.scatter_(dim=1, - index=indices, - src=torch.ones_like(indices, dtype=mask.dtype)) - - # Apply mask and reshape back - pruned = reshaped * mask - - # Turn all -0.0 to 0.0 - pruned[pruned == -0.0] = 0.0 - - return pruned.reshape(original_shape) - - -def make_rand_sparse_tensors( - dtype: torch.dtype, m: int, n: int, k: int -) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: - a = torch.randn((m, k), device='cuda') * 5 - b = torch.randn((n, k), device='cuda').t() * 5 - - b = prune_to_2_4(b.t()).t() - - if dtype == torch.int8: - a, b = to_int8(a), to_int8(b) - elif dtype == torch.float8_e4m3fn: - a, b = to_fp8(a), to_fp8(b) - elif dtype == torch.float16: - a, b = to_fp16(a), to_fp16(b) - elif dtype == torch.bfloat16: - a, b = to_bf16(a), to_bf16(b) - else: - raise ValueError("unsupported dtype") - - b_compressed, e = ops.cutlass_sparse_compress(b.t()) - - # Compressed B, Metadata, Original A, B - return b_compressed, e, a, b - - -def baseline_scaled_mm(a: torch.Tensor, - b: torch.Tensor, - scale_a: torch.Tensor, - scale_b: torch.Tensor, - out_dtype: Type[torch.dtype], - bias: Optional[torch.Tensor] = None) -> torch.Tensor: - output = (scale_a * (scale_b * (torch.mm( - a.to(dtype=torch.float32), b.to(dtype=torch.float32))))).to(out_dtype) - if bias is not None: - output = output + bias - - return output - - -@pytest.mark.skipif(not sparse_cutlass_supported(), - reason="Sparse FP8 is not yet supported on this GPU type.") -# Test working with a subset of A and B for sparse matmul -def test_cutlass_sparse_subset(): - - big_m = 1024 - m, n, k = 512, 512, 512 - - # Create tensors - b_comp, e, whole_a, b = make_rand_sparse_tensors(torch.float8_e4m3fn, - big_m, n, k) - a = whole_a[0:m, 0:k] - scale_a = torch.randn((1, 1), device="cuda", dtype=torch.float32) / 10 - scale_b = torch.randn((1, 1), device="cuda", dtype=torch.float32) / 10 - - out = ops.cutlass_scaled_sparse_mm(a, - b_comp, - e, - scale_a, - scale_b, - out_dtype=torch.bfloat16) - baseline = baseline_scaled_mm(a, - b, - scale_a, - scale_b, - out_dtype=torch.bfloat16) - - torch.testing.assert_close(out, baseline, rtol=1e-1, 
atol=1e0) diff --git a/tests/kernels/test_triton_scaled_mm.py b/tests/kernels/test_triton_scaled_mm.py index 8e96a2f70d75..a5aab3c2ea4b 100644 --- a/tests/kernels/test_triton_scaled_mm.py +++ b/tests/kernels/test_triton_scaled_mm.py @@ -39,6 +39,23 @@ def get_8bit_types(): return types +# This test is to check regressions for int8 support on ROCm. +@pytest.mark.parametrize("model_path", [ + "neuralmagic/Llama-3.2-1B-quantized.w8a8", +]) +@pytest.mark.parametrize("max_tokens", [32]) +@pytest.mark.parametrize("num_logprobs", [10]) +@pytest.mark.skipif(not current_platform.is_rocm(), + reason="Should only run on ROCm") +def test_rocm_compressed_tensors_w8a8(vllm_runner, example_prompts, model_path, + max_tokens, num_logprobs): + dtype = "bfloat16" + + with vllm_runner(model_path, dtype=dtype) as vllm_model: + vllm_model.generate_greedy_logprobs(example_prompts, max_tokens, + num_logprobs) + + @pytest.mark.parametrize("M", [1, 33, 64, 512]) @pytest.mark.parametrize("N", [256, 971, 20486]) @pytest.mark.parametrize("K", [128, 496, 1024]) diff --git a/tests/kernels/utils.py b/tests/kernels/utils.py index c9f29014fddc..09ab818ebd52 100644 --- a/tests/kernels/utils.py +++ b/tests/kernels/utils.py @@ -5,7 +5,7 @@ import unittest from numbers import Number from typing import (Any, Dict, List, NamedTuple, Optional, Sequence, Tuple, - Union) + Type, Union) import pytest import torch @@ -1107,3 +1107,28 @@ def opcheck(op: Union[torch._ops.OpOverload, torch._ops.OpOverloadPacket, kwargs, test_utils=test_utils, raise_exception=raise_exception) if cond else {} + + +# For testing quantized linear kernels +def to_fp8(tensor: torch.Tensor): + finfo = torch.finfo(torch.float8_e4m3fn) + return torch.round(tensor.clamp( + min=finfo.min, max=finfo.max)).to(dtype=torch.float8_e4m3fn) + + +def to_int8(tensor: torch.Tensor): + return torch.round(tensor.clamp(min=-128, max=127)).to(dtype=torch.int8) + + +def baseline_scaled_mm(a: torch.Tensor, + b: torch.Tensor, + scale_a: torch.Tensor, + scale_b: torch.Tensor, + out_dtype: Type[torch.dtype], + bias: Optional[torch.Tensor] = None) -> torch.Tensor: + output = (scale_a * (scale_b * (torch.mm( + a.to(dtype=torch.float32), b.to(dtype=torch.float32))))).to(out_dtype) + if bias is not None: + output = output + bias + + return output diff --git a/tests/models/decoder_only/language/test_gguf.py b/tests/models/decoder_only/language/test_gguf.py index 81b93ebdf0fc..ad8f8a0c320e 100644 --- a/tests/models/decoder_only/language/test_gguf.py +++ b/tests/models/decoder_only/language/test_gguf.py @@ -66,12 +66,16 @@ def gguf_model(self): gguf_filename="starcoder2-3b.Q6_K.gguf", ) +DOLPHIN_CONFIG = GGUFTestConfig( + # Test VocabParallelEmbedding sharding issue. + original_model="cognitivecomputations/TinyDolphin-2.8-1.1b", + gguf_repo="tsunemoto/TinyDolphin-2.8-1.1b-GGUF", + gguf_filename="tinydolphin-2.8-1.1b.Q6_K.gguf", +) + MODELS = [ - LLAMA_CONFIG, - QWEN2_CONFIG, - PHI3_CONFIG, - GPT2_CONFIG, - STABLELM_CONFIG, + LLAMA_CONFIG, QWEN2_CONFIG, PHI3_CONFIG, GPT2_CONFIG, STABLELM_CONFIG, + DOLPHIN_CONFIG # STARCODER_CONFIG, # broken ] @@ -106,15 +110,18 @@ def test_models( messages, tokenize=False, add_generation_prompt=True) # Run unquantized model. 
- with vllm_runner(model_name=model.original_model, - dtype=dtype, - max_model_len=MAX_MODEL_LEN, - tensor_parallel_size=tp_size) as original_model: + with vllm_runner( + model_name=model.original_model, + enforce_eager=True, # faster tests + dtype=dtype, + max_model_len=MAX_MODEL_LEN, + tensor_parallel_size=tp_size) as original_model: original_outputs = original_model.generate_greedy_logprobs( example_prompts[:-1], max_tokens, num_logprobs) # Run gguf model. with vllm_runner(model_name=model.gguf_model, + enforce_eager=True, tokenizer_name=model.original_model, dtype=dtype, max_model_len=MAX_MODEL_LEN, diff --git a/tests/models/multimodal/processing/test_common.py b/tests/models/multimodal/processing/test_common.py index d6d3d3b34ad4..fe5b733c750a 100644 --- a/tests/models/multimodal/processing/test_common.py +++ b/tests/models/multimodal/processing/test_common.py @@ -35,7 +35,7 @@ def _test_processing_correctness( task="auto", tokenizer=model_id, tokenizer_mode="auto", - trust_remote_code=True, + trust_remote_code=model_info.trust_remote_code, seed=0, dtype="float16", revision=None, diff --git a/tests/models/registry.py b/tests/models/registry.py index e99dbd16c47b..0bd06dea0ec7 100644 --- a/tests/models/registry.py +++ b/tests/models/registry.py @@ -261,7 +261,8 @@ def check_available_online( trust_remote_code=True), "Qwen2AudioForConditionalGeneration": _HfExamplesInfo("Qwen/Qwen2-Audio-7B-Instruct"), # noqa: E501 "Qwen2VLForConditionalGeneration": _HfExamplesInfo("Qwen/Qwen2-VL-2B-Instruct"), # noqa: E501 - "UltravoxModel": _HfExamplesInfo("fixie-ai/ultravox-v0_3"), + "UltravoxModel": _HfExamplesInfo("fixie-ai/ultravox-v0_3", + trust_remote_code=True), # [Encoder-decoder] "MllamaForConditionalGeneration": _HfExamplesInfo("meta-llama/Llama-3.2-11B-Vision-Instruct"), # noqa: E501 "WhisperForConditionalGeneration": _HfExamplesInfo("openai/whisper-large-v3"), # noqa: E501 diff --git a/tests/multi_step/test_correctness_async_llm.py b/tests/multi_step/test_correctness_async_llm.py index 8456a463adee..b8524ed83026 100644 --- a/tests/multi_step/test_correctness_async_llm.py +++ b/tests/multi_step/test_correctness_async_llm.py @@ -16,7 +16,8 @@ NUM_PROMPTS = [10] DEFAULT_SERVER_ARGS: List[str] = [ - "--worker-use-ray", + "--distributed-executor-backend", + "ray", "--gpu-memory-utilization", "0.85", "--swap-space", diff --git a/tests/multimodal/test_processing.py b/tests/multimodal/test_processing.py index 9e58ed4cfde9..13f820d013e2 100644 --- a/tests/multimodal/test_processing.py +++ b/tests/multimodal/test_processing.py @@ -7,12 +7,16 @@ from vllm.config import ModelConfig from vllm.multimodal import MULTIMODAL_REGISTRY -from vllm.multimodal.processing import (PlaceholderInfo, PromptReplacement, +# yapf conflicts with isort for this block +# yapf: disable +from vllm.multimodal.processing import (PlaceholderFeaturesInfo, + PromptReplacement, find_mm_placeholders, find_text_matches, find_token_matches, iter_token_matches, replace_text_matches, replace_token_matches) +# yapf: enable from vllm.multimodal.profiling import MultiModalProfiler from vllm.multimodal.utils import cached_get_tokenizer from vllm.transformers_utils.tokenizer import AnyTokenizer @@ -433,19 +437,19 @@ def test_find_replace_tokens( [1, 9833, 28747, 32000, 9833, 28747, 32000, 32000, 918], { "pattern_1": [ - PlaceholderInfo( + PlaceholderFeaturesInfo( modality="pattern_1", item_idx=0, start_idx=6, - replacement=[32000, 32000], + tokens=[32000, 32000], ), ], "pattern_4": [ - PlaceholderInfo( + PlaceholderFeaturesInfo( 
modality="pattern_4", item_idx=0, start_idx=3, - replacement=[32000], + tokens=[32000], ), ], } @@ -455,25 +459,25 @@ def test_find_replace_tokens( [1, 32000, 32000, 9833, 28747, 32000, 32000, 1550, 918, 1550], { "pattern_1": [ - PlaceholderInfo( + PlaceholderFeaturesInfo( modality="pattern_1", item_idx=0, start_idx=1, - replacement=[32000, 32000], + tokens=[32000, 32000], ), - PlaceholderInfo( + PlaceholderFeaturesInfo( modality="pattern_1", item_idx=1, start_idx=5, - replacement=[32000, 32000], + tokens=[32000, 32000], ), ], "pattern_3": [ - PlaceholderInfo( + PlaceholderFeaturesInfo( modality="pattern_3", item_idx=0, start_idx=7, - replacement=[1550, 918, 1550], + tokens=[1550, 918, 1550], ), ], # No match for pattern_4 as it has lower priority than pattern_1 @@ -483,33 +487,33 @@ def test_find_replace_tokens( [1, 32000, 32000, 32000, 32000, 32000, 1550, 918, 1550], { "pattern_1": [ - PlaceholderInfo( + PlaceholderFeaturesInfo( modality="pattern_1", item_idx=0, start_idx=1, - replacement=[32000, 32000], + tokens=[32000, 32000], ), - PlaceholderInfo( + PlaceholderFeaturesInfo( modality="pattern_1", item_idx=1, start_idx=3, - replacement=[32000, 32000], + tokens=[32000, 32000], ), ], "pattern_4": [ - PlaceholderInfo( + PlaceholderFeaturesInfo( modality="pattern_4", item_idx=0, start_idx=5, - replacement=[32000], + tokens=[32000], ), ], "pattern_3": [ - PlaceholderInfo( + PlaceholderFeaturesInfo( modality="pattern_3", item_idx=0, start_idx=6, - replacement=[1550, 918, 1550], + tokens=[1550, 918, 1550], ), ], } diff --git a/tests/quantization/test_compressed_tensors.py b/tests/quantization/test_compressed_tensors.py index 0cd86cef0a47..bf0d454ad511 100644 --- a/tests/quantization/test_compressed_tensors.py +++ b/tests/quantization/test_compressed_tensors.py @@ -313,8 +313,10 @@ def check_model(model): assert output +@pytest.mark.skip(reason="2of4 sparse w16a16 CUTLASS produces bad output.") @pytest.mark.skipif(not sparse_cutlass_supported(), - reason="Sparse FP8 is not yet supported on this GPU type.") + reason="2of4 Sparse is not yet supported on this GPU type." 
+ ) @pytest.mark.parametrize( "args_2of4", [("nm-testing/TinyLlama-1.1B-Chat-v1.0-2of4-Sparse-Dense-Compressor")]) diff --git a/tests/samplers/test_seeded_generate.py b/tests/samplers/test_seeded_generate.py index 88067f19c8f0..bf1ee6c39783 100644 --- a/tests/samplers/test_seeded_generate.py +++ b/tests/samplers/test_seeded_generate.py @@ -31,7 +31,7 @@ def test_random_sample_with_seed( sampling_params = SamplingParams( # Parameters to ensure sufficient randomness - temperature=2.0, + temperature=3.0, top_p=min(random.random() + 0.3, 1), top_k=random.randint(5, 20), n=random.randint(1, 10), @@ -75,3 +75,8 @@ def test_random_sample_with_seed( # verify requests with the same seed match assert outputs[1] == outputs[4] assert outputs[2] == outputs[5] + + # verify generations within the same parallel sampling group differ + for output in outputs: + for sub_output_a, sub_output_b in combinations(output, 2): + assert sub_output_a != sub_output_b diff --git a/tests/tracing/test_tracing.py b/tests/tracing/test_tracing.py index fe5fc979c66a..49a16d16eb84 100644 --- a/tests/tracing/test_tracing.py +++ b/tests/tracing/test_tracing.py @@ -100,32 +100,32 @@ def test_traces(trace_service): attributes = decode_attributes( request.resource_spans[0].scope_spans[0].spans[0].attributes) - assert attributes.get(SpanAttributes.LLM_RESPONSE_MODEL) == model + assert attributes.get(SpanAttributes.GEN_AI_RESPONSE_MODEL) == model assert attributes.get( - SpanAttributes.LLM_REQUEST_ID) == outputs[0].request_id + SpanAttributes.GEN_AI_REQUEST_ID) == outputs[0].request_id + assert attributes.get(SpanAttributes.GEN_AI_REQUEST_TEMPERATURE + ) == sampling_params.temperature assert attributes.get( - SpanAttributes.LLM_REQUEST_TEMPERATURE) == sampling_params.temperature + SpanAttributes.GEN_AI_REQUEST_TOP_P) == sampling_params.top_p assert attributes.get( - SpanAttributes.LLM_REQUEST_TOP_P) == sampling_params.top_p - assert attributes.get( - SpanAttributes.LLM_REQUEST_MAX_TOKENS) == sampling_params.max_tokens - assert attributes.get(SpanAttributes.LLM_REQUEST_N) == sampling_params.n - assert attributes.get(SpanAttributes.LLM_USAGE_PROMPT_TOKENS) == len( + SpanAttributes.GEN_AI_REQUEST_MAX_TOKENS) == sampling_params.max_tokens + assert attributes.get(SpanAttributes.GEN_AI_REQUEST_N) == sampling_params.n + assert attributes.get(SpanAttributes.GEN_AI_USAGE_PROMPT_TOKENS) == len( outputs[0].prompt_token_ids) completion_tokens = sum(len(o.token_ids) for o in outputs[0].outputs) assert attributes.get( - SpanAttributes.LLM_USAGE_COMPLETION_TOKENS) == completion_tokens + SpanAttributes.GEN_AI_USAGE_COMPLETION_TOKENS) == completion_tokens metrics = outputs[0].metrics assert attributes.get( - SpanAttributes.LLM_LATENCY_TIME_IN_QUEUE) == metrics.time_in_queue + SpanAttributes.GEN_AI_LATENCY_TIME_IN_QUEUE) == metrics.time_in_queue ttft = metrics.first_token_time - metrics.arrival_time assert attributes.get( - SpanAttributes.LLM_LATENCY_TIME_TO_FIRST_TOKEN) == ttft + SpanAttributes.GEN_AI_LATENCY_TIME_TO_FIRST_TOKEN) == ttft e2e_time = metrics.finished_time - metrics.arrival_time - assert attributes.get(SpanAttributes.LLM_LATENCY_E2E) == e2e_time + assert attributes.get(SpanAttributes.GEN_AI_LATENCY_E2E) == e2e_time assert metrics.scheduler_time > 0 - assert attributes.get( - SpanAttributes.LLM_LATENCY_TIME_IN_SCHEDULER) == metrics.scheduler_time + assert attributes.get(SpanAttributes.GEN_AI_LATENCY_TIME_IN_SCHEDULER + ) == metrics.scheduler_time # Model forward and model execute should be none, since detailed traces is # not enabled. 
assert metrics.model_forward_time is None @@ -166,37 +166,37 @@ def test_traces_with_detailed_steps(trace_service): attributes = decode_attributes( request.resource_spans[0].scope_spans[0].spans[0].attributes) - assert attributes.get(SpanAttributes.LLM_RESPONSE_MODEL) == model + assert attributes.get(SpanAttributes.GEN_AI_RESPONSE_MODEL) == model assert attributes.get( - SpanAttributes.LLM_REQUEST_ID) == outputs[0].request_id + SpanAttributes.GEN_AI_REQUEST_ID) == outputs[0].request_id + assert attributes.get(SpanAttributes.GEN_AI_REQUEST_TEMPERATURE + ) == sampling_params.temperature assert attributes.get( - SpanAttributes.LLM_REQUEST_TEMPERATURE) == sampling_params.temperature + SpanAttributes.GEN_AI_REQUEST_TOP_P) == sampling_params.top_p assert attributes.get( - SpanAttributes.LLM_REQUEST_TOP_P) == sampling_params.top_p - assert attributes.get( - SpanAttributes.LLM_REQUEST_MAX_TOKENS) == sampling_params.max_tokens - assert attributes.get(SpanAttributes.LLM_REQUEST_N) == sampling_params.n - assert attributes.get(SpanAttributes.LLM_USAGE_PROMPT_TOKENS) == len( + SpanAttributes.GEN_AI_REQUEST_MAX_TOKENS) == sampling_params.max_tokens + assert attributes.get(SpanAttributes.GEN_AI_REQUEST_N) == sampling_params.n + assert attributes.get(SpanAttributes.GEN_AI_USAGE_PROMPT_TOKENS) == len( outputs[0].prompt_token_ids) completion_tokens = sum(len(o.token_ids) for o in outputs[0].outputs) assert attributes.get( - SpanAttributes.LLM_USAGE_COMPLETION_TOKENS) == completion_tokens + SpanAttributes.GEN_AI_USAGE_COMPLETION_TOKENS) == completion_tokens metrics = outputs[0].metrics assert attributes.get( - SpanAttributes.LLM_LATENCY_TIME_IN_QUEUE) == metrics.time_in_queue + SpanAttributes.GEN_AI_LATENCY_TIME_IN_QUEUE) == metrics.time_in_queue ttft = metrics.first_token_time - metrics.arrival_time assert attributes.get( - SpanAttributes.LLM_LATENCY_TIME_TO_FIRST_TOKEN) == ttft + SpanAttributes.GEN_AI_LATENCY_TIME_TO_FIRST_TOKEN) == ttft e2e_time = metrics.finished_time - metrics.arrival_time - assert attributes.get(SpanAttributes.LLM_LATENCY_E2E) == e2e_time + assert attributes.get(SpanAttributes.GEN_AI_LATENCY_E2E) == e2e_time assert metrics.scheduler_time > 0 - assert attributes.get( - SpanAttributes.LLM_LATENCY_TIME_IN_SCHEDULER) == metrics.scheduler_time + assert attributes.get(SpanAttributes.GEN_AI_LATENCY_TIME_IN_SCHEDULER + ) == metrics.scheduler_time assert metrics.model_forward_time > 0 assert attributes.get( - SpanAttributes.LLM_LATENCY_TIME_IN_MODEL_FORWARD) == pytest.approx( + SpanAttributes.GEN_AI_LATENCY_TIME_IN_MODEL_FORWARD) == pytest.approx( metrics.model_forward_time / 1000) assert metrics.model_execute_time > 0 - assert attributes.get(SpanAttributes.LLM_LATENCY_TIME_IN_MODEL_EXECUTE + assert attributes.get(SpanAttributes.GEN_AI_LATENCY_TIME_IN_MODEL_EXECUTE ) == metrics.model_execute_time assert metrics.model_forward_time < 1000 * metrics.model_execute_time diff --git a/tests/v1/core/test_prefix_caching.py b/tests/v1/core/test_prefix_caching.py index fafd9d0ce445..f434fa8c61a8 100644 --- a/tests/v1/core/test_prefix_caching.py +++ b/tests/v1/core/test_prefix_caching.py @@ -587,3 +587,72 @@ def test_prefill_not_enough_free_blocks_with_computed_blocks(): assert {block.ref_cnt for block in block_part1[:3]} == {1} # Block 3-5 are free. 
assert {block.ref_cnt for block in block_part1[3:]} == {0} + + +def test_reset_prefix_cache(): + manager = KVCacheManager( + block_size=16, + num_gpu_blocks=10, + max_model_len=8192, + sliding_window=None, + enable_caching=True, + num_preallocate_tokens=0, + ) + + full_block_token_ids = [i for i in range(3) for _ in range(16)] + unique_token_ids = [3] * 7 + all_token_ids = full_block_token_ids + unique_token_ids + req0 = make_request("0", all_token_ids) + blocks = manager.allocate_slots(req0, 55, []) + assert [b.block_id for b in blocks] == [0, 1, 2, 3] + + unique_token_ids = [4] * 7 + all_token_ids = full_block_token_ids + unique_token_ids + req1 = make_request("1", all_token_ids) + computed_blocks, _ = manager.get_computed_blocks(req1) + assert len(req1.kv_block_hashes) == 3 + assert len(computed_blocks) == 3 + blocks = manager.allocate_slots(req1, 7, computed_blocks) + assert [b.block_id for b in blocks] == [4] + + # Failed to reset prefix cache because some blocks are not freed yet. + assert not manager.reset_prefix_cache() + assert manager.cached_block_hash_to_block + + # Free the blocks. + manager.free(req0) + manager.free(req1) + + assert manager.reset_prefix_cache() + assert not manager.cached_block_hash_to_block + assert all([blk.block_hash is None for blk in manager.block_pool]) + + +def test_uncache_blocks(): + manager = KVCacheManager( + block_size=16, + num_gpu_blocks=10, + max_model_len=8192, + sliding_window=None, + enable_caching=True, + num_preallocate_tokens=0, + ) + + req0 = make_request("0", list(range(30))) + blocks = manager.allocate_slots(req0, 30, []) + assert [b.block_id for b in blocks] == [0, 1] + assert len(manager.cached_block_hash_to_block) == 1 + + req0.num_computed_tokens = 30 + + # Simulate speculative tokens. + for _ in range(5): + req0.append_output_token_ids(8) + manager.append_slots(req0, 5) + assert len(manager.cached_block_hash_to_block) == 2 + + # After sampling, assuming only 1 token is accepted. + req0.num_computed_tokens = 31 + num_uncached_blocks = manager.uncache_blocks(req0) + assert num_uncached_blocks == 1 + assert len(manager.cached_block_hash_to_block) == 1 diff --git a/tests/v1/engine/test_async_llm.py b/tests/v1/engine/test_async_llm.py index 2c805e18eeba..10f783b21a9e 100644 --- a/tests/v1/engine/test_async_llm.py +++ b/tests/v1/engine/test_async_llm.py @@ -1,4 +1,5 @@ import asyncio +from contextlib import ExitStack from typing import List, Tuple import pytest @@ -6,6 +7,7 @@ from vllm import SamplingParams from vllm.engine.arg_utils import AsyncEngineArgs from vllm.platforms import current_platform +from vllm.sampling_params import RequestOutputKind from vllm.v1.engine.async_llm import AsyncLLM if not current_platform.is_cuda(): @@ -18,28 +20,39 @@ async def generate(engine: AsyncLLM, request_id: str, + output_kind: RequestOutputKind, max_tokens: int) -> Tuple[int, str]: count = 0 - async for _ in engine.generate(request_id=request_id, - prompt="Hello my name is Robert and", - sampling_params=SamplingParams( - max_tokens=max_tokens, temperature=0)): + sampling_params = SamplingParams(max_tokens=max_tokens, + output_kind=output_kind, + temperature=0) + async for out in engine.generate(request_id=request_id, + prompt="Hello my name is Robert and", + sampling_params=sampling_params): + + num_tokens = len(out.outputs[0].token_ids) + if output_kind == RequestOutputKind.DELTA: + count += num_tokens + else: + count = num_tokens - count += 1 await asyncio.sleep(0.) 
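The counting logic in the helper above depends on the output kind: with RequestOutputKind.DELTA each streamed RequestOutput carries only the newly generated tokens, so the counts are summed, while FINAL_ONLY delivers a single output at the end holding the full token list, so its length is the total. A minimal standalone sketch of the two accumulation modes (plain Python with stand-in data, not vLLM's implementation):

from enum import Enum, auto
from typing import List


class OutputKind(Enum):
    # Stand-in for vllm.sampling_params.RequestOutputKind.
    DELTA = auto()       # each result carries only the new tokens
    FINAL_ONLY = auto()  # a single final result, carrying all tokens


def count_tokens(stream: List[List[int]], kind: OutputKind) -> int:
    count = 0
    for token_ids in stream:
        if kind == OutputKind.DELTA:
            count += len(token_ids)  # sum the increments
        else:
            count = len(token_ids)   # the final result already holds the total
    return count


assert count_tokens([[11], [12], [13]], OutputKind.DELTA) == 3
assert count_tokens([[11, 12, 13]], OutputKind.FINAL_ONLY) == 3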
return count, request_id +@pytest.mark.parametrize( + "output_kind", [RequestOutputKind.DELTA, RequestOutputKind.FINAL_ONLY]) @pytest.mark.asyncio -async def test_load(monkeypatch): +async def test_load(monkeypatch, output_kind: RequestOutputKind): # TODO(rickyx): Remove monkeypatch once we have a better way to test V1 # so that in the future when we switch, we don't have to change all the # tests. - with monkeypatch.context() as m: + with monkeypatch.context() as m, ExitStack() as after: m.setenv("VLLM_USE_V1", "1") engine = AsyncLLM.from_engine_args(ENGINE_ARGS) + after.callback(engine.shutdown) NUM_REQUESTS = 10000 NUM_EXPECTED_TOKENS = 10 @@ -51,26 +64,33 @@ async def test_load(monkeypatch): for request_id in request_ids: tasks.append( asyncio.create_task( - generate(engine, request_id, NUM_EXPECTED_TOKENS))) + generate(engine, request_id, output_kind, + NUM_EXPECTED_TOKENS))) # Confirm that we got all the EXPECTED tokens from the requests. - for task in tasks: + done, pending = await asyncio.wait(tasks, + return_when=asyncio.FIRST_EXCEPTION) + for task in pending: + task.cancel() + for task in done: num_generated_tokens, request_id = await task assert num_generated_tokens == NUM_EXPECTED_TOKENS, ( f"{request_id} generated {num_generated_tokens} but " f"expected {NUM_EXPECTED_TOKENS}") assert not engine.output_processor.has_unfinished_requests() - engine.shutdown() +@pytest.mark.parametrize( + "output_kind", [RequestOutputKind.DELTA, RequestOutputKind.FINAL_ONLY]) @pytest.mark.asyncio -async def test_abort(monkeypatch): +async def test_abort(monkeypatch, output_kind: RequestOutputKind): - with monkeypatch.context() as m: + with monkeypatch.context() as m, ExitStack() as after: m.setenv("VLLM_USE_V1", "1") engine = AsyncLLM.from_engine_args(ENGINE_ARGS) + after.callback(engine.shutdown) NUM_REQUESTS = 100 NUM_EXPECTED_TOKENS = 100 @@ -83,7 +103,8 @@ async def test_abort(monkeypatch): for request_id in request_ids: tasks.append( asyncio.create_task( - generate(engine, request_id, NUM_EXPECTED_TOKENS))) + generate(engine, request_id, output_kind, + NUM_EXPECTED_TOKENS))) # API server cancels requests when they disconnect. for idx in REQUEST_IDS_TO_ABORT: @@ -108,9 +129,7 @@ async def test_abort(monkeypatch): # Confirm we can do another generation. request_id = f"request-{REQUEST_IDS_TO_ABORT[0]}" task = asyncio.create_task( - generate(engine, request_id, NUM_EXPECTED_TOKENS)) + generate(engine, request_id, output_kind, NUM_EXPECTED_TOKENS)) num_generated_tokens, request_id = await task assert num_generated_tokens == NUM_EXPECTED_TOKENS assert not engine.output_processor.has_unfinished_requests() - - engine.shutdown() diff --git a/tests/v1/engine/test_engine_core.py b/tests/v1/engine/test_engine_core.py index cccfd305ac60..033bbcfce564 100644 --- a/tests/v1/engine/test_engine_core.py +++ b/tests/v1/engine/test_engine_core.py @@ -144,7 +144,7 @@ def test_engine_core(monkeypatch): def test_engine_core_advanced_sampling(monkeypatch): """ A basic end-to-end test to verify that the engine functions correctly - when additional sampling parameters, such as min_tokens and + when additional sampling parameters, such as top_p, min_tokens, and presence_penalty, are set. """ with monkeypatch.context() as m: @@ -167,11 +167,23 @@ def test_engine_core_advanced_sampling(monkeypatch): stop_token_ids=[1001, 1002], ) engine_core.add_request(request) - assert len(engine_core.scheduler.waiting) == 1 - assert len(engine_core.scheduler.running) == 0 - # Loop through until they are all done. 
- while len(engine_core.step().outputs) > 0: - pass - assert len(engine_core.scheduler.waiting) == 0 - assert len(engine_core.scheduler.running) == 0 + def _check_engine_state(): + assert len(engine_core.scheduler.waiting) == 1 + assert len(engine_core.scheduler.running) == 0 + # Loop through until they are all done. + while len(engine_core.step().outputs) > 0: + pass + assert len(engine_core.scheduler.waiting) == 0 + assert len(engine_core.scheduler.running) == 0 + + _check_engine_state() + + # Second request. + request2 = make_request() + request2.sampling_params = SamplingParams( + top_p=0.99, + top_k=50, + ) + engine_core.add_request(request2) + _check_engine_state() diff --git a/tests/v1/test_stats.py b/tests/v1/test_stats.py new file mode 100644 index 000000000000..580392ac5f44 --- /dev/null +++ b/tests/v1/test_stats.py @@ -0,0 +1,300 @@ +import pytest + +from vllm.sampling_params import SamplingParams +from vllm.v1.stats.common import RequestStats, RequestStatsUpdate + + +def make_update( + request_id: str, + update_type: RequestStatsUpdate.Type, + monotonic_ts_s: float, + **kwargs, +): + if update_type == RequestStatsUpdate.Type.INPUT_PROCESSED: + kwargs.setdefault("sampling_params", SamplingParams(n=1)) + kwargs.setdefault("num_prompt_tokens", 10) + elif update_type == RequestStatsUpdate.Type.PREFILLING: + kwargs.setdefault("num_computed_tokens", 10) + kwargs.setdefault("num_cached_tokens", 10) + elif update_type == RequestStatsUpdate.Type.DETOKENIZED: + kwargs.setdefault("num_new_tokens", 10) + elif update_type == RequestStatsUpdate.Type.FINISHED: + kwargs.setdefault("finish_reason", "test_reason") + + return RequestStatsUpdate( + request_id=request_id, + type=update_type, + monotonic_ts_s=monotonic_ts_s, + **kwargs, + ) + + +def test_invalid_request_update(): + request_id = "test_request" + update_specific_required_fields = { + RequestStatsUpdate.Type.INPUT_PROCESSED: [ + "sampling_params", + "num_prompt_tokens", + ], + RequestStatsUpdate.Type.PREFILLING: [ + "num_computed_tokens", + "num_cached_tokens", + ], + RequestStatsUpdate.Type.DETOKENIZED: ["num_new_tokens"], + RequestStatsUpdate.Type.FINISHED: ["finish_reason"], + } + + # Missing a required field should raise an assertion error. + for update_type in RequestStatsUpdate.Type: + required_fields = update_specific_required_fields.get(update_type, []) + + # Try to miss one of the required fields. + kwargs = {field: object() for field in required_fields} + for field in required_fields: + copy_kwargs = kwargs.copy() + copy_kwargs.pop(field) + with pytest.raises(ValueError): + RequestStatsUpdate( + request_id=request_id, + type=update_type, + **copy_kwargs, + ) + + +def test_invalid_request_update_transition(): + # Test invalid transition type. + for src in RequestStatsUpdate.Type: + for dst in RequestStatsUpdate.Type: + if dst not in RequestStatsUpdate._VALID_TRANSITIONS[src]: + with pytest.raises(AssertionError): + RequestStatsUpdate.check_valid_update( + make_update( + update_type=dst, + request_id="test_request", + monotonic_ts_s=1, + ), + last_update_type=src, + last_updated_ts_s=0, + ) + else: + RequestStatsUpdate.check_valid_update( + make_update( + request_id="test_request", + update_type=dst, + monotonic_ts_s=1, + ), + last_update_type=src, + last_updated_ts_s=0, + ) + + # Test invalid timestamp. 
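The loop above checks reachability against a transition table, and the block below checks timestamp ordering; together these are the two invariants a lifecycle-update validator enforces. A simplified standalone sketch of such a validator (illustrative states and names only, not vLLM's RequestStatsUpdate):

from enum import Enum, auto
from typing import Dict, Optional, Set


class UpdateType(Enum):
    # Illustrative subset of the request lifecycle states.
    ARRIVED = auto()
    QUEUED = auto()
    PREFILLING = auto()
    DECODING = auto()
    FINISHED = auto()


# Which update types may legally follow which.
VALID_TRANSITIONS: Dict[UpdateType, Set[UpdateType]] = {
    UpdateType.ARRIVED: {UpdateType.QUEUED},
    UpdateType.QUEUED: {UpdateType.PREFILLING},
    UpdateType.PREFILLING: {UpdateType.DECODING},
    UpdateType.DECODING: {UpdateType.DECODING, UpdateType.FINISHED},
    UpdateType.FINISHED: set(),
}


def validate_update(new_type: UpdateType, new_ts: float,
                    last_type: Optional[UpdateType],
                    last_ts: Optional[float]) -> None:
    if last_type is not None:
        assert new_type in VALID_TRANSITIONS[last_type], (
            f"invalid transition {last_type.name} -> {new_type.name}")
    if last_ts is not None:
        # Timestamps must be monotonically non-decreasing.
        assert new_ts >= last_ts, "timestamp went backwards"


validate_update(UpdateType.QUEUED, 1.0, UpdateType.ARRIVED, 0.5)  # passes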
+ with pytest.raises(AssertionError): + RequestStatsUpdate.check_valid_update( + make_update( + request_id="test_request", + update_type=RequestStatsUpdate.Type.ARRIVED, + monotonic_ts_s=1, + ), + last_update_type=None, + last_updated_ts_s=2, + ) + + +def test_lifecycle_updates(): + request_id = "test_request" + stats = RequestStats(request_id=request_id) + + # Test the below scenario: + arrived_ts = 0 + input_processed_ts = 1 + queued_ts = 2 + prefilling_ts = 3 + decoded_ts = 5 + detokenized_ts = 6 + decoded_2_ts = 7 + detokenized_2_ts = 8 + preempted_ts = 9 + resumed_ts = 10 + decoded_3_ts = 11 + detokenized_3_ts = 12 + finished_ts = 13 + + # Test ARRIVED + arrived_update = RequestStatsUpdate( + request_id=request_id, + type=RequestStatsUpdate.Type.ARRIVED, + monotonic_ts_s=arrived_ts, + ) + stats.update_from(arrived_update) + assert stats.arrival_ts_s == arrived_ts + assert stats.last_updated_ts_s == arrived_ts + + # Test INPUT_PROCESSED + sampling_params = SamplingParams(n=1) + input_processed_update = RequestStatsUpdate( + request_id=request_id, + type=RequestStatsUpdate.Type.INPUT_PROCESSED, + monotonic_ts_s=input_processed_ts, + sampling_params=sampling_params, + num_prompt_tokens=6, + ) + stats.update_from(input_processed_update) + assert stats.input_processor_end_ts_s == input_processed_ts + assert stats.last_updated_ts_s == input_processed_ts + assert stats.num_prompt_tokens == 6 + assert stats.sampling_params == sampling_params + + assert stats.first_token_ts_s is None + assert stats.prefill_ts_s is None + + # Test QUEUED + queued_update = RequestStatsUpdate( + request_id=request_id, + type=RequestStatsUpdate.Type.QUEUED, + monotonic_ts_s=queued_ts, + ) + stats.update_from(queued_update) + assert stats.queued_ts_s == queued_ts + assert stats.last_updated_ts_s == queued_ts + + # Test PREFILLING + prefilling_update = RequestStatsUpdate( + request_id=request_id, + type=RequestStatsUpdate.Type.PREFILLING, + monotonic_ts_s=prefilling_ts, + num_computed_tokens=3, + num_cached_tokens=1, + ) + stats.update_from(prefilling_update) + assert stats.prefill_ts_s == prefilling_ts + assert stats.num_computed_tokens == 3 + assert stats.num_cached_tokens == 1 + assert stats.queue_duration_s == prefilling_ts - queued_ts + + # Test DECODING + decoded_update = RequestStatsUpdate( + request_id=request_id, + type=RequestStatsUpdate.Type.DECODING, + monotonic_ts_s=decoded_ts, + ) + stats.update_from(decoded_update) + assert stats.last_updated_ts_s == decoded_ts + + # Test DETOKENIZED + detokenized_update = RequestStatsUpdate( + request_id=request_id, + type=RequestStatsUpdate.Type.DETOKENIZED, + monotonic_ts_s=detokenized_ts, + num_new_tokens=1, + ) + stats.update_from(detokenized_update) + assert stats.last_updated_ts_s == detokenized_ts + assert stats.num_output_tokens == 1 + # Since arrival + assert stats.first_token_latency_s == detokenized_ts - arrived_ts + # Since first scheduled + assert stats.prefill_latency_s == detokenized_ts - prefilling_ts + + # Test another DECODING and DETOKENIZED should + # yield correct inter token latency + decoded_update = RequestStatsUpdate( + request_id=request_id, + type=RequestStatsUpdate.Type.DECODING, + monotonic_ts_s=decoded_2_ts, + ) + stats.update_from(decoded_update) + + detokenized_update = RequestStatsUpdate( + request_id=request_id, + type=RequestStatsUpdate.Type.DETOKENIZED, + monotonic_ts_s=detokenized_2_ts, + num_new_tokens=1, + ) + stats.update_from(detokenized_update) + assert stats.output_token_latency_s_lst == [ + detokenized_2_ts - 
detokenized_ts, + ] + assert stats.num_output_tokens == 2 + + # Test PREEMPTED + preempted_update = RequestStatsUpdate( + request_id=request_id, + type=RequestStatsUpdate.Type.PREEMPTED, + monotonic_ts_s=preempted_ts, + ) + stats.update_from(preempted_update) + assert stats.last_updated_ts_s == preempted_ts + assert stats.preempted_ts_s_lst == [preempted_ts] + # States should be reset + assert stats.num_computed_tokens == 0 + assert stats.num_cached_tokens == 0 + # These states should not be reset + assert stats.num_output_tokens == 2 + assert stats.output_token_latency_s_lst == [ + detokenized_2_ts - detokenized_ts, + ] + assert stats.prefill_latency_s == prefilling_ts - arrived_ts + assert stats.num_prompt_tokens == 6 + assert stats.prefill_start_ts_s_lst == [prefilling_ts] + + # Test resumed + resumed_update = RequestStatsUpdate( + request_id=request_id, + type=RequestStatsUpdate.Type.PREFILLING, + monotonic_ts_s=resumed_ts, + num_computed_tokens=6, + num_cached_tokens=2, + ) + stats.update_from(resumed_update) + # prefill timestamp should not be updated since it's a resumed prefill + assert stats.prefill_ts_s == prefilling_ts + assert stats.num_computed_tokens == 6 + assert stats.num_cached_tokens == 2 + assert stats.prefill_start_ts_s_lst == [ + prefilling_ts, + resumed_ts, + ] + assert stats.last_updated_ts_s == resumed_ts + + # Test another DECODED/DETOKENIZED should yield correct first token latency. + decoded_update = RequestStatsUpdate( + request_id=request_id, + type=RequestStatsUpdate.Type.DECODING, + monotonic_ts_s=decoded_3_ts, + ) + detokenized_update = RequestStatsUpdate( + request_id=request_id, + type=RequestStatsUpdate.Type.DETOKENIZED, + monotonic_ts_s=detokenized_3_ts, + num_new_tokens=1, + ) + stats.update_from(decoded_update) + stats.update_from(detokenized_update) + assert stats.first_token_ts_s == detokenized_ts - arrived_ts + assert stats.num_output_tokens == 3 + assert stats.output_token_latency_s_lst == [ + detokenized_2_ts - detokenized_ts, + detokenized_3_ts - detokenized_2_ts, + ] + + # Test FINISHED + finished_update = RequestStatsUpdate( + request_id=request_id, + type=RequestStatsUpdate.Type.FINISHED, + monotonic_ts_s=finished_ts, + finish_reason="test_reason", + ) + stats.update_from(finished_update) + assert stats.last_updated_ts_s == finished_ts + assert stats.e2e_latency_s == finished_ts - arrived_ts + assert stats.inference_latency_s == finished_ts - prefilling_ts + assert stats.prefill_latency_s == detokenized_ts - prefilling_ts + assert stats.decode_latency_s == finished_ts - detokenized_ts + assert stats.first_token_latency_s == detokenized_ts - arrived_ts + assert stats.queue_duration_s == prefilling_ts - queued_ts + assert stats.is_finished + assert stats.finish_reason == "test_reason" + + # TODO(rickyx): Add model forward/execute time. 
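The FINISHED assertions above reduce to simple differences between the scenario's timestamps; the arithmetic being checked is summarized below (plain Python; the dictionary keys mirror the test's field names, not a public API):

# Timestamps from the scenario above (monotonic clock, in seconds).
arrived, queued, prefilling, first_token, finished = 0, 2, 3, 6, 13

latencies = {
    "queue_duration_s": prefilling - queued,         # 1: queued -> first scheduled
    "first_token_latency_s": first_token - arrived,  # 6: arrival -> first token
    "prefill_latency_s": first_token - prefilling,   # 3: first scheduled -> first token
    "decode_latency_s": finished - first_token,      # 7: first token -> finished
    "inference_latency_s": finished - prefilling,    # 10: first scheduled -> finished
    "e2e_latency_s": finished - arrived,             # 13: arrival -> finished
}
# Prefill and decode partition the inference window.
assert (latencies["prefill_latency_s"] + latencies["decode_latency_s"]
        == latencies["inference_latency_s"])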
+ assert stats.model_forward_duration_s == 0.0 + assert stats.model_execute_duration_s == 0.0 diff --git a/vllm/assets/image.py b/vllm/assets/image.py index cb831cb0b5bb..0a55506f8825 100644 --- a/vllm/assets/image.py +++ b/vllm/assets/image.py @@ -26,4 +26,4 @@ def image_embeds(self) -> torch.Tensor: """ image_path = get_vllm_public_assets(filename=f"{self.name}.pt", s3_prefix=VLM_IMAGES_DIR) - return torch.load(image_path, map_location="cpu") + return torch.load(image_path, map_location="cpu", weights_only=True) diff --git a/vllm/attention/backends/abstract.py b/vllm/attention/backends/abstract.py index 5008e1982109..fe097b346f3c 100644 --- a/vllm/attention/backends/abstract.py +++ b/vllm/attention/backends/abstract.py @@ -65,11 +65,6 @@ def make_metadata(cls, *args, **kwargs) -> "AttentionMetadata": def get_builder_cls() -> Type["AttentionMetadataBuilder"]: raise NotImplementedError - @classmethod - def make_metadata_builder(cls, *args, - **kwargs) -> "AttentionMetadataBuilder": - return cls.get_builder_cls()(*args, **kwargs) - @staticmethod @abstractmethod def get_kv_cache_shape( @@ -218,6 +213,12 @@ class AttentionMetadataBuilder(ABC, Generic[T]): @abstractmethod def __init__(self, input_builder: "ModelRunnerInputBuilderBase") -> None: + """Create the builder, remember some configuration and parameters.""" + raise NotImplementedError + + @abstractmethod + def prepare(self) -> None: + """Prepare for one batch.""" raise NotImplementedError @abstractmethod @@ -231,6 +232,8 @@ class AttentionLayer(Protocol): _k_scale: torch.Tensor _v_scale: torch.Tensor + _k_scale_float: torch.Tensor + _v_scale_float: torch.Tensor _q_scale: torch.Tensor _prob_scale: torch.Tensor diff --git a/vllm/attention/backends/flash_attn.py b/vllm/attention/backends/flash_attn.py old mode 100644 new mode 100755 index 9bee0d3789cc..8bfb5e461d6f --- a/vllm/attention/backends/flash_attn.py +++ b/vllm/attention/backends/flash_attn.py @@ -17,15 +17,21 @@ compute_slot_mapping_start_idx, get_num_prefill_decode_query_kv_tokens, get_seq_len_block_table_args, is_all_cross_attn_metadata_set, is_all_encoder_attn_metadata_set, is_block_tables_empty) +from vllm.envs import VLLM_FLASH_ATTN_VERSION +from vllm.logger import init_logger from vllm.multimodal import MultiModalPlaceholderMap +from vllm.platforms import current_platform from vllm.utils import async_tensor_h2d, make_tensor_with_pad +from vllm.vllm_flash_attn import (fa_version_unsupported_reason, + flash_attn_varlen_func, + flash_attn_with_kvcache, + is_fa_version_supported) if TYPE_CHECKING: from vllm.worker.model_runner import (ModelInputForGPUBuilder, ModelInputForGPUWithSamplingMetadata) -from vllm.vllm_flash_attn import (flash_attn_varlen_func, - flash_attn_with_kvcache) +logger = init_logger(__name__) class FlashAttentionBackend(AttentionBackend): @@ -377,6 +383,12 @@ class FlashAttentionMetadataBuilder( AttentionMetadataBuilder[FlashAttentionMetadata]): def __init__(self, input_builder: "ModelInputForGPUBuilder"): + self.input_builder = input_builder + self.runner = input_builder.runner + self.sliding_window = input_builder.sliding_window + self.block_size = input_builder.block_size + + def prepare(self): self.slot_mapping: List[int] = [] self.prefill_seq_lens: List[int] = [] self.context_lens: List[int] = [] @@ -390,11 +402,6 @@ def __init__(self, input_builder: "ModelInputForGPUBuilder"): self.num_decode_tokens = 0 self.has_prefix_cache_hit = False - self.input_builder = input_builder - self.runner = input_builder.runner - self.sliding_window = 
input_builder.sliding_window - self.block_size = input_builder.block_size - def _add_seq_group( self, inter_data: "ModelInputForGPUBuilder.InterDataForSeqGroup", chunked_prefill_enabled: bool, prefix_cache_hit: bool): @@ -636,6 +643,25 @@ def __init__( f"Supported head sizes are: {support_head_sizes}.") self.attn_type = attn_type + # If Hopper, default to FA3; otherwise stick to FA2 for now. + # TODO(lucas): profile FA3 on Ampere to see if it makes sense to + # use FA3 as the default for both. + if current_platform.get_device_capability()[0] >= 9: + self.fa_version = 3 if is_fa_version_supported(3) else 2 + else: + self.fa_version = 2 + + if VLLM_FLASH_ATTN_VERSION is not None: + assert VLLM_FLASH_ATTN_VERSION in [2, 3] + self.fa_version = VLLM_FLASH_ATTN_VERSION + + if not is_fa_version_supported(self.fa_version): + logger.error("Cannot use FA version %d: it is not supported (%s)", + self.fa_version, + fa_version_unsupported_reason(self.fa_version)) + + assert is_fa_version_supported(self.fa_version) + def forward( self, layer: AttentionLayer, @@ -661,7 +687,7 @@ def forward( NOTE: It in-place updates the output tensor. """ # NOTE(woosuk): FlashAttention does not support FP8 KV cache. - assert layer._k_scale == 1.0 and layer._v_scale == 1.0, ( + assert layer._k_scale_float == 1.0 and layer._v_scale_float == 1.0, ( "key/v_scale is not supported in FlashAttention.") assert output is not None, "Output tensor must be provided." @@ -755,6 +781,7 @@ def forward( alibi_slopes=alibi_slopes, softcap=logits_soft_cap, out=prefill_output, + fa_version=self.fa_version, ) else: # prefix-enabled attention @@ -768,7 +795,7 @@ def forward( v=value_cache, cu_seqlens_q=prefill_meta.query_start_loc, max_seqlen_q=prefill_meta.max_query_len, - cu_seqlens_k=prefill_meta.seq_start_loc, + seqused_k=prefill_meta.seq_lens_tensor, max_seqlen_k=max_seq_len, softmax_scale=softmax_scale, causal=True, @@ -777,6 +804,7 @@ def forward( block_table=prefill_meta.block_tables, softcap=logits_soft_cap, out=prefill_output, + fa_version=self.fa_version, ) if decode_meta := attn_metadata.decode_metadata: @@ -796,7 +824,7 @@ def forward( v=value_cache, cu_seqlens_q=decode_meta.query_start_loc, max_seqlen_q=decode_meta.max_decode_query_len, - cu_seqlens_k=decode_meta.seq_start_loc, + seqused_k=decode_meta.seq_lens_tensor, max_seqlen_k=decode_meta.max_decode_seq_len, softmax_scale=softmax_scale, causal=True, @@ -805,6 +833,7 @@ def forward( softcap=logits_soft_cap, block_table=decode_meta.block_tables, out=decode_output, + fa_version=self.fa_version, ) else: # Use flash_attn_with_kvcache for normal decoding.
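The version-selection logic added to __init__ above amounts to: prefer FA3 on Hopper (compute capability 9.x) when the installed vllm-flash-attn build supports it, otherwise fall back to FA2, and let VLLM_FLASH_ATTN_VERSION force an explicit choice. A condensed standalone sketch of that policy (the capability value and support check are injected stand-ins here, not the real vllm_flash_attn helpers):

from typing import Callable, Optional


def pick_fa_version(major_capability: int,
                    is_supported: Callable[[int], bool],
                    env_override: Optional[int] = None) -> int:
    """Select a FlashAttention version using injectable checks."""
    # Hopper (SM 9.x) defaults to FA3 when the build supports it.
    if major_capability >= 9 and is_supported(3):
        version = 3
    else:
        version = 2
    # An explicit VLLM_FLASH_ATTN_VERSION setting overrides the heuristic.
    if env_override is not None:
        assert env_override in (2, 3)
        version = env_override
    assert is_supported(version), f"FA{version} is not supported by this build"
    return version


assert pick_fa_version(9, lambda v: True) == 3    # e.g. H100 with an FA3 build
assert pick_fa_version(8, lambda v: v == 2) == 2  # e.g. A100, FA2 only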
@@ -825,6 +854,7 @@ def forward( alibi_slopes=alibi_slopes, softcap=logits_soft_cap, out=decode_output.unsqueeze(1), + fa_version=self.fa_version, ) return output diff --git a/vllm/attention/backends/flashinfer.py b/vllm/attention/backends/flashinfer.py index 582780046155..6c10e23dc434 100644 --- a/vllm/attention/backends/flashinfer.py +++ b/vllm/attention/backends/flashinfer.py @@ -1,3 +1,4 @@ +import dataclasses from collections import defaultdict from contextlib import contextmanager from dataclasses import dataclass @@ -13,9 +14,11 @@ from vllm.vllm_flash_attn import flash_attn_varlen_func FLASHINFER_WORKSPACE_BUFFER_SIZE = 256 * 1024 * 1024 except ImportError: - BatchDecodeWithPagedKVCacheWrapper = None - CUDAGraphBatchDecodeWithPagedKVCacheWrapper = None - BatchPrefillWithPagedKVCacheWrapper = None + # Avoid turning these types into variables during type checking + if not TYPE_CHECKING: + BatchDecodeWithPagedKVCacheWrapper = None + CUDAGraphBatchDecodeWithPagedKVCacheWrapper = None + BatchPrefillWithPagedKVCacheWrapper = None FLASHINFER_WORKSPACE_BUFFER_SIZE = 0 import torch @@ -30,7 +33,9 @@ from vllm.attention.backends.utils import (PAD_SLOT_ID, compute_slot_mapping, compute_slot_mapping_start_idx, is_block_tables_empty) +from vllm.attention.layer import Attention from vllm.attention.ops.paged_attn import PagedAttention +from vllm.config import VllmConfig, get_current_vllm_config from vllm.utils import (async_tensor_h2d, get_kv_cache_torch_dtype, make_tensor_with_pad) @@ -99,6 +104,72 @@ def get_fp8_dtype_for_flashinfer(kv_cache_dtype: str) -> torch.dtype: raise ValueError(f"Unrecognized FP8 dtype: {kv_cache_dtype}") +@dataclass +class PerLayerParameters: + """ + Currently, FlashInfer backend only support models in which all layers share + the same values for the following hyperparameters. + """ + + window_left: int + logits_soft_cap: Optional[float] + sm_scale: float + + +def get_per_layer_parameters( + vllm_config: VllmConfig) -> Dict[str, PerLayerParameters]: + """ + Scan all attention layers and determine some hyperparameters + to use during `plan`. + """ + + layers = vllm_config.compilation_config.static_forward_context + per_layer_params: Dict[str, PerLayerParameters] = {} + + for key, layer in layers.items(): + assert isinstance(layer, Attention) + + impl = layer.impl + assert isinstance(impl, FlashInferImpl) + + # Infer hyperparameters from the attention layer + window_size = impl.sliding_window + window_left = window_size[0] if window_size is not None else -1 + logits_soft_cap = impl.logits_soft_cap + sm_scale = impl.scale + + per_layer_params[key] = PerLayerParameters(window_left, + logits_soft_cap, sm_scale) + + return per_layer_params + + +def infer_global_hyperparameters( + per_layer_params: Dict[str, PerLayerParameters]) -> PerLayerParameters: + """ + Currently, FlashInfer backend only support models in which all layers share + the same values for the following hyperparameters: + - `window_left` + - `logits_soft_cap` + - `sm_scale` + + So this function asserts that all layers share the same values for these + hyperparameters and returns the global values. + """ + + assert len(per_layer_params) > 0, "No attention layers found in the model." 
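The uniformity check that follows relies on the field-wise __eq__ generated by @dataclass, so two PerLayerParameters instances compare equal exactly when window_left, logits_soft_cap, and sm_scale all match. A tiny self-contained illustration (using a stand-in dataclass, not the one defined above):

from dataclasses import dataclass
from typing import Optional


@dataclass
class LayerParams:
    # Stand-in with the same three fields as PerLayerParameters.
    window_left: int
    logits_soft_cap: Optional[float]
    sm_scale: float


a = LayerParams(-1, None, 0.125)
b = LayerParams(-1, None, 0.125)
c = LayerParams(4096, None, 0.125)  # e.g. a sliding-window layer
assert a == b  # all hyperparameters match -> equal
assert a != c  # any mismatch fails the whole comparison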
+ + param_sets = list(per_layer_params.values()) + global_params = param_sets[0] + for params in param_sets: + assert params == global_params, ( + "FlashInfer backend currently only supports models in which all " + "layers share the same values for the following hyperparameters: " + "`window_left`, `logits_soft_cap`, `sm_scale`.") + + return global_params + + class FlashInferState(AttentionState): def __init__(self, runner): @@ -108,6 +179,11 @@ def __init__(self, runner): self._decode_wrapper = None self._prefill_wrapper = None + # Global hyperparameters shared by all attention layers + self.global_hyperparameters: Optional[PerLayerParameters] = None + + self.vllm_config = get_current_vllm_config() + def _get_workspace_buffer(self): if self._workspace_buffer is None: self._workspace_buffer = torch.empty( @@ -215,6 +291,9 @@ def graph_capture_get_metadata_for_batch( batch_size + 1, dtype=torch.int32) + global_params = infer_global_hyperparameters( + get_per_layer_parameters(self.vllm_config)) + attn_metadata = self.runner.attn_backend.make_metadata( num_prefills=0, slot_mapping=self._graph_slot_mapping[:batch_size], @@ -238,7 +317,9 @@ def graph_capture_get_metadata_for_batch( q_data_type=self.runner.model_config.dtype, use_cuda_graph=True, decode_wrapper=self._graph_decode_wrapper, - prefill_wrapper=None) + prefill_wrapper=None, + **dataclasses.asdict(global_params), + ) attn_metadata.begin_forward() return attn_metadata @@ -325,9 +406,28 @@ class FlashInferMetadata(AttentionMetadata): data_type: torch.dtype = None # The data type of the query q_data_type: torch.dtype = None - device: torch.device = torch.device("cuda") + # FlashInfer 0.2 encourages passing host tensors + device: torch.device = torch.device("cpu") is_profile_run: bool = False + # The FlashInfer backend currently supports only models in which all layers + # share the same following hyperparameters: + + # The left (inclusive) window size for the attention window, when + # set to `-1`, the window size will be set to the full length of + # the sequence. Defaults to `-1`. + window_left: int = -1 + # The attention logits soft capping value (used in Gemini, Grok and + # Gemma-2, etc.), if not provided, will be set to `0`. If greater + # than 0, the logits will be capped according to formula: + # $$\texttt{logits\_soft\_cap} \times + # \mathrm{tanh}(x / \texttt{logits\_soft\_cap})$$, + # where $x$ is the input logits. + logits_soft_cap: Optional[float] = None + # The scale used in softmax, if not provided, will be set to + # `1.0 / sqrt(head_dim)`. 
+ sm_scale: Optional[float] = None + def __post_init__(self): # Refer to # https://github.com/flashinfer-ai/flashinfer/blob/3d55c71a62052c590c130897d3a3db49b14fcc34/include/flashinfer/utils.cuh#L157 @@ -363,14 +463,21 @@ def begin_forward(self): self.block_table_bound = self.block_table_bound.to(self.device) self.seq_lens_tensor = self.seq_lens_tensor.to(self.device) self.paged_kv_indices = self.paged_kv_indices.to(self.device) - self.prefill_wrapper.end_forward() - self.prefill_wrapper.begin_forward( + self.prefill_wrapper.plan( self.query_start_loc, self.paged_kv_indptr[:self.num_prefills + 1], self.paged_kv_indices, self.paged_kv_last_page_len[:self.num_prefills], - self.num_qo_heads, self.num_kv_heads, self.head_dim, - self.page_size) + self.num_qo_heads, + self.num_kv_heads, + self.head_dim, + self.page_size, + causal=True, + sm_scale=self.sm_scale, + window_left=self.window_left, + logits_soft_cap=self.logits_soft_cap, + q_data_type=self.q_data_type, + kv_data_type=self.data_type) if self.num_decode_tokens > 0: assert self.paged_kv_indices is not None assert self.paged_kv_indptr is not None @@ -386,8 +493,7 @@ def begin_forward(self): self.seq_lens_tensor = self.seq_lens_tensor.to(self.device) assert self.decode_wrapper is not None - self.decode_wrapper.end_forward() - self.decode_wrapper.begin_forward( + self.decode_wrapper.plan( self.paged_kv_indptr[self.num_prefills:], self.paged_kv_indices, self.paged_kv_last_page_len[self.num_prefills:], @@ -397,8 +503,11 @@ def begin_forward(self): self.page_size, # Disable flashinfer's pos encoding and use vllm's rope. pos_encoding_mode="NONE", + window_left=self.window_left, + logits_soft_cap=self.logits_soft_cap, + sm_scale=self.sm_scale, # kv-cache data type. - data_type=self.data_type, + kv_data_type=self.data_type, # query data type. q_data_type=self.q_data_type) @@ -489,6 +598,19 @@ def advance_step(self, class FlashInferMetadataBuilder(AttentionMetadataBuilder[FlashInferMetadata]): def __init__(self, input_builder: "ModelInputForGPUBuilder"): + + self.input_builder = input_builder + self.runner = input_builder.runner + + self.sliding_window = input_builder.sliding_window + self.block_size = input_builder.block_size + + # Global hyperparameters shared by all attention layers + self.global_hyperparameters: Optional[PerLayerParameters] = None + + self.vllm_config = get_current_vllm_config() + + def prepare(self): self.slot_mapping: List[int] = [] self.prefill_seq_lens: List[int] = [] self.context_lens: List[int] = [] @@ -501,12 +623,6 @@ def __init__(self, input_builder: "ModelInputForGPUBuilder"): self.num_prefill_tokens = 0 self.num_decode_tokens = 0 - self.input_builder = input_builder - self.runner = input_builder.runner - - self.sliding_window = input_builder.sliding_window - self.block_size = input_builder.block_size - # Please follow https://docs.flashinfer.ai/tutorials/kv_layout.html#page-layout # for the precise definition of the following fields. 
# An example: @@ -526,6 +642,20 @@ def __init__(self, input_builder: "ModelInputForGPUBuilder"): self.total_blocks = 0 self.is_profile_run: bool = False + if self.global_hyperparameters is None: + # Infer global hyperparameters, since currently we only support + # models in which all layers share the same values for the + # following hyperparameters: + # - `window_left` + # - `logits_soft_cap` + # - `sm_scale` + inferred_params = infer_global_hyperparameters( + get_per_layer_parameters(self.vllm_config)) + self.global_hyperparameters = inferred_params + self.window_left = inferred_params.window_left + self.logits_soft_cap = inferred_params.logits_soft_cap + self.sm_scale = inferred_params.sm_scale + def _add_seq_group( self, inter_data: "ModelInputForGPUBuilder.InterDataForSeqGroup", chunked_prefill_enabled: bool): @@ -754,7 +884,11 @@ def build(self, seq_lens: List[int], query_lens: List[int], data_type=kv_cache_dtype, q_data_type=self.runner.model_config.dtype, use_cuda_graph=use_captured_graph, - is_profile_run=self.is_profile_run) + is_profile_run=self.is_profile_run, + window_left=self.window_left, + logits_soft_cap=self.logits_soft_cap, + sm_scale=self.sm_scale, + ) class FlashInferImpl(AttentionImpl): @@ -884,25 +1018,34 @@ def forward( else: assert prefill_meta is not None assert prefill_meta.prefill_wrapper is not None - prefill_output = prefill_meta.prefill_wrapper.forward( + + assert prefill_meta.prefill_wrapper._causal + assert prefill_meta.prefill_wrapper._window_left == window_left + assert prefill_meta.prefill_wrapper._logits_soft_cap == ( + logits_soft_cap or 0.0) + assert prefill_meta.prefill_wrapper._sm_scale == softmax_scale + + prefill_output = prefill_meta.prefill_wrapper.run( query, kv_cache, - logits_soft_cap=logits_soft_cap, - causal=True, - k_scale=layer._k_scale, - v_scale=layer._v_scale, - window_left=window_left) + k_scale=layer._k_scale_float, + v_scale=layer._v_scale_float, + ) if decode_meta := attn_metadata.decode_metadata: assert decode_meta is not None assert decode_meta.decode_wrapper is not None - decode_output = decode_meta.decode_wrapper.forward( + + assert decode_meta.decode_wrapper._window_left == window_left + assert decode_meta.decode_wrapper._logits_soft_cap == ( + logits_soft_cap or 0.0) + assert decode_meta.decode_wrapper._sm_scale == softmax_scale + + decode_output = decode_meta.decode_wrapper.run( decode_query, kv_cache, - sm_scale=softmax_scale, - logits_soft_cap=logits_soft_cap, - k_scale=layer._k_scale, - v_scale=layer._v_scale, - window_left=window_left) + k_scale=layer._k_scale_float, + v_scale=layer._v_scale_float, + ) if prefill_output is None and decode_output is not None: # Decode only batch. diff --git a/vllm/attention/backends/ipex_attn.py b/vllm/attention/backends/ipex_attn.py index f267796ed456..c060d2f6772b 100644 --- a/vllm/attention/backends/ipex_attn.py +++ b/vllm/attention/backends/ipex_attn.py @@ -194,7 +194,7 @@ def forward( Returns: shape = [num_tokens, num_heads * head_size] """ - assert layer._k_scale == 1.0 and layer._v_scale == 1.0 + assert layer._k_scale_float == 1.0 and layer._v_scale_float == 1.0 num_tokens, hidden_size = query.shape # Reshape the query, key, and value tensors. 
query = query.view(-1, self.num_heads, self.head_size) diff --git a/vllm/attention/backends/pallas.py b/vllm/attention/backends/pallas.py index 8aa2a3667e56..1c902d394464 100644 --- a/vllm/attention/backends/pallas.py +++ b/vllm/attention/backends/pallas.py @@ -174,7 +174,7 @@ def forward( Returns: shape = [batch_size, seq_len, num_heads * head_size] """ - assert layer._k_scale == 1.0 and layer._v_scale == 1.0 + assert layer._k_scale_float == 1.0 and layer._v_scale_float == 1.0 batch_size, seq_len, hidden_size = query.shape query = query.view(batch_size, seq_len, self.num_heads, self.head_size) key = key.view(batch_size, seq_len, self.num_kv_heads, self.head_size) diff --git a/vllm/attention/backends/placeholder_attn.py b/vllm/attention/backends/placeholder_attn.py index d2dc0d6cf0a5..826311896d1d 100644 --- a/vllm/attention/backends/placeholder_attn.py +++ b/vllm/attention/backends/placeholder_attn.py @@ -255,6 +255,11 @@ class PlaceholderAttentionMetadataBuilder( AttentionMetadataBuilder[PlaceholderAttentionMetadata]): def __init__(self, input_builder: "ModelInputForGPUBuilder"): + + self.input_builder = input_builder + self.runner = input_builder.runner + + def prepare(self): self.prefill_seq_lens: List[int] = [] self.context_lens: List[int] = [] self.curr_seq_lens: List[int] = [] @@ -265,9 +270,6 @@ def __init__(self, input_builder: "ModelInputForGPUBuilder"): self.num_prefill_tokens = 0 self.num_decode_tokens = 0 - self.input_builder = input_builder - self.runner = input_builder.runner - def _add_seq_group( self, inter_data: "ModelInputForGPUBuilder.InterDataForSeqGroup", chunked_prefill_enabled: bool): diff --git a/vllm/attention/backends/torch_sdpa.py b/vllm/attention/backends/torch_sdpa.py index 972cae99284a..4ae89617d4a5 100644 --- a/vllm/attention/backends/torch_sdpa.py +++ b/vllm/attention/backends/torch_sdpa.py @@ -282,7 +282,10 @@ class TorchSDPAMetadataBuilder(AttentionMetadataBuilder[TorchSDPAMetadata]): def __init__(self, input_builder: ModelInputForCPUBuilder) -> None: self.chunked_prefill = input_builder.chunked_prefill - self.input_data = input_builder.input_data + self.input_builder = input_builder + + def prepare(self): + self.input_data = self.input_builder.input_data def build(self, seq_lens: List[int], query_lens: List[int], cuda_graph_pad_size: int, batch_size: int) -> TorchSDPAMetadata: @@ -453,7 +456,6 @@ def forward( Returns: shape = [num_tokens, num_heads * head_size] """ - assert layer._k_scale == 1.0 and layer._v_scale == 1.0 attn_type = self.attn_type if (attn_type == AttentionType.ENCODER and (not attn_metadata.is_all_encoder_attn_metadata_set)): diff --git a/vllm/attention/backends/utils.py b/vllm/attention/backends/utils.py index d7d4a5166975..bee50f38df4c 100644 --- a/vllm/attention/backends/utils.py +++ b/vllm/attention/backends/utils.py @@ -123,6 +123,13 @@ class CommonMetadataBuilder(AttentionMetadataBuilder[TAttentionMetadata]): _metadata_cls: Type[TAttentionMetadata] def __init__(self, input_builder: "ModelInputForGPUBuilder"): + self.input_builder = input_builder + self.runner = input_builder.runner + + self.sliding_window = input_builder.sliding_window + self.block_size = input_builder.block_size + + def prepare(self): self.slot_mapping: List[int] = [] self.prefill_seq_lens: List[int] = [] self.context_lens: List[int] = [] @@ -135,12 +142,6 @@ def __init__(self, input_builder: "ModelInputForGPUBuilder"): self.num_prefill_tokens = 0 self.num_decode_tokens = 0 - self.input_builder = input_builder - self.runner = input_builder.runner - - 
self.sliding_window = input_builder.sliding_window - self.block_size = input_builder.block_size - def _add_seq_group( self, inter_data: "ModelInputForGPUBuilder.InterDataForSeqGroup", chunked_prefill_enabled: bool): diff --git a/vllm/attention/backends/xformers.py b/vllm/attention/backends/xformers.py index ac7c5e25d43a..4cd43947f211 100644 --- a/vllm/attention/backends/xformers.py +++ b/vllm/attention/backends/xformers.py @@ -199,6 +199,8 @@ def prefill_metadata(self) -> Optional["XFormersMetadata"]: # Compute some attn_metadata fields which default to None query_start_loc = (None if self.query_start_loc is None else self.query_start_loc[:self.num_prefills + 1]) + seq_start_loc = (None if self.seq_start_loc is None else + self.seq_start_loc[:self.num_prefills + 1]) slot_mapping = (None if self.slot_mapping is None else self.slot_mapping[:self.num_prefill_tokens]) seq_lens = (None if self.seq_lens is None else @@ -225,6 +227,7 @@ def prefill_metadata(self) -> Optional["XFormersMetadata"]: max_prefill_seq_len=self.max_prefill_seq_len, max_decode_seq_len=0, query_start_loc=query_start_loc, + seq_start_loc=seq_start_loc, context_lens_tensor=context_lens_tensor, block_tables=block_tables, use_cuda_graph=False, diff --git a/vllm/attention/layer.py b/vllm/attention/layer.py index bbff9d35ed84..c24d8657964d 100644 --- a/vllm/attention/layer.py +++ b/vllm/attention/layer.py @@ -78,6 +78,12 @@ def __init__( self._v_scale = torch.tensor(1.0, dtype=torch.float32) self._q_scale = torch.tensor(1.0, dtype=torch.float32) self._prob_scale = torch.tensor(1.0, dtype=torch.float32) + + # We also keep the float32 versions of k/v_scale for attention + # backends that don't support tensors (Flashinfer) + self._k_scale_float = torch.tensor(1.0, dtype=torch.float32) + self._v_scale_float = torch.tensor(1.0, dtype=torch.float32) + quant_method = quant_config.get_quant_method( self, prefix=prefix) if quant_config else None if quant_method is not None: @@ -111,11 +117,11 @@ def __init__( self.backend = backend_name_to_enum(attn_backend.get_name()) self.dtype = dtype - # For cuda and cpu platforms, we control how + # For cuda-alike (CUDA and ROCM) and cpu platforms, we control how # torch.compile works by registering the attention as one giant # opaque custom op. For other platforms, we directly call them # and let torch.compile handle them. 
- self.use_direct_call = not current_platform.is_cuda( + self.use_direct_call = not current_platform.is_cuda_alike( ) and not current_platform.is_cpu() self.use_output = attn_backend.accept_output_buffer @@ -180,6 +186,8 @@ def calc_kv_scales(self, query, key, value): self._q_scale.copy_(torch.abs(query).max() / self.q_range) self._k_scale.copy_(torch.abs(key).max() / self.k_range) self._v_scale.copy_(torch.abs(value).max() / self.v_range) + self._k_scale_float = self._k_scale.item() + self._v_scale_float = self._v_scale.item() # We only calculate the scales once self.calculate_kv_scales = False @@ -208,6 +216,9 @@ def __init__( self.scale = scale self.num_kv_heads = num_heads if num_kv_heads is None else num_kv_heads + assert self.num_heads % self.num_kv_heads == 0 + self.num_queries_per_kv = self.num_heads // self.num_kv_heads + dtype = torch.get_default_dtype() attn_backend = get_attn_backend(head_size, dtype, @@ -219,7 +230,8 @@ def __init__( backend = _Backend.XFORMERS self.attn_backend = backend if backend in { - _Backend.TORCH_SDPA, _Backend.XFORMERS + _Backend.TORCH_SDPA, + _Backend.XFORMERS, } else _Backend.TORCH_SDPA def forward( @@ -229,7 +241,7 @@ def forward( value: torch.Tensor, ) -> torch.Tensor: """Input shape: batch_size x seq_len x hidden_size""" - # TODO(Isotr0py): Use existing backend implementations and support FA2 + # TODO(Isotr0py): Use existing backend implementations and support FA3 bsz, q_len, _ = query.size() kv_len = key.size(1) @@ -237,6 +249,11 @@ def forward( key = key.view(bsz, kv_len, self.num_kv_heads, self.head_size) value = value.view(bsz, kv_len, self.num_kv_heads, self.head_size) + if (num_repeat := self.num_queries_per_kv) > 1: + # Handle MQA and GQA + key = torch.repeat_interleave(key, num_repeat, dim=2) + value = torch.repeat_interleave(value, num_repeat, dim=2) + if self.attn_backend == _Backend.XFORMERS: from xformers import ops as xops diff --git a/vllm/attention/ops/ipex_attn.py b/vllm/attention/ops/ipex_attn.py index cbc6c74acf09..3a07184ed31f 100644 --- a/vllm/attention/ops/ipex_attn.py +++ b/vllm/attention/ops/ipex_attn.py @@ -52,8 +52,8 @@ def write_to_paged_cache( value_cache: torch.Tensor, slot_mapping: torch.Tensor, kv_cache_dtype: str, - k_scale: float, - v_scale: float, + k_scale: torch.Tensor, + v_scale: torch.Tensor, *args, ) -> None: ops.reshape_and_cache( @@ -80,8 +80,8 @@ def forward_decode( num_kv_heads: int, scale: float, alibi_slopes: Optional[torch.Tensor], - k_scale: float, - v_scale: float, + k_scale: torch.Tensor, + v_scale: torch.Tensor, *args, ) -> None: tp_rank: int = 0 @@ -149,8 +149,8 @@ def write_to_paged_cache( value_cache: torch.Tensor, slot_mapping: torch.Tensor, kv_cache_dtype: str, - k_scale: float, - v_scale: float, + k_scale: torch.Tensor, + v_scale: torch.Tensor, *args, ) -> None: ipex_modules.PagedAttention.reshape_and_cache( @@ -170,8 +170,8 @@ def forward_decode( num_kv_heads: int, scale: float, alibi_slopes: Optional[torch.Tensor], - k_scale: float, - v_scale: float, + k_scale: torch.Tensor, + v_scale: torch.Tensor, *args, ) -> None: block_size = value_cache.shape[2] diff --git a/vllm/attention/ops/paged_attn.py b/vllm/attention/ops/paged_attn.py index 65e6aaedfb73..58ed82f3ba92 100644 --- a/vllm/attention/ops/paged_attn.py +++ b/vllm/attention/ops/paged_attn.py @@ -206,8 +206,8 @@ def forward_prefix( max_query_len: int, alibi_slopes: Optional[torch.Tensor], sliding_window: Optional[int], - k_scale: float, - v_scale: float, + k_scale: torch.Tensor, + v_scale: torch.Tensor, ) -> torch.Tensor: output = 
torch.empty_like(query) context_attention_fwd( diff --git a/vllm/attention/ops/prefix_prefill.py b/vllm/attention/ops/prefix_prefill.py index e8d6854c043a..77f92999232c 100644 --- a/vllm/attention/ops/prefix_prefill.py +++ b/vllm/attention/ops/prefix_prefill.py @@ -564,7 +564,7 @@ def _fwd_kernel_alibi( other=0.0) # [D,N] if k_load.dtype.is_fp8(): - k = (k_load.to(tl.float32) * k_scale).to(q.dtype) + k = (k_load.to(tl.float32) * tl.load(k_scale)).to(q.dtype) else: k = k_load @@ -604,7 +604,7 @@ def _fwd_kernel_alibi( ((start_n + offs_n[:, None]) < cur_batch_ctx_len), other=0.0) if v_load.dtype.is_fp8(): - v = (v_load.to(tl.float32) * v_scale).to(q.dtype) + v = (v_load.to(tl.float32) * tl.load(v_scale)).to(q.dtype) else: v = v_load p = p.to(v.dtype) @@ -713,8 +713,8 @@ def context_attention_fwd(q, b_seq_len, b_ctx_len, max_input_len, - k_scale: float = 1.0, - v_scale: float = 1.0, + k_scale: torch.Tensor, + v_scale: torch.Tensor, alibi_slopes=None, sliding_window=None): diff --git a/vllm/compilation/backends.py b/vllm/compilation/backends.py index 955c25f30051..7f4f97466d50 100644 --- a/vllm/compilation/backends.py +++ b/vllm/compilation/backends.py @@ -524,6 +524,7 @@ def configure_post_pass(self): def __call__(self, graph: fx.GraphModule, example_inputs) -> Callable: + vllm_config = self.vllm_config if not self.compilation_config.cache_dir: # no provided cache dir, generate one based on the known factors # that affects the compilation. if none of the factors change, @@ -532,7 +533,6 @@ def __call__(self, graph: fx.GraphModule, example_inputs) -> Callable: # 1. factors come from the vllm_config (it mainly summarizes how the # model is created) - vllm_config = self.vllm_config config_hash = vllm_config.compute_hash() # 2. factors come from the code files that are traced by Dynamo ( @@ -556,20 +556,26 @@ def __call__(self, graph: fx.GraphModule, example_inputs) -> Callable: hash_key = hashlib.md5( f"{config_hash}_{code_hash}".encode()).hexdigest()[:10] cache_dir = os.path.join( - envs.VLLM_CACHE_ROOT, "torch_compile_cache", hash_key, - f"rank_{vllm_config.parallel_config.rank}") - else: - cache_dir = self.compilation_config.cache_dir + envs.VLLM_CACHE_ROOT, + "torch_compile_cache", + hash_key, + ) + self.compilation_config.cache_dir = cache_dir + + cache_dir = self.compilation_config.cache_dir os.makedirs(cache_dir, exist_ok=True) + local_cache_dir = os.path.join( + cache_dir, f"rank_{vllm_config.parallel_config.rank}") + self.compilation_config.local_cache_dir = local_cache_dir disabled = envs.VLLM_DISABLE_COMPILE_CACHE self.inductor_hash_cache: InductorHashCache = InductorHashCache( - cache_dir, disabled=disabled) + local_cache_dir, disabled=disabled) if disabled: logger.info("vLLM's torch.compile cache is disabled.") else: logger.info("Using cache directory: %s for vLLM's torch.compile", - cache_dir) + local_cache_dir) # when dynamo calls the backend, it means the bytecode # transform and analysis are done @@ -609,6 +615,18 @@ def __call__(self, graph: fx.GraphModule, example_inputs) -> Callable: self.vllm_config, self.graph_pool, self).run(*example_inputs) + graph_path = os.path.join(local_cache_dir, "computation_graph.py") + if not os.path.exists(graph_path): + # code adapted from https://github.com/thuml/depyf/blob/dab831108a752d1facc00acdd6d4243891845c37/depyf/explain/patched_lazy_format_graph_code.py#L30 # noqa + # use `print_readable` because it can include submodules + src = "from __future__ import annotations\nimport torch\n" + \ + 
self.split_gm.print_readable(print_output=False) + src = src.replace("<lambda>", "GraphModule") + with open(graph_path, "w") as f: + f.write(src) + + logger.debug("Computation graph saved to %s", graph_path) + self._called = True if not self.compilation_config.use_cudagraph or \ @@ -662,7 +680,7 @@ def copy_and_call(*args): class ConcreteSizeEntry: runtime_shape: int need_to_compile: bool # the size is in compile_sizes - use_cudagraph: bool # the size is in capture_sizes + use_cudagraph: bool # the size is in cudagraph_capture_sizes compiled: bool = False runnable: Callable = None # type: ignore @@ -709,8 +727,8 @@ def __init__(self, graph: fx.GraphModule, vllm_config: VllmConfig, self.compile_sizes: Set[int] = set( self.compilation_config.compile_sizes) - self.capture_sizes: Set[int] = set( - self.compilation_config.capture_sizes + self.cudagraph_capture_sizes: Set[int] = set( + self.compilation_config.cudagraph_capture_sizes ) if self.compilation_config.use_cudagraph else set() self.first_run_finished = False @@ -728,11 +746,11 @@ def __init__(self, graph: fx.GraphModule, vllm_config: VllmConfig, # to_be_compiled_sizes tracks the remaining sizes to compile, # and updates during the compilation process, so we need to copy it self.to_be_compiled_sizes: Set[int] = self.compile_sizes.copy() - for shape in self.compile_sizes.union(self.capture_sizes): + for shape in self.compile_sizes.union(self.cudagraph_capture_sizes): self.concrete_size_entries[shape] = ConcreteSizeEntry( runtime_shape=shape, need_to_compile=shape in self.compile_sizes, - use_cudagraph=shape in self.capture_sizes, + use_cudagraph=shape in self.cudagraph_capture_sizes, ) def check_for_ending_compilation(self): diff --git a/vllm/compilation/decorators.py b/vllm/compilation/decorators.py index 38f284794b8d..17eb0592ced6 100644 --- a/vllm/compilation/decorators.py +++ b/vllm/compilation/decorators.py @@ -198,6 +198,8 @@ def __call__(self, *args, **kwargs): f" {dims} for argument {k} with type {type(arg)}.") # here, it is the starting point of the `torch.compile` process start_monitoring_torch_compile(self.vllm_config) + logger.debug("Start compiling function %s", + self.original_code_object) # if we don't use custom dispatcher, we can directly call the # compiled function and let torch.compile handle the dispatching, diff --git a/vllm/compilation/wrapper.py b/vllm/compilation/wrapper.py index e3260a10c02a..58a8fa76f6ce 100644 --- a/vllm/compilation/wrapper.py +++ b/vllm/compilation/wrapper.py @@ -9,6 +9,9 @@ import vllm.envs as envs from vllm.config import CompilationLevel, get_current_vllm_config +from vllm.logger import init_logger + +logger = init_logger(__name__) class TorchCompileWrapperWithCustomDispatcher: @@ -82,6 +85,25 @@ def bytecode_hook(self, old_code: CodeType, new_code: CodeType): return self.compiled_codes.append(new_code) + local_cache_dir = self.vllm_config.compilation_config.local_cache_dir + if isinstance(local_cache_dir, str): + decompiled_file = os.path.join(local_cache_dir, + "transformed_code.py") + if not os.path.exists(decompiled_file): + try: + # The decompilation will usually succeed, since we + # guarantee a full-graph compilation in Dynamo, + # but there's no 100% guarantee, as decompilation is + # not a reversible process.
+ import depyf + src = depyf.decompile(new_code) + with open(decompiled_file, "w") as f: + f.write(src) + + logger.debug("Dynamo transformed code saved to %s", + decompiled_file) + except Exception: + pass if self.vllm_config.compilation_config.use_cudagraph and \ "update" in new_code.co_names: diff --git a/vllm/config.py b/vllm/config.py index cbe79c337565..7f05532d7219 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -190,39 +190,42 @@ def compute_hash(self) -> str: factors.append(self.rope_theta) return hashlib.sha256(str(factors).encode()).hexdigest() - def __init__(self, - model: str, - task: Union[TaskOption, Literal["draft"]], - tokenizer: str, - tokenizer_mode: str, - trust_remote_code: bool, - dtype: Union[str, torch.dtype], - seed: int, - allowed_local_media_path: str = "", - revision: Optional[str] = None, - code_revision: Optional[str] = None, - rope_scaling: Optional[Dict[str, Any]] = None, - rope_theta: Optional[float] = None, - tokenizer_revision: Optional[str] = None, - max_model_len: Optional[int] = None, - spec_target_max_model_len: Optional[int] = None, - quantization: Optional[str] = None, - enforce_eager: Optional[bool] = None, - max_seq_len_to_capture: Optional[int] = None, - max_logprobs: int = 20, - disable_sliding_window: bool = False, - skip_tokenizer_init: bool = False, - served_model_name: Optional[Union[str, List[str]]] = None, - limit_mm_per_prompt: Optional[Mapping[str, int]] = None, - use_async_output_proc: bool = True, - config_format: ConfigFormat = ConfigFormat.AUTO, - hf_overrides: Optional[HfOverrides] = None, - mm_processor_kwargs: Optional[Dict[str, Any]] = None, - disable_mm_preprocessor_cache: bool = False, - override_neuron_config: Optional[Dict[str, Any]] = None, - override_pooler_config: Optional["PoolerConfig"] = None, - logits_processor_pattern: Optional[str] = None, - generation_config: Optional[str] = None) -> None: + def __init__( + self, + model: str, + task: Union[TaskOption, Literal["draft"]], + tokenizer: str, + tokenizer_mode: str, + trust_remote_code: bool, + dtype: Union[str, torch.dtype], + seed: int, + allowed_local_media_path: str = "", + revision: Optional[str] = None, + code_revision: Optional[str] = None, + rope_scaling: Optional[Dict[str, Any]] = None, + rope_theta: Optional[float] = None, + tokenizer_revision: Optional[str] = None, + max_model_len: Optional[int] = None, + spec_target_max_model_len: Optional[int] = None, + quantization: Optional[str] = None, + enforce_eager: Optional[bool] = None, + max_seq_len_to_capture: Optional[int] = None, + max_logprobs: int = 20, + disable_sliding_window: bool = False, + skip_tokenizer_init: bool = False, + served_model_name: Optional[Union[str, List[str]]] = None, + limit_mm_per_prompt: Optional[Mapping[str, int]] = None, + use_async_output_proc: bool = True, + config_format: ConfigFormat = ConfigFormat.AUTO, + hf_overrides: Optional[HfOverrides] = None, + mm_processor_kwargs: Optional[Dict[str, Any]] = None, + disable_mm_preprocessor_cache: bool = False, + override_neuron_config: Optional[Dict[str, Any]] = None, + override_pooler_config: Optional["PoolerConfig"] = None, + logits_processor_pattern: Optional[str] = None, + generation_config: Optional[str] = None, + enable_sleep_mode: bool = False, + ) -> None: self.model = model self.tokenizer = tokenizer self.tokenizer_mode = tokenizer_mode @@ -270,6 +273,12 @@ def __init__(self, self.max_logprobs = max_logprobs self.disable_sliding_window = disable_sliding_window self.skip_tokenizer_init = skip_tokenizer_init + 
self.enable_sleep_mode = enable_sleep_mode + + from vllm.platforms import current_platform + + if self.enable_sleep_mode and not current_platform.is_cuda(): + raise ValueError("Sleep mode is only supported on CUDA devices.") hf_config = get_config(self.model, trust_remote_code, revision, code_revision, config_format) @@ -302,14 +311,15 @@ def __init__(self, (self.hf_text_config.model_type in ["gemma2", "cohere2"])) if (not self.disable_sliding_window and has_interleaved_attention): - if envs.VLLM_ATTENTION_BACKEND == "XFORMERS": + if (backend := + envs.VLLM_ATTENTION_BACKEND) in ("XFORMERS", "FLASHINFER"): sliding_window_len_min = get_min_sliding_window( self.hf_text_config.sliding_window) logger.warning_once( f"{self.hf_text_config.model_type} has interleaved " "attention, which is currently not supported by the " - "XFORMERS backend. Disabling sliding window and capping " + f"{backend} backend. Disabling sliding window and capping " "the max length to the sliding window size " f"({sliding_window_len_min}).") self.disable_sliding_window = True @@ -341,7 +351,6 @@ def __init__(self, self.is_hybrid = self._init_is_hybrid() self.has_inner_state = self._init_has_inner_state() - from vllm.platforms import current_platform if current_platform.is_neuron(): self.override_neuron_config = override_neuron_config else: @@ -602,7 +611,7 @@ def _verify_cuda_graph(self) -> None: self.max_seq_len_to_capture = min(self.max_seq_len_to_capture, self.max_model_len) - MODEL_NOT_SUPPORT_CUDA_GRAPH = ['deepseek_v3', 'mllama'] + MODEL_NOT_SUPPORT_CUDA_GRAPH = ['mllama'] if (self.hf_config.model_type in MODEL_NOT_SUPPORT_CUDA_GRAPH and not self.enforce_eager): logger.warning( @@ -905,12 +914,18 @@ def get_diff_sampling_param(self) -> Dict[str, Any]: "top_k", "top_p", "min_p", + "max_new_tokens", ] if any(p in config for p in available_params): diff_sampling_param = { p: config.get(p) for p in available_params if config.get(p) is not None } + # Huggingface definition of max_new_tokens is equivalent + # to vLLM's max_tokens + if "max_new_tokens" in diff_sampling_param: + diff_sampling_param["max_tokens"] = diff_sampling_param.pop( + "max_new_tokens") else: diff_sampling_param = {} return diff_sampling_param @@ -1222,9 +1237,6 @@ class ParallelConfig: pipeline_parallel_size: int = 1 # Number of pipeline parallel groups. tensor_parallel_size: int = 1 # Number of tensor parallel groups. - # Deprecated, use distributed_executor_backend instead. - worker_use_ray: Optional[bool] = None - # Maximum number of multiple batches # when load model sequentially. To avoid RAM OOM when using tensor # parallel and large models. @@ -1278,14 +1290,7 @@ def __post_init__(self) -> None: self.world_size = self.pipeline_parallel_size * \ self.tensor_parallel_size - if self.worker_use_ray: - if self.distributed_executor_backend is None: - self.distributed_executor_backend = "ray" - elif not self.use_ray: - raise ValueError(f"worker-use-ray can't be used with " - f"distributed executor backend " - f"'{self.distributed_executor_backend}'.") - ray_only_devices = ["tpu", "hpu"] + ray_only_devices = ["tpu"] from vllm.platforms import current_platform if (current_platform.device_type in ray_only_devices and self.world_size > 1): @@ -2727,10 +2732,11 @@ class CompilationConfig(BaseModel): - use_inductor: whether to use inductor compilation. - False: inductor compilation is not used. graph runs in eager. - True: inductor compilation is used. one graph for symbolic shape - is compiled. 
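Further down in this config.py chunk, get_diff_sampling_param() starts honoring max_new_tokens from a model's Hugging Face generation config and renames it to vLLM's max_tokens. A rough sketch of that mapping over a hypothetical generation config (a plain dict here, not the real code path; the parameter list is abbreviated):

hf_generation_config = {"top_p": 0.9, "max_new_tokens": 256}

available_params = ["top_k", "top_p", "min_p", "max_new_tokens"]  # abbreviated
diff_sampling_param = {
    p: hf_generation_config.get(p)
    for p in available_params
    if hf_generation_config.get(p) is not None
}
# Hugging Face's max_new_tokens is vLLM's max_tokens:
if "max_new_tokens" in diff_sampling_param:
    diff_sampling_param["max_tokens"] = diff_sampling_param.pop("max_new_tokens")
print(diff_sampling_param)  # {'top_p': 0.9, 'max_tokens': 256}

As the --generation-config help text later in this patch notes, a max_new_tokens picked up this way acts as a server-wide cap on output length for all requests.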
In addition, compile for cudagraph sizes that are - in candidate_compile_sizes, using configurations - in inductor_compile_config. - - candidate_compile_sizes: sizes to compile for inductor. + is compiled. In addition, compile for compile_sizes, + using configurations in inductor_compile_config. + - compile_sizes: sizes to compile for inductor. In addition + to integers, it also supports "cudagraph_capture_sizes" to + specify the sizes for cudagraph capture. - inductor_compile_config: additional configurations for inductor. - None: use default configurations. - inductor_passes: additional passes for inductor. It is a dictionary @@ -2758,7 +2764,7 @@ class CompilationConfig(BaseModel): splitting_ops: List[str] = Field(default=None) # type: ignore use_inductor: bool = True - candidate_compile_sizes: Optional[List[int]] = Field(default=None) + compile_sizes: Optional[List[Union[int, str]]] = Field(default=None) inductor_compile_config: Dict = Field(default_factory=dict) inductor_passes: Dict[str, str] = Field(default_factory=dict) @@ -2806,9 +2812,8 @@ def model_post_init(self, __context: Any) -> None: pass_config: PassConfig = Field(default_factory=PassConfig) # not configurable, computed after init - compile_sizes: List[int] = PrivateAttr - capture_sizes: List[int] = PrivateAttr max_capture_size: int = PrivateAttr + local_cache_dir: str = PrivateAttr # local cache dir for each rank # optimization: # Intuitively, bs_to_padded_graph_size should be Dict[int, int]. # since we know all keys are in a range [0, max_capture_size], @@ -2933,43 +2938,47 @@ def init_backend(self, vllm_config: "VllmConfig") -> Union[str, Callable]: from vllm.compilation.backends import VllmBackend return VllmBackend(vllm_config) - def init_with_cudagraph_sizes(self, sizes_to_specialize: List[int]): + def init_with_cudagraph_sizes(self, + cudagraph_capture_sizes: List[int]) -> None: """To complete the initialization of config, we need to know the cudagraph sizes.""" if self.cudagraph_capture_sizes is None: - self.capture_sizes = sizes_to_specialize + self.cudagraph_capture_sizes = cudagraph_capture_sizes else: - self.capture_sizes = self.cudagraph_capture_sizes + # de-duplicate the sizes provided by the config + self.cudagraph_capture_sizes = list( + set(self.cudagraph_capture_sizes)) logger.info(("cudagraph sizes specified by model runner" " %s is overridden by config %s"), - sizes_to_specialize, self.cudagraph_capture_sizes) - - if self.candidate_compile_sizes is None: - self.candidate_compile_sizes = [] - self.compile_sizes = [ - x for x in self.candidate_compile_sizes if x in self.capture_sizes - ] - ignored_sizes = [ - x for x in self.candidate_compile_sizes - if x not in self.capture_sizes - ] - if ignored_sizes: - logger.warning(("candidate_compile_sizes %s are ignored " - "because they are not cudagraph capture sizes."), - ignored_sizes) + cudagraph_capture_sizes, self.cudagraph_capture_sizes) + + computed_compile_sizes = [] + if self.compile_sizes is not None: + # de-duplicate the sizes provided by the config + self.compile_sizes = list(set(self.compile_sizes)) + for x in self.compile_sizes: + if isinstance(x, str): + assert x == "cudagraph_capture_sizes", \ + "Unrecognized size type in compile_sizes, " \ + f"expect 'cudagraph_capture_sizes', got {x}" + computed_compile_sizes.extend(self.cudagraph_capture_sizes) + else: + assert isinstance(x, int) + computed_compile_sizes.append(x) + self.compile_sizes = computed_compile_sizes # type: ignore # sort to make sure cudagraph capture sizes are in descending order - 
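With the rename from candidate_compile_sizes, compile_sizes can now mix integers with the literal string "cudagraph_capture_sizes", which init_with_cudagraph_sizes() above expands into the concrete capture sizes. A standalone sketch of that expansion (illustrative helper, not the vLLM code):

def expand_compile_sizes(compile_sizes, cudagraph_capture_sizes):
    expanded = []
    for x in set(compile_sizes):  # the config de-duplicates first
        if x == "cudagraph_capture_sizes":
            expanded.extend(cudagraph_capture_sizes)
        else:
            expanded.append(int(x))
    return expanded

# Compile a hand-picked size plus every cudagraph capture size
# (order may vary because of the set()):
print(expand_compile_sizes([24, "cudagraph_capture_sizes"], [1, 2, 4, 8]))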
self.capture_sizes.sort(reverse=True) - self.max_capture_size = self.capture_sizes[ - 0] if self.capture_sizes else 0 + self.cudagraph_capture_sizes.sort(reverse=True) + self.max_capture_size = self.cudagraph_capture_sizes[ + 0] if self.cudagraph_capture_sizes else 0 # pre-compute the mapping from batch size to padded graph size self.bs_to_padded_graph_size = [ 0 for i in range(self.max_capture_size + 1) ] - for end, start in zip(self.capture_sizes, - self.capture_sizes[1:] + [0]): + for end, start in zip(self.cudagraph_capture_sizes, + self.cudagraph_capture_sizes[1:] + [0]): for bs in range(start, end): if bs == start: self.bs_to_padded_graph_size[bs] = start @@ -3240,14 +3249,14 @@ def _set_cudagraph_sizes(self): However, if users specify the cudagraph capture sizes through compilation config, we will use the specified sizes instead. - In the end, `vllm_config.compilation_config.capture_sizes` will be the - final sizes to capture cudagraph (in descending order). + In the end, `vllm_config.compilation_config.cudagraph_capture_sizes` + will be the final sizes to capture cudagraph (in descending order). During runtime, if batchsize is larger than - `vllm_config.compilation_config.capture_sizes`, + `vllm_config.compilation_config.cudagraph_capture_sizes`, no cudagraph will be used. If the batch size is no larger than - `vllm_config.compilation_config.capture_sizes`, + `vllm_config.compilation_config.cudagraph_capture_sizes`, we can quickly find the padded graph size for a given batch size by looking up `vllm_config.compilation_config.bs_to_padded_graph_size`. """ @@ -3329,7 +3338,7 @@ def __str__(self): @contextmanager -def set_current_vllm_config(vllm_config: VllmConfig): +def set_current_vllm_config(vllm_config: VllmConfig, check_compile=False): """ Temporarily set the current VLLM config. Used during model initialization. @@ -3349,7 +3358,8 @@ def set_current_vllm_config(vllm_config: VllmConfig): vllm_config.compilation_config.enabled_custom_ops) logger.debug("disabled custom ops: %s", vllm_config.compilation_config.disabled_custom_ops) - if vllm_config.compilation_config.level == CompilationLevel.PIECEWISE \ + if check_compile and \ + vllm_config.compilation_config.level == CompilationLevel.PIECEWISE \ and compilation_counter.num_models_seen == num_models_seen: # If the model supports compilation, # compilation_counter.num_models_seen should be increased diff --git a/vllm/core/block/cpu_gpu_block_allocator.py b/vllm/core/block/cpu_gpu_block_allocator.py index 3a57487a6cd8..c3e1665b4464 100644 --- a/vllm/core/block/cpu_gpu_block_allocator.py +++ b/vllm/core/block/cpu_gpu_block_allocator.py @@ -339,6 +339,13 @@ def get_prefix_cache_hit_rate(self, device: Device) -> float: assert device in self._allocators return self._allocators[device].get_prefix_cache_hit_rate() + def reset_prefix_cache(self) -> bool: + """Reset prefix cache for all devices.""" + success = True + for allocator in self._allocators.values(): + success = success and allocator.reset_prefix_cache() + return success + def get_and_reset_swaps(self) -> List[Tuple[int, int]]: """Returns and clears the mapping of source to destination block IDs. Will be called after every swapping operations for now, and after every diff --git a/vllm/core/block/interfaces.py b/vllm/core/block/interfaces.py index 985a1098b6cd..cb432db919c7 100644 --- a/vllm/core/block/interfaces.py +++ b/vllm/core/block/interfaces.py @@ -192,6 +192,11 @@ def get_prefix_cache_hit_rate(self) -> float: """Prefix cache hit rate. 
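The loop above precomputes bs_to_padded_graph_size so a runtime batch size can be padded up to the nearest captured cudagraph size with a single list lookup. A small self-contained sketch of the same table construction (illustrative helper, not the vLLM code):

def build_padding_table(cudagraph_capture_sizes):
    sizes = sorted(cudagraph_capture_sizes, reverse=True)
    max_capture_size = sizes[0] if sizes else 0
    table = [0] * (max_capture_size + 1)
    for end, start in zip(sizes, sizes[1:] + [0]):
        for bs in range(start, end):
            table[bs] = start if bs == start else end
    table[max_capture_size] = max_capture_size
    return table

print(build_padding_table([1, 2, 4, 8]))
# [0, 1, 2, 4, 4, 8, 8, 8, 8]: batch size 3 is padded to the size-4 graph,
# sizes 5-8 run the size-8 graph, anything larger skips cudagraphs.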
-1 means not supported or disabled.""" pass + @abstractmethod + def reset_prefix_cache(self) -> bool: + """Reset prefix cache.""" + pass + class NoFreeBlocksError(ValueError): pass @@ -297,6 +302,11 @@ def get_prefix_cache_hit_rate(self, device: Device) -> float: """Prefix cache hit rate. -1 means not supported or disabled.""" pass + @abstractmethod + def reset_prefix_cache(self) -> bool: + """Reset prefix cache.""" + pass + @abstractmethod def find_cached_blocks_prefix( self, diff --git a/vllm/core/block/naive_block.py b/vllm/core/block/naive_block.py index 9b94918ab38e..c38ae2dd6761 100644 --- a/vllm/core/block/naive_block.py +++ b/vllm/core/block/naive_block.py @@ -1,5 +1,5 @@ from collections import deque -from typing import Deque, FrozenSet, Iterable, List, Optional, Tuple +from typing import Deque, FrozenSet, Iterable, List, Optional, Tuple, Union from vllm.core.block.common import (BlockPool, CopyOnWriteTracker, RefCounter, get_all_blocks_recursively) @@ -136,16 +136,18 @@ def _allocate_block_id(self) -> BlockId: self._refcounter.incr(block_id) return block_id - def _free_block_id(self, block: Block) -> None: - block_id = block.block_id + def _free_block_id(self, block: Union[Block, BlockId]) -> None: + if isinstance(block, Block): + block_id = block.block_id + block.block_id = None + else: + block_id = block assert block_id is not None refcount = self._refcounter.decr(block_id) if refcount == 0: self._free_block_indices.appendleft(block_id) - block.block_id = None - def free(self, block: Block, keep_block_object: bool = False) -> None: # Release the physical block id self._free_block_id(block) @@ -154,6 +156,9 @@ def free(self, block: Block, keep_block_object: bool = False) -> None: if not keep_block_object: self._block_pool.free_block(block) + def free_block_id(self, block_id: BlockId) -> None: + self._free_block_id(block_id) + def fork(self, last_block: Block) -> List[Block]: """Creates a new sequence of blocks that shares the same underlying memory as the original sequence. @@ -325,6 +330,10 @@ def swap_in(self, blocks: List[Block]) -> None: def get_prefix_cache_hit_rate(self) -> float: return -1 + def reset_prefix_cache(self) -> bool: + """No prefix cache for naive block allocator.""" + return True + def find_cached_blocks_prefix(self, block_hashes: List[int]) -> List[int]: # Not applicable for naive block allocator. return [] diff --git a/vllm/core/block/prefix_caching_block.py b/vllm/core/block/prefix_caching_block.py index 1238303234de..ccdc5daa9595 100644 --- a/vllm/core/block/prefix_caching_block.py +++ b/vllm/core/block/prefix_caching_block.py @@ -12,6 +12,7 @@ from vllm.core.block.naive_block import (BlockPool, NaiveBlock, NaiveBlockAllocator) from vllm.core.evictor import EvictionPolicy, Evictor, make_evictor +from vllm.logger import init_logger from vllm.sequence import Sequence PrefixHash = int @@ -21,6 +22,8 @@ # then we know this block hasn't been accessed yet. _DEFAULT_LAST_ACCESSED_TIME = -1 +logger = init_logger(__name__) + class BlockTracker: """Used to track the status of a block inside the prefix caching allocator @@ -105,7 +108,8 @@ def __init__( # Evitor used to maintain how we want to handle those computed blocks # if we find memory pressure is high. - self.evictor: Evictor = make_evictor(eviction_policy) + self.eviction_policy = eviction_policy + self.evictor: Evictor = make_evictor(self.eviction_policy) # We share the refcounter between allocators. 
This allows us to promote # blocks originally allocated in the hashless allocator to immutable @@ -428,6 +432,44 @@ def all_block_ids(self) -> FrozenSet[int]: def get_prefix_cache_hit_rate(self) -> float: return self.metric_data.get_hit_rate() + def reset_prefix_cache(self) -> bool: + """Reset prefix cache. This function may be used in RLHF + flows to invalid prefix caching after the weights are updated, + or used for resetting prefix caching status for benchmarking. + + Returns: + bool: True if the prefix cache is successfully reset, + False otherwise. + """ + num_used_blocks = (self.get_num_total_blocks() - + self.get_num_free_blocks()) + if num_used_blocks > 0: + logger.warning( + "Failed to reset prefix cache because some " + "blocks (%d) are not freed yet", num_used_blocks) + return False + + # Free all blocks in the evictor. + while (block_id := + self._maybe_allocate_evicted_block_id()) is not None: + self._hashless_allocator.free_block_id(block_id) + + # Should not have any cached blocks because all blocks are evicted. + assert not self._cached_blocks + + # Reset the evictor. + self.evictor = make_evictor(self.eviction_policy) + + # Reset the block tracker. + for block_id in self._block_tracker: + self._block_tracker[block_id] = BlockTracker() + + # Reset the metrics. + self.metric_data = CacheMetricData() + + logger.info("Successfully reset prefix cache") + return True + def is_block_cached(self, block: Block) -> bool: assert block.content_hash is not None return block.content_hash in self._cached_blocks diff --git a/vllm/core/block_manager.py b/vllm/core/block_manager.py index b41e84822188..62a5f0bda061 100644 --- a/vllm/core/block_manager.py +++ b/vllm/core/block_manager.py @@ -455,6 +455,9 @@ def get_num_free_cpu_blocks(self) -> int: def get_prefix_cache_hit_rate(self, device: Device) -> float: return self.block_allocator.get_prefix_cache_hit_rate(device) + def reset_prefix_cache(self) -> bool: + return self.block_allocator.reset_prefix_cache() + def _can_swap(self, seq_group: SequenceGroup, device: Device, diff --git a/vllm/core/interfaces.py b/vllm/core/interfaces.py index b10b8d3f4a5b..9c7e246e3c4e 100644 --- a/vllm/core/interfaces.py +++ b/vllm/core/interfaces.py @@ -122,6 +122,11 @@ def get_prefix_cache_hit_rate(self, device: Device) -> float: """Prefix cache hit rate. 
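reset_prefix_cache() is threaded here from the block allocators up through the block manager; later hunks in this patch expose it on the scheduler, both engines, LLM, and (behind VLLM_SERVER_DEV_MODE) a /reset_prefix_cache HTTP route. A usage sketch with an illustrative model name:

from vllm import LLM

llm = LLM(model="facebook/opt-125m", enable_prefix_caching=True)
llm.generate(["The capital of France is"])

# Drop every cached prefix block, e.g. after an RLHF weight update or before a
# benchmark run; returns False if some blocks are still held by running requests.
print(llm.reset_prefix_cache())

Against a running API server started with VLLM_SERVER_DEV_MODE=1, a POST to /reset_prefix_cache has the same effect.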
-1 means not supported or disabled.""" pass + @abstractmethod + def reset_prefix_cache(self) -> bool: + """Reset prefix cache for all devices.""" + pass + @abstractmethod def get_num_cached_tokens(self, seq: Sequence) -> int: pass diff --git a/vllm/core/placeholder_block_space_manager.py b/vllm/core/placeholder_block_space_manager.py index a47e59451853..f9924be4a383 100644 --- a/vllm/core/placeholder_block_space_manager.py +++ b/vllm/core/placeholder_block_space_manager.py @@ -90,5 +90,8 @@ def mark_blocks_as_computed(self, seq_group: SequenceGroup, def get_prefix_cache_hit_rate(self, device: Device) -> float: return -1 + def reset_prefix_cache(self) -> bool: + return True + def get_num_cached_tokens(self, seq: Sequence) -> int: return 0 diff --git a/vllm/core/scheduler.py b/vllm/core/scheduler.py index b3d396f9cedd..b1630b34947b 100644 --- a/vllm/core/scheduler.py +++ b/vllm/core/scheduler.py @@ -504,6 +504,9 @@ def has_unfinished_seqs(self) -> bool: def get_prefix_cache_hit_rate(self, device: Device) -> float: return self.block_manager.get_prefix_cache_hit_rate(device) + def reset_prefix_cache(self) -> bool: + return self.block_manager.reset_prefix_cache() + def get_num_unfinished_seq_groups(self) -> int: return len(self.waiting) + len(self.running) + len(self.swapped) diff --git a/vllm/device_allocator/__init__.py b/vllm/device_allocator/__init__.py new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/vllm/device_allocator/cumem.py b/vllm/device_allocator/cumem.py new file mode 100644 index 000000000000..a43418dbb3b4 --- /dev/null +++ b/vllm/device_allocator/cumem.py @@ -0,0 +1,254 @@ +# cumem-based pytorch pluggable allocator to implement sleep mode. +# other approaches tried but failed: +# - cuda-python package binding +# - custom libcuda driver ctypes wrapper +# both of them failed because of cuda context mismatch. +# not sure why, they are created from a different context. +# the only successful approach is to call cuda driver API in C. +import dataclasses +from contextlib import contextmanager +from typing import Callable, Dict, Optional, Tuple, Union + +import torch + +from vllm.utils import is_pin_memory_available + + +def find_loaded_library(lib_name) -> Optional[str]: + """ + According to https://man7.org/linux/man-pages/man5/proc_pid_maps.5.html, + the file `/proc/self/maps` contains the memory maps of the process, which includes the + shared libraries loaded by the process. We can use this file to find the path of + a loaded library.
+ """ # noqa + found_line = None + with open("/proc/self/maps") as f: + for line in f: + if lib_name in line: + found_line = line + break + if found_line is None: + # the library is not loaded in the current process + return None + # if lib_name is libcudart, we need to match a line with: + # address /path/to/libcudart-hash.so.11.0 + start = found_line.index("/") + path = found_line[start:].strip() + filename = path.split("/")[-1] + assert filename.rpartition(".so")[0].startswith(lib_name), \ + f"Unexpected filename: {filename} for library {lib_name}" + return path + + +cumem_available = False +try: + from vllm.cumem_allocator import (init_module, python_create_and_map, + python_unmap_and_release) + from vllm.distributed.device_communicators.cuda_wrapper import ( + CudaRTLibrary) + lib_name = find_loaded_library("cumem_allocator") + libcudart = CudaRTLibrary() + cumem_available = True +except ModuleNotFoundError: + # rocm platform does not support cumem allocator + init_module = None + python_create_and_map = None + python_unmap_and_release = None + CudaRTLibrary = None + lib_name = None + libcudart = None + +# py_device, py_alignedSize, py_d_mem, py_p_memHandle +HandleType = Tuple[int, int, int, int] + + +@dataclasses.dataclass +class AllocationData: + handle: HandleType + tag: str + cpu_backup_tensor: Optional[torch.Tensor] = None + + +def create_and_map(allocation_handle: HandleType) -> None: + python_create_and_map(*allocation_handle) + + +def unmap_and_release(allocation_handle: HandleType) -> None: + python_unmap_and_release(*allocation_handle) + + +def get_pluggable_allocator( + python_malloc_fn: Callable[[int], + int], python_free_func: Callable[[int, int], + None] +) -> torch.cuda.memory.CUDAPluggableAllocator: + init_module(python_malloc_fn, python_free_func) + new_alloc = torch.cuda.memory.CUDAPluggableAllocator( + lib_name, 'my_malloc', 'my_free') + return new_alloc + + +@contextmanager +def use_memory_pool_with_allocator( + python_malloc_fn: Callable[[int], int], + python_free_func: Callable[[int, int], None]) -> None: + new_alloc = get_pluggable_allocator(python_malloc_fn, python_free_func) + mem_pool = torch.cuda.memory.MemPool(new_alloc._allocator) + with torch.cuda.memory.use_mem_pool(mem_pool): + yield mem_pool + + +class CuMemAllocator: + """ + A singleton class that manages a memory pool for CUDA tensors. + The memory in this pool can be offloaded or discarded when the + allocator sleeps. + + Inside the `use_memory_pool(tag)` context, all tensors created will + be allocated in the memory pool, and has the same tag as the + tag passed to the context. + + When we call `sleep`, all tensors with the specified tag will be + offloaded to CPU memory, and the rest of the tensors will be discarded. + When we call `wake_up`, all tensors that are previously offloaded + will be loaded back to GPU memory, and the rest of the tensors will + have empty memory. + + Why it needs to be a singleton? + When allocated tensors are garbage collected, PyTorch will call + the free callback, which will call the `python_free_callback` method. + The C-extension uses a global variable to store the function of an + instance of this class. If we create multiple instances of this class, + the global variable will be overwritten and the free callback will + not work as expected. + """ + instance: "CuMemAllocator" = None + default_tag: str = "default" + + @staticmethod + def get_instance() -> "CuMemAllocator": + """ + CuMemAllocator is a singleton class. + We cannot call the constructor directly. 
+ Call this method to get the instance. + """ + assert cumem_available, "cumem allocator is not available" + if CuMemAllocator.instance is None: + CuMemAllocator.instance = CuMemAllocator() + return CuMemAllocator.instance + + def __init__(self): + self.pointer_to_data: Dict[int, AllocationData] = {} + self.current_tag: str = CuMemAllocator.default_tag + + def python_malloc_callback(self, allocation_handle: HandleType) -> None: + """ + Internal method to store the allocation data + when memory is allocated in the memory pool.""" + py_d_mem = allocation_handle[2] + self.pointer_to_data[py_d_mem] = AllocationData( + allocation_handle, self.current_tag) + return + + def python_free_callback(self, ptr: int) -> HandleType: + """ + Internal method to look up the allocation data + when memory is freed in the memory pool.""" + data = self.pointer_to_data.pop(ptr) + if data.cpu_backup_tensor is not None: + data.cpu_backup_tensor = None + return data.handle + + def sleep( + self, + offload_tags: Optional[Union[Tuple[str, ...], + str]] = None) -> None: + """ + Put the allocator in sleep mode. + All data in the memory allocation with the specified tag will be + offloaded to CPU memory, and others will be discarded. + + :param offload_tags: The tags of the memory allocation that will be + offloaded. The rest of the memory allocation will be discarded. + """ + if offload_tags is None: + # by default, allocated tensors are offloaded + # when the allocator sleeps + offload_tags = (CuMemAllocator.default_tag, ) + elif isinstance(offload_tags, str): + offload_tags = (offload_tags, ) + + assert isinstance(offload_tags, tuple) + + for ptr, data in self.pointer_to_data.items(): + handle = data.handle + if data.tag in offload_tags: + size_in_bytes = handle[1] + cpu_backup_tensor = torch.empty( + size_in_bytes, + dtype=torch.uint8, + device='cpu', + pin_memory=is_pin_memory_available()) + cpu_ptr = cpu_backup_tensor.data_ptr() + libcudart.cudaMemcpy(cpu_ptr, ptr, size_in_bytes) + data.cpu_backup_tensor = cpu_backup_tensor + unmap_and_release(handle) + + def wake_up(self): + """ + Wake up the allocator from sleep mode. + All data that is previously offloaded will be loaded back to GPU + memory, and the rest of the data will have empty memory.""" + for ptr, data in self.pointer_to_data.items(): + handle = data.handle + create_and_map(handle) + if data.cpu_backup_tensor is not None: + cpu_backup_tensor = data.cpu_backup_tensor + if cpu_backup_tensor is not None: + size_in_bytes = cpu_backup_tensor.numel( + ) * cpu_backup_tensor.element_size() + cpu_ptr = cpu_backup_tensor.data_ptr() + libcudart.cudaMemcpy(ptr, cpu_ptr, size_in_bytes) + data.cpu_backup_tensor = None + + @contextmanager + def use_memory_pool(self, tag: Optional[str] = None): + """ + A context manager to use the memory pool. + All memory allocation created inside the context will be allocated + in the memory pool, and has the specified tag. + + :param tag: The tag of the memory allocation. If None, the default tag + will be used. + """ + if tag is None: + tag = CuMemAllocator.default_tag + + assert isinstance(tag, str) + + old_tag = self.current_tag + self.current_tag = tag + with use_memory_pool_with_allocator(self.python_malloc_callback, + self.python_free_callback): + yield + # PyTorch's bug, calling torch.cuda.empty_cache() will error + # when using pluggable allocator, see + # https://github.com/pytorch/pytorch/issues/145168 . + # if we have some memory allocated and then freed, + # the memory will not be released. 
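A minimal sketch of how the singleton above is meant to be driven; the tag name and tensor are illustrative (inside vLLM the worker wraps weight loading and KV-cache creation in use_memory_pool()):

import torch
from vllm.device_allocator.cumem import CuMemAllocator

allocator = CuMemAllocator.get_instance()
with allocator.use_memory_pool(tag="weights"):
    # Allocations made inside the context go through the pluggable allocator.
    weights = torch.empty(1 << 20, dtype=torch.uint8, device="cuda")

print(allocator.get_current_usage())  # bytes currently tracked by the pool

# Offload "weights"-tagged allocations to pinned CPU memory and discard the rest,
allocator.sleep(offload_tags="weights")
# then map everything back onto fresh GPU memory (offloaded data is restored).
allocator.wake_up()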
+ # right now it is fine, because we only use this allocator + # during weight loading and kv cache creation, where we only + # allocate memory. + # TODO: we need to find a way to release the memory, + # i.e. calling torch.cuda.empty_cache() + self.current_tag = old_tag + + def get_current_usage(self) -> int: + """ + Get the total number of bytes allocated in the memory pool. + """ + sum_bytes: int = 0 + for ptr, data in self.pointer_to_data.items(): + handle = data.handle + sum_bytes += handle[1] + return sum_bytes diff --git a/vllm/distributed/parallel_state.py b/vllm/distributed/parallel_state.py index d8017909bab4..0552cbf2faad 100644 --- a/vllm/distributed/parallel_state.py +++ b/vllm/distributed/parallel_state.py @@ -1192,6 +1192,11 @@ def cleanup_dist_env_and_memory(shutdown_ray: bool = False): from vllm.platforms import current_platform if not current_platform.is_cpu(): torch.cuda.empty_cache() + try: + torch._C._host_emptyCache() + except AttributeError: + logger.warning( + "torch._C._host_emptyCache() only available in Pytorch >=2.5") def in_the_same_node_as(pg: Union[ProcessGroup, StatelessProcessGroup], diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index e4e471d3c54c..ba96484e3fce 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -100,7 +100,6 @@ class EngineArgs: kv_cache_dtype: str = 'auto' seed: int = 0 max_model_len: Optional[int] = None - worker_use_ray: bool = False # Note: Specifying a custom executor backend by passing a class # is intended for expert use only. The API may change without # notice. @@ -196,6 +195,8 @@ class EngineArgs: kv_transfer_config: Optional[KVTransferConfig] = None generation_config: Optional[str] = None + enable_sleep_mode: bool = False + calculate_kv_scales: Optional[bool] = None def __post_init__(self): @@ -385,12 +386,8 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser: 'or equal to the number of GPUs available, "mp" will be used to ' 'keep processing on a single host. Otherwise, this will default ' 'to "ray" if Ray is installed and fail otherwise. Note that tpu ' - 'and hpu only support Ray for distributed inference.') + 'only supports Ray for distributed inference.') - parser.add_argument( - '--worker-use-ray', - action='store_true', - help='Deprecated, use ``--distributed-executor-backend=ray``.') parser.add_argument('--pipeline-parallel-size', '-pp', type=int, @@ -942,7 +939,15 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser: "Defaults to None, will use the default generation config in vLLM. " "If set to 'auto', the generation config will be automatically " "loaded from model. If set to a folder path, the generation config " - "will be loaded from the specified folder path.") + "will be loaded from the specified folder path. If " + "`max_new_tokens` is specified, then it sets a server-wide limit " + "on the number of output tokens for all requests.") + + parser.add_argument("--enable-sleep-mode", + action="store_true", + default=False, + help="Enable sleep mode for the engine. 
" + "(only cuda platform is supported)") parser.add_argument( '--calculate-kv-scales', @@ -996,7 +1001,9 @@ def create_model_config(self) -> ModelConfig: override_neuron_config=self.override_neuron_config, override_pooler_config=self.override_pooler_config, logits_processor_pattern=self.logits_processor_pattern, - generation_config=self.generation_config) + generation_config=self.generation_config, + enable_sleep_mode=self.enable_sleep_mode, + ) def create_load_config(self) -> LoadConfig: return LoadConfig( @@ -1061,7 +1068,6 @@ def create_engine_config(self, parallel_config = ParallelConfig( pipeline_parallel_size=self.pipeline_parallel_size, tensor_parallel_size=self.tensor_parallel_size, - worker_use_ray=self.worker_use_ray, max_parallel_loading_workers=self.max_parallel_loading_workers, disable_custom_all_reduce=self.disable_custom_all_reduce, tokenizer_pool_config=TokenizerPoolConfig.create_config( @@ -1269,11 +1275,22 @@ def _override_v1_engine_args(self, usage_context: UsageContext) -> None: self.enable_chunked_prefill = True # When no user override, set the default values based on the usage # context. - # TODO(woosuk): Tune the default values for different hardware. - default_max_num_batched_tokens = { - UsageContext.LLM_CLASS: 8192, - UsageContext.OPENAI_API_SERVER: 2048, - } + # Use different default values for different hardware. + from vllm.platforms import current_platform + device_name = current_platform.get_device_name().lower() + if "h100" in device_name or "h200" in device_name: + # For H100 and H200, we use larger default values. + default_max_num_batched_tokens = { + UsageContext.LLM_CLASS: 16384, + UsageContext.OPENAI_API_SERVER: 8192, + } + else: + # TODO(woosuk): Tune the default values for other hardware. + default_max_num_batched_tokens = { + UsageContext.LLM_CLASS: 8192, + UsageContext.OPENAI_API_SERVER: 2048, + } + if (self.max_num_batched_tokens is None and usage_context in default_max_num_batched_tokens): self.max_num_batched_tokens = default_max_num_batched_tokens[ diff --git a/vllm/engine/async_llm_engine.py b/vllm/engine/async_llm_engine.py index 08fef8250d48..739ea06ae381 100644 --- a/vllm/engine/async_llm_engine.py +++ b/vllm/engine/async_llm_engine.py @@ -1182,6 +1182,9 @@ async def start_profile(self) -> None: async def stop_profile(self) -> None: self.engine.stop_profile() + async def reset_prefix_cache(self) -> None: + self.engine.reset_prefix_cache() + async def add_lora(self, lora_request: LoRARequest) -> None: self.engine.add_lora(lora_request) diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index 325c65800356..8f6dc8728939 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -914,6 +914,14 @@ def has_unfinished_requests_for_virtual_engine( """ return self.scheduler[virtual_engine].has_unfinished_seqs() + def reset_prefix_cache(self) -> bool: + """Reset prefix cache for all devices.""" + + success = True + for scheduler in self.scheduler: + success = success and scheduler.reset_prefix_cache() + return success + @staticmethod def _process_sequence_group_outputs( seq_group: SequenceGroup, @@ -1818,6 +1826,16 @@ def start_profile(self) -> None: def stop_profile(self) -> None: self.model_executor.stop_profile() + def sleep(self, level: int = 1) -> None: + assert self.vllm_config.model_config.enable_sleep_mode, ( + "Sleep mode is not enabled in the model config") + self.model_executor.sleep(level=level) + + def wake_up(self) -> None: + assert self.vllm_config.model_config.enable_sleep_mode, ( + "Sleep mode is not 
enabled in the model config") + self.model_executor.wake_up() + def check_health(self) -> None: if self.tokenizer: self.tokenizer.check_health() @@ -1857,46 +1875,44 @@ def create_trace_span(self, seq_group: SequenceGroup) -> None: metrics = seq_group.metrics ttft = metrics.first_token_time - metrics.arrival_time e2e_time = metrics.finished_time - metrics.arrival_time - # attribute names are based on - # https://github.com/open-telemetry/semantic-conventions/blob/main/docs/gen-ai/llm-spans.md - seq_span.set_attribute(SpanAttributes.LLM_RESPONSE_MODEL, + seq_span.set_attribute(SpanAttributes.GEN_AI_RESPONSE_MODEL, self.model_config.model) - seq_span.set_attribute(SpanAttributes.LLM_REQUEST_ID, + seq_span.set_attribute(SpanAttributes.GEN_AI_REQUEST_ID, seq_group.request_id) - seq_span.set_attribute(SpanAttributes.LLM_REQUEST_TEMPERATURE, + seq_span.set_attribute(SpanAttributes.GEN_AI_REQUEST_TEMPERATURE, seq_group.sampling_params.temperature) - seq_span.set_attribute(SpanAttributes.LLM_REQUEST_TOP_P, + seq_span.set_attribute(SpanAttributes.GEN_AI_REQUEST_TOP_P, seq_group.sampling_params.top_p) - seq_span.set_attribute(SpanAttributes.LLM_REQUEST_MAX_TOKENS, + seq_span.set_attribute(SpanAttributes.GEN_AI_REQUEST_MAX_TOKENS, seq_group.sampling_params.max_tokens) - seq_span.set_attribute(SpanAttributes.LLM_REQUEST_N, + seq_span.set_attribute(SpanAttributes.GEN_AI_REQUEST_N, seq_group.sampling_params.n) - seq_span.set_attribute(SpanAttributes.LLM_USAGE_NUM_SEQUENCES, + seq_span.set_attribute(SpanAttributes.GEN_AI_USAGE_NUM_SEQUENCES, seq_group.num_seqs()) - seq_span.set_attribute(SpanAttributes.LLM_USAGE_PROMPT_TOKENS, + seq_span.set_attribute(SpanAttributes.GEN_AI_USAGE_PROMPT_TOKENS, len(seq_group.prompt_token_ids)) seq_span.set_attribute( - SpanAttributes.LLM_USAGE_COMPLETION_TOKENS, + SpanAttributes.GEN_AI_USAGE_COMPLETION_TOKENS, sum([ seq.get_output_len() for seq in seq_group.get_finished_seqs() ])) - seq_span.set_attribute(SpanAttributes.LLM_LATENCY_TIME_IN_QUEUE, + seq_span.set_attribute(SpanAttributes.GEN_AI_LATENCY_TIME_IN_QUEUE, metrics.time_in_queue) seq_span.set_attribute( - SpanAttributes.LLM_LATENCY_TIME_TO_FIRST_TOKEN, ttft) - seq_span.set_attribute(SpanAttributes.LLM_LATENCY_E2E, e2e_time) + SpanAttributes.GEN_AI_LATENCY_TIME_TO_FIRST_TOKEN, ttft) + seq_span.set_attribute(SpanAttributes.GEN_AI_LATENCY_E2E, e2e_time) if metrics.scheduler_time is not None: seq_span.set_attribute( - SpanAttributes.LLM_LATENCY_TIME_IN_SCHEDULER, + SpanAttributes.GEN_AI_LATENCY_TIME_IN_SCHEDULER, metrics.scheduler_time) if metrics.model_forward_time is not None: seq_span.set_attribute( - SpanAttributes.LLM_LATENCY_TIME_IN_MODEL_FORWARD, + SpanAttributes.GEN_AI_LATENCY_TIME_IN_MODEL_FORWARD, metrics.model_forward_time / 1000.0) if metrics.model_execute_time is not None: seq_span.set_attribute( - SpanAttributes.LLM_LATENCY_TIME_IN_MODEL_EXECUTE, + SpanAttributes.GEN_AI_LATENCY_TIME_IN_MODEL_EXECUTE, metrics.model_execute_time) def _validate_model_inputs(self, inputs: ProcessorInputs, diff --git a/vllm/engine/metrics.py b/vllm/engine/metrics.py index c8aec8dd3afa..b771c190dd82 100644 --- a/vllm/engine/metrics.py +++ b/vllm/engine/metrics.py @@ -120,7 +120,8 @@ def __init__(self, labelnames: List[str], vllm_config: VllmConfig): labelnames=labelnames) buckets = [1, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8096] if not vllm_config.model_config.enforce_eager: - buckets = vllm_config.compilation_config.capture_sizes.copy() + buckets = vllm_config.compilation_config.\ + 
cudagraph_capture_sizes.copy() buckets.sort() self.histogram_iteration_tokens = self._histogram_cls( name="vllm:iteration_tokens_total", @@ -258,21 +259,6 @@ def __init__(self, labelnames: List[str], vllm_config: VllmConfig): documentation="Number of emitted tokens.", labelnames=labelnames)) - # Deprecated in favor of vllm:prompt_tokens_total - self.gauge_avg_prompt_throughput = self._gauge_cls( - name="vllm:avg_prompt_throughput_toks_per_s", - documentation="Average prefill throughput in tokens/s.", - labelnames=labelnames, - multiprocess_mode="sum", - ) - # Deprecated in favor of vllm:generation_tokens_total - self.gauge_avg_generation_throughput = self._gauge_cls( - name="vllm:avg_generation_throughput_toks_per_s", - documentation="Average generation throughput in tokens/s.", - labelnames=labelnames, - multiprocess_mode="sum", - ) - # end-metrics-definitions @@ -634,20 +620,6 @@ def _log_prometheus(self, stats: Stats) -> None: self._log_histogram(self.metrics.histogram_max_tokens_request, stats.max_tokens_requests) - def _log_prometheus_interval(self, prompt_throughput: float, - generation_throughput: float) -> None: - # Logs metrics to prometheus that are computed every logging_interval. - # Support legacy gauge metrics that make throughput calculations on - # the vLLM side. Moving forward, we should use counters like - # counter_prompt_tokens, counter_generation_tokens - # Which log raw data and calculate summaries using rate() on the - # grafana/prometheus side. See - # https://github.com/vllm-project/vllm/pull/2316#discussion_r1464204666 - self.metrics.gauge_avg_prompt_throughput.labels( - **self.labels).set(prompt_throughput) - self.metrics.gauge_avg_generation_throughput.labels( - **self.labels).set(generation_throughput) - def log(self, stats: Stats): """Logs to prometheus and tracked stats every iteration.""" # Log to prometheus. @@ -663,20 +635,6 @@ def log(self, stats: Stats): # Log locally every local_interval seconds. if local_interval_elapsed(stats.now, self.last_local_log, self.local_interval): - # Compute summary metrics for tracked stats (and log them - # to promethus if applicable). 
- prompt_throughput = get_throughput(self.num_prompt_tokens, - now=stats.now, - last_log=self.last_local_log) - generation_throughput = get_throughput( - self.num_generation_tokens, - now=stats.now, - last_log=self.last_local_log) - - self._log_prometheus_interval( - prompt_throughput=prompt_throughput, - generation_throughput=generation_throughput) - if self.spec_decode_metrics is not None: self._log_gauge( self.metrics.gauge_spec_decode_draft_acceptance_rate, diff --git a/vllm/engine/multiprocessing/__init__.py b/vllm/engine/multiprocessing/__init__.py index 7132f9840001..d9703b820a77 100644 --- a/vllm/engine/multiprocessing/__init__.py +++ b/vllm/engine/multiprocessing/__init__.py @@ -121,6 +121,10 @@ class RPCUProfileRequest(Enum): STOP_PROFILE = 2 +class RPCResetPrefixCacheRequest(Enum): + RESET_PREFIX_CACHE = 1 + + @dataclass class RPCLoadAdapterRequest: lora_request: LoRARequest @@ -134,7 +138,8 @@ class RPCAdapterLoadedResponse: RPC_REQUEST_T = Union[RPCProcessRequest, RPCAbortRequest, RPCStartupRequest, - RPCUProfileRequest, RPCLoadAdapterRequest] + RPCUProfileRequest, RPCLoadAdapterRequest, + RPCResetPrefixCacheRequest] REQUEST_OUTPUTS_T = Union[List[RequestOutput], RPCAdapterLoadedResponse, RPCError] diff --git a/vllm/engine/multiprocessing/client.py b/vllm/engine/multiprocessing/client.py index a9ab89953518..5237f63c34c0 100644 --- a/vllm/engine/multiprocessing/client.py +++ b/vllm/engine/multiprocessing/client.py @@ -27,8 +27,9 @@ VLLM_RPC_SUCCESS_STR, RPCAbortRequest, RPCAdapterLoadedResponse, RPCError, RPCLoadAdapterRequest, - RPCProcessRequest, RPCStartupRequest, - RPCStartupResponse, + RPCProcessRequest, + RPCResetPrefixCacheRequest, + RPCStartupRequest, RPCStartupResponse, RPCUProfileRequest) from vllm.engine.protocol import EngineClient # yapf: enable @@ -262,7 +263,14 @@ async def setup(self): """Setup the client before it starts sending server requests.""" # Start output_loop - self.output_loop = asyncio.create_task(self.run_output_handler_loop()) + if self.output_loop is None: + # only generate once to avoid multiple concurrent output_loops + # this will lead to race conditions and wrong orders of tokens + # returned by the engine + # setup will be called multiple times during the startup of + # the engine + self.output_loop = asyncio.create_task( + self.run_output_handler_loop()) with self.get_data_socket() as socket: # Wait until server is ready. @@ -271,8 +279,9 @@ async def setup(self): self.tracing_flag = response.tracing_enabled # Start health_loop. 
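With the deprecated vllm:avg_prompt_throughput_toks_per_s and vllm:avg_generation_throughput_toks_per_s gauges removed above, the same numbers can be derived on the Prometheus side from the counters that remain. A sketch that queries them directly (server URL and rate window are illustrative):

import requests

PROMETHEUS = "http://localhost:9090/api/v1/query"
for query in ("rate(vllm:prompt_tokens_total[1m])",
              "rate(vllm:generation_tokens_total[1m])"):
    result = requests.get(PROMETHEUS, params={"query": query}).json()
    print(query, result["data"]["result"])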
- self.health_loop = asyncio.create_task( - self.run_heartbeat_loop(timeout=VLLM_RPC_TIMEOUT)) + if self.health_loop is None: + self.health_loop = asyncio.create_task( + self.run_heartbeat_loop(timeout=VLLM_RPC_TIMEOUT)) def close(self): """Destroy the ZeroMQ Context.""" @@ -667,6 +676,13 @@ async def stop_profile(self) -> None: await self._send_one_way_rpc_request( request=RPCUProfileRequest.STOP_PROFILE, socket=self.input_socket) + async def reset_prefix_cache(self) -> None: + """Reset the prefix cache""" + + await self._send_one_way_rpc_request( + request=RPCResetPrefixCacheRequest.RESET_PREFIX_CACHE, + socket=self.input_socket) + async def add_lora(self, lora_request: LoRARequest) -> None: """Load a new LoRA adapter into the engine for future requests.""" # Uses the same I/O as generate requests diff --git a/vllm/engine/multiprocessing/engine.py b/vllm/engine/multiprocessing/engine.py index 3aa9d30549f3..166f89743b3c 100644 --- a/vllm/engine/multiprocessing/engine.py +++ b/vllm/engine/multiprocessing/engine.py @@ -16,8 +16,9 @@ VLLM_RPC_SUCCESS_STR, RPCAbortRequest, RPCAdapterLoadedResponse, RPCError, RPCLoadAdapterRequest, - RPCProcessRequest, RPCStartupRequest, - RPCStartupResponse, + RPCProcessRequest, + RPCResetPrefixCacheRequest, + RPCStartupRequest, RPCStartupResponse, RPCUProfileRequest) # yapf: enable from vllm.logger import init_logger @@ -237,6 +238,8 @@ def handle_new_input(self): self.stop_profile() elif isinstance(request, RPCLoadAdapterRequest): self._handle_load_adapter_request(request) + elif isinstance(request, RPCResetPrefixCacheRequest): + self.reset_prefix_cache() else: raise ValueError("Unknown RPCRequest Type: " f"{type(request)}") @@ -361,6 +364,9 @@ def start_profile(self) -> None: def stop_profile(self) -> None: self.engine.stop_profile() + def reset_prefix_cache(self) -> bool: + return self.engine.reset_prefix_cache() + def signal_handler(*_) -> None: raise KeyboardInterrupt("MQLLMEngine terminated") diff --git a/vllm/engine/protocol.py b/vllm/engine/protocol.py index f05ff62c4766..de7b2c1b91f5 100644 --- a/vllm/engine/protocol.py +++ b/vllm/engine/protocol.py @@ -271,6 +271,11 @@ async def stop_profile(self) -> None: """Start profiling the engine""" ... + @abstractmethod + async def reset_prefix_cache(self) -> None: + """Reset the prefix cache""" + ... + @abstractmethod async def add_lora(self, lora_request: LoRARequest) -> None: """Load a new LoRA adapter into the engine for future requests.""" diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py index 27386daa4bbc..1860ed3d7db5 100644 --- a/vllm/entrypoints/llm.py +++ b/vllm/entrypoints/llm.py @@ -1132,6 +1132,36 @@ def start_profile(self) -> None: def stop_profile(self) -> None: self.llm_engine.stop_profile() + def reset_prefix_cache(self) -> bool: + return self.llm_engine.reset_prefix_cache() + + def sleep(self, level: int = 1): + """ + Put the engine to sleep. The engine should not process any requests. + The caller should guarantee that no requests are being processed + during the sleep period, before `wake_up` is called. + + :param level: The sleep level. Level 1 sleep will offload the model + weights and discard the kv cache. The content of kv cache is + forgotten. Level 1 sleep is good for sleeping and waking up the + engine to run the same model again. The model weights are backed + up in CPU memory. Please make sure there's enough CPU memory to + store the model weights. Level 2 sleep will discard both the model + weights and the kv cache. 
The content of both the model weights + and kv cache is forgotten. Level 2 sleep is good for sleeping and + waking up the engine to run a different model or update the model, + where previous model weights are not needed. It reduces CPU memory + pressure. + """ + self.reset_prefix_cache() + self.llm_engine.sleep(level=level) + + def wake_up(self): + """ + Wake up the engine from sleep mode. See the :meth:`sleep` method + for more details.""" + self.llm_engine.wake_up() + # LEGACY def _convert_v1_inputs( self, diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py index 1aeefe86cd05..45cf06566faa 100644 --- a/vllm/entrypoints/openai/api_server.py +++ b/vllm/entrypoints/openai/api_server.py @@ -1,5 +1,6 @@ import asyncio import atexit +import gc import importlib import inspect import multiprocessing @@ -55,6 +56,7 @@ PoolingChatRequest, PoolingCompletionRequest, PoolingRequest, PoolingResponse, + RerankRequest, RerankResponse, ScoreRequest, ScoreResponse, TokenizeRequest, TokenizeResponse, @@ -67,6 +69,7 @@ from vllm.entrypoints.openai.serving_models import (BaseModelPath, OpenAIServingModels) from vllm.entrypoints.openai.serving_pooling import OpenAIServingPooling +from vllm.entrypoints.openai.serving_rerank import JinaAIServingRerank from vllm.entrypoints.openai.serving_score import OpenAIServingScores from vllm.entrypoints.openai.serving_tokenization import ( OpenAIServingTokenization) @@ -104,6 +107,11 @@ async def _force_log(): task.add_done_callback(_running_tasks.remove) else: task = None + + # Mark the startup heap as static so that it's ignored by GC. + # Reduces pause times of oldest generation collections. + gc.collect() + gc.freeze() try: yield finally: @@ -300,6 +308,10 @@ def score(request: Request) -> Optional[OpenAIServingScores]: return request.app.state.openai_serving_scores +def rerank(request: Request) -> Optional[JinaAIServingRerank]: + return request.app.state.jinaai_serving_reranking + + def tokenization(request: Request) -> OpenAIServingTokenization: return request.app.state.openai_serving_tokenization @@ -496,6 +508,40 @@ async def create_score_v1(request: ScoreRequest, raw_request: Request): return await create_score(request, raw_request) +@router.post("/rerank") +@with_cancellation +async def do_rerank(request: RerankRequest, raw_request: Request): + handler = rerank(raw_request) + if handler is None: + return base(raw_request).create_error_response( + message="The model does not support Rerank (Score) API") + generator = await handler.do_rerank(request, raw_request) + if isinstance(generator, ErrorResponse): + return JSONResponse(content=generator.model_dump(), + status_code=generator.code) + elif isinstance(generator, RerankResponse): + return JSONResponse(content=generator.model_dump()) + + assert_never(generator) + + +@router.post("/v1/rerank") +@with_cancellation +async def do_rerank_v1(request: RerankRequest, raw_request: Request): + logger.warning( + "To indicate that the rerank API is not part of the standard OpenAI" + " API, we have located it at `/rerank`. Please update your client" + "accordingly. 
(Note: Conforms to JinaAI rerank API)") + + return await do_rerank(request, raw_request) + + +@router.post("/v2/rerank") +@with_cancellation +async def do_rerank_v2(request: RerankRequest, raw_request: Request): + return await do_rerank(request, raw_request) + + TASK_HANDLERS: Dict[str, Dict[str, tuple]] = { "generate": { "messages": (ChatCompletionRequest, create_chat_completion), @@ -506,7 +552,10 @@ async def create_score_v1(request: ScoreRequest, raw_request: Request): "default": (EmbeddingCompletionRequest, create_embedding), }, "score": { - "default": (ScoreRequest, create_score), + "default": (RerankRequest, do_rerank) + }, + "rerank": { + "default": (RerankRequest, do_rerank) }, "reward": { "messages": (PoolingChatRequest, create_pooling), @@ -518,6 +567,18 @@ async def create_score_v1(request: ScoreRequest, raw_request: Request): }, } +if envs.VLLM_SERVER_DEV_MODE: + + @router.post("/reset_prefix_cache") + async def reset_prefix_cache(raw_request: Request): + """ + Reset the prefix cache. Note that we currently do not check if the + prefix cache is successfully reset in the API server. + """ + logger.info("Resetting prefix cache...") + await engine_client(raw_request).reset_prefix_cache() + return Response(status_code=200) + @router.post("/invocations") async def invocations(raw_request: Request): @@ -741,6 +802,12 @@ async def init_app_state( state.openai_serving_models, request_logger=request_logger ) if model_config.task == "score" else None + state.jinaai_serving_reranking = JinaAIServingRerank( + engine_client, + model_config, + state.openai_serving_models, + request_logger=request_logger + ) if model_config.task == "score" else None state.openai_serving_tokenization = OpenAIServingTokenization( engine_client, model_config, diff --git a/vllm/entrypoints/openai/cli_args.py b/vllm/entrypoints/openai/cli_args.py index 35445449463e..4df75a665bab 100644 --- a/vllm/entrypoints/openai/cli_args.py +++ b/vllm/entrypoints/openai/cli_args.py @@ -117,7 +117,7 @@ def make_arg_parser(parser: FlexibleArgumentParser) -> FlexibleArgumentParser: "or JSON format. 
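A request against the JinaAI-compatible routes registered above (/rerank, /v1/rerank, /v2/rerank) looks roughly like this; the model name is illustrative and must be a cross-encoder served with --task score:

import requests

resp = requests.post(
    "http://localhost:8000/rerank",
    json={
        "model": "BAAI/bge-reranker-base",  # illustrative cross-encoder
        "query": "What is the capital of France?",
        "documents": ["Paris is the capital of France.",
                      "The Nile is the longest river in Africa."],
        "top_n": 1,
    },
)
for item in resp.json()["results"]:
    print(item["index"], item["relevance_score"], item["document"]["text"])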
" "Example (old format): ``'name=path'`` " "Example (new format): " - "``{\"name\": \"name\", \"local_path\": \"path\", " + "``{\"name\": \"name\", \"path\": \"lora_path\", " "\"base_model_name\": \"id\"}``") parser.add_argument( "--prompt-adapters", diff --git a/vllm/entrypoints/openai/protocol.py b/vllm/entrypoints/openai/protocol.py index 14e41346df77..f89c3f42aab1 100644 --- a/vllm/entrypoints/openai/protocol.py +++ b/vllm/entrypoints/openai/protocol.py @@ -3,7 +3,7 @@ import re import time from argparse import Namespace -from typing import Any, Dict, List, Literal, Optional, Union +from typing import Any, ClassVar, Dict, List, Literal, Optional, Set, Union import torch from pydantic import BaseModel, ConfigDict, Field, model_validator @@ -42,23 +42,31 @@ class OpenAIBaseModel(BaseModel): # OpenAI API does allow extra fields model_config = ConfigDict(extra="allow") + # Cache class field names + field_names: ClassVar[Optional[Set[str]]] = None + @model_validator(mode="before") @classmethod def __log_extra_fields__(cls, data): - if isinstance(data, dict): + + field_names = cls.field_names + if field_names is None: + if not isinstance(data, dict): + return data # Get all class field names and their potential aliases field_names = set() for field_name, field in cls.model_fields.items(): field_names.add(field_name) - if hasattr(field, 'alias') and field.alias: - field_names.add(field.alias) - - # Compare against both field names and aliases - extra_fields = data.keys() - field_names - if extra_fields: - logger.warning( - "The following fields were present in the request " - "but ignored: %s", extra_fields) + if alias := getattr(field, 'alias', None): + field_names.add(alias) + cls.field_names = field_names + + # Compare against both field names and aliases + if any(k not in field_names for k in data): + logger.warning( + "The following fields were present in the request " + "but ignored: %s", + data.keys() - field_names) return data @@ -372,13 +380,17 @@ def to_beam_search_params( ) -> BeamSearchParams: # TODO(#9845): remove max_tokens when field is removed from OpenAI API max_tokens = self.max_completion_tokens or self.max_tokens - if max_tokens is None: - max_tokens = default_max_tokens if default_sampling_params is None: default_sampling_params = {} n = self.n if self.n is not None else 1 + # Use minimum of context window, user request & server limit. + max_tokens = min( + val for val in (default_max_tokens, max_tokens, + default_sampling_params.get("max_tokens", None)) + if val is not None) + if (temperature := self.temperature) is None: temperature = default_sampling_params.get( "temperature", self._DEFAULT_SAMPLING_PARAMS["temperature"]) @@ -398,11 +410,16 @@ def to_sampling_params( default_sampling_params: Optional[dict] = None) -> SamplingParams: # TODO(#9845): remove max_tokens when field is removed from OpenAI API max_tokens = self.max_completion_tokens or self.max_tokens - if max_tokens is None: - max_tokens = default_max_tokens if default_sampling_params is None: default_sampling_params = {} + + # Use minimum of context window, user request & server limit. 
+ max_tokens = min( + val for val in (default_max_tokens, max_tokens, + default_sampling_params.get("max_tokens", None)) + if val is not None) + # Default parameters if (repetition_penalty := self.repetition_penalty) is None: repetition_penalty = default_sampling_params.get( @@ -732,13 +749,17 @@ def to_beam_search_params( default_sampling_params: Optional[dict] = None ) -> BeamSearchParams: max_tokens = self.max_tokens - if max_tokens is None: - max_tokens = default_max_tokens if default_sampling_params is None: default_sampling_params = {} n = self.n if self.n is not None else 1 + # Use minimum of context window, user request & server limit. + max_tokens = min( + val for val in (default_max_tokens, max_tokens, + default_sampling_params.get("max_tokens", None)) + if val is not None) + if (temperature := self.temperature) is None: temperature = default_sampling_params.get("temperature", 1.0) @@ -756,11 +777,16 @@ def to_sampling_params( logits_processor_pattern: Optional[str], default_sampling_params: Optional[dict] = None) -> SamplingParams: max_tokens = self.max_tokens - if max_tokens is None: - max_tokens = default_max_tokens if default_sampling_params is None: default_sampling_params = {} + + # Use minimum of context window, user request & server limit. + max_tokens = min( + val for val in (default_max_tokens, max_tokens, + default_sampling_params.get("max_tokens", None)) + if val is not None) + # Default parameters if (repetition_penalty := self.repetition_penalty) is None: repetition_penalty = default_sampling_params.get( @@ -992,6 +1018,52 @@ def to_pooling_params(self): return PoolingParams(additional_data=self.additional_data) +class RerankRequest(OpenAIBaseModel): + model: str + query: str + documents: List[str] + top_n: int = Field(default_factory=lambda: 0) + truncate_prompt_tokens: Optional[Annotated[int, Field(ge=1)]] = None + + # doc: begin-rerank-pooling-params + additional_data: Optional[Any] = None + # doc: end-rerank-pooling-params + + # doc: begin-rerank-extra-params + priority: int = Field( + default=0, + description=( + "The priority of the request (lower means earlier handling; " + "default: 0). Any priority other than 0 will raise an error " + "if the served model does not use priority scheduling.")) + + # doc: end-rerank-extra-params + + def to_pooling_params(self): + return PoolingParams(additional_data=self.additional_data) + + +class RerankDocument(BaseModel): + text: str + + +class RerankResult(BaseModel): + index: int + document: RerankDocument + relevance_score: float + + +class RerankUsage(BaseModel): + total_tokens: int + + +class RerankResponse(OpenAIBaseModel): + id: str + model: str + usage: RerankUsage + results: List[RerankResult] + + class CompletionLogProbs(OpenAIBaseModel): text_offset: List[int] = Field(default_factory=list) token_logprobs: List[Optional[float]] = Field(default_factory=list) @@ -1211,7 +1283,7 @@ class BatchRequestInput(OpenAIBaseModel): url: str # The parameters of the request. - body: Union[ChatCompletionRequest, EmbeddingRequest] + body: Union[ChatCompletionRequest, EmbeddingRequest, ScoreRequest] class BatchResponseData(OpenAIBaseModel): @@ -1222,7 +1294,8 @@ class BatchResponseData(OpenAIBaseModel): request_id: str # The body of the response. 
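The repeated min() blocks above all implement one rule: the effective max_tokens is the smallest of the remaining context window, the user's request, and the server-wide default, skipping whichever are unset. A compact restatement (standalone helper, not the vLLM code):

def effective_max_tokens(default_max_tokens, requested=None, server_limit=None):
    """default_max_tokens is the remaining context window for this prompt."""
    return min(v for v in (default_max_tokens, requested, server_limit)
               if v is not None)

print(effective_max_tokens(4096))                                   # 4096
print(effective_max_tokens(4096, requested=512))                    # 512
print(effective_max_tokens(4096, requested=512, server_limit=128))  # 128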
- body: Optional[Union[ChatCompletionResponse, EmbeddingResponse]] = None + body: Optional[Union[ChatCompletionResponse, EmbeddingResponse, + ScoreResponse]] = None class BatchRequestOutput(OpenAIBaseModel): diff --git a/vllm/entrypoints/openai/run_batch.py b/vllm/entrypoints/openai/run_batch.py index f8f136f9d502..37ae23506ace 100644 --- a/vllm/entrypoints/openai/run_batch.py +++ b/vllm/entrypoints/openai/run_batch.py @@ -16,12 +16,14 @@ BatchRequestOutput, BatchResponseData, ChatCompletionResponse, - EmbeddingResponse, ErrorResponse) + EmbeddingResponse, ErrorResponse, + ScoreResponse) # yapf: enable from vllm.entrypoints.openai.serving_chat import OpenAIServingChat from vllm.entrypoints.openai.serving_embedding import OpenAIServingEmbedding from vllm.entrypoints.openai.serving_models import (BaseModelPath, OpenAIServingModels) +from vllm.entrypoints.openai.serving_score import OpenAIServingScores from vllm.usage.usage_lib import UsageContext from vllm.utils import FlexibleArgumentParser, random_uuid from vllm.version import __version__ as VLLM_VERSION @@ -167,7 +169,8 @@ async def run_request(serving_engine_func: Callable, tracker: BatchProgressTracker) -> BatchRequestOutput: response = await serving_engine_func(request.body) - if isinstance(response, (ChatCompletionResponse, EmbeddingResponse)): + if isinstance(response, + (ChatCompletionResponse, EmbeddingResponse, ScoreResponse)): batch_output = BatchRequestOutput( id=f"vllm-{random_uuid()}", custom_id=request.custom_id, @@ -239,6 +242,12 @@ async def main(args): chat_template=None, chat_template_content_format="auto", ) if model_config.task == "embed" else None + openai_serving_scores = (OpenAIServingScores( + engine, + model_config, + openai_serving_models, + request_logger=request_logger, + ) if model_config.task == "score" else None) tracker = BatchProgressTracker() logger.info("Reading batch from %s...", args.input_file) @@ -279,14 +288,28 @@ async def main(args): )) continue + response_futures.append(run_request(handler_fn, request, tracker)) + tracker.submitted() + elif request.url == "/v1/score": + handler_fn = (None if openai_serving_scores is None else + openai_serving_scores.create_score) + if handler_fn is None: + response_futures.append( + make_async_error_request_output( + request, + error_msg="The model does not support Scores API", + )) + continue + response_futures.append(run_request(handler_fn, request, tracker)) tracker.submitted() else: response_futures.append( make_async_error_request_output( request, - error_msg="Only /v1/chat/completions and " - "/v1/embeddings are supported in the batch endpoint.", + error_msg= + "Only /v1/chat/completions, /v1/embeddings, and /v1/score " + "are supported in the batch endpoint.", )) with tracker.pbar(): diff --git a/vllm/entrypoints/openai/serving_engine.py b/vllm/entrypoints/openai/serving_engine.py index 3da447be0643..8d54164e500e 100644 --- a/vllm/entrypoints/openai/serving_engine.py +++ b/vllm/entrypoints/openai/serving_engine.py @@ -26,7 +26,8 @@ DetokenizeRequest, EmbeddingChatRequest, EmbeddingCompletionRequest, - ErrorResponse, ScoreRequest, + ErrorResponse, RerankRequest, + ScoreRequest, TokenizeChatRequest, TokenizeCompletionRequest) from vllm.entrypoints.openai.serving_models import OpenAIServingModels @@ -204,9 +205,9 @@ def _validate_input( token_num = len(input_ids) # Note: EmbeddingRequest and ScoreRequest doesn't have max_tokens - if isinstance( - request, - (EmbeddingChatRequest, EmbeddingCompletionRequest, ScoreRequest)): + if isinstance(request, + 
(EmbeddingChatRequest, EmbeddingCompletionRequest, + ScoreRequest, RerankRequest)): operation = "score" if isinstance(request, ScoreRequest) \ else "embedding generation" diff --git a/vllm/entrypoints/openai/serving_rerank.py b/vllm/entrypoints/openai/serving_rerank.py new file mode 100644 index 000000000000..be4420261afe --- /dev/null +++ b/vllm/entrypoints/openai/serving_rerank.py @@ -0,0 +1,206 @@ +import asyncio +from typing import Any, AsyncGenerator, Dict, List, Optional, Union, cast + +from fastapi import Request + +from vllm.config import ModelConfig +from vllm.engine.protocol import EngineClient +from vllm.entrypoints.logger import RequestLogger +from vllm.entrypoints.openai.protocol import (ErrorResponse, RerankDocument, + RerankRequest, RerankResponse, + RerankResult, RerankUsage) +from vllm.entrypoints.openai.serving_engine import OpenAIServing +from vllm.entrypoints.openai.serving_models import OpenAIServingModels +from vllm.inputs.data import TokensPrompt +from vllm.logger import init_logger +from vllm.outputs import PoolingRequestOutput, ScoringRequestOutput +from vllm.transformers_utils.tokenizers.mistral import MistralTokenizer +from vllm.utils import make_async, merge_async_iterators + +logger = init_logger(__name__) + + +class JinaAIServingRerank(OpenAIServing): + + def __init__( + self, + engine_client: EngineClient, + model_config: ModelConfig, + models: OpenAIServingModels, + *, + request_logger: Optional[RequestLogger], + ) -> None: + super().__init__(engine_client=engine_client, + model_config=model_config, + models=models, + request_logger=request_logger) + + async def do_rerank( + self, + request: RerankRequest, + raw_request: Optional[Request] = None + ) -> Union[RerankResponse, ErrorResponse]: + """ + Rerank API based on JinaAI's rerank API; implements the same + API interface. Designed for compatibility with off-the-shelf + tooling, since this is a common standard for reranking APIs + + See example client implementations at + https://github.com/infiniflow/ragflow/blob/main/rag/llm/rerank_model.py + numerous clients use this standard. + """ + error_check_ret = await self._check_model(request) + if error_check_ret is not None: + return error_check_ret + + model_name = request.model + request_id = f"rerank-{self._base_request_id(raw_request)}" + truncate_prompt_tokens = request.truncate_prompt_tokens + query = request.query + documents = request.documents + request_prompts = [] + engine_prompts = [] + top_n = request.top_n if request.top_n > 0 else len(documents) + + try: + ( + lora_request, + prompt_adapter_request, + ) = self._maybe_get_adapters(request) + + tokenizer = await self.engine_client.get_tokenizer(lora_request) + + if prompt_adapter_request is not None: + raise NotImplementedError("Prompt adapter is not supported " + "for scoring models") + + if isinstance(tokenizer, MistralTokenizer): + raise ValueError( + "MistralTokenizer not supported for cross-encoding") + + if not self.model_config.is_cross_encoder: + raise ValueError("Model is not cross encoder.") + + if truncate_prompt_tokens is not None and \ + truncate_prompt_tokens > self.max_model_len: + raise ValueError( + f"truncate_prompt_tokens value ({truncate_prompt_tokens}) " + f"is greater than max_model_len ({self.max_model_len})." 
+ f" Please, select a smaller truncation size.") + for doc in documents: + request_prompt = f"{query}{tokenizer.sep_token}{doc}" + tokenization_kwargs: Dict[str, Any] = {} + if truncate_prompt_tokens is not None: + tokenization_kwargs["truncation"] = True + tokenization_kwargs["max_length"] = truncate_prompt_tokens + + tokenize_async = make_async(tokenizer.__call__, + executor=self._tokenizer_executor) + prompt_inputs = await tokenize_async(text=query, + text_pair=doc, + **tokenization_kwargs) + + input_ids = prompt_inputs["input_ids"] + text_token_prompt = \ + self._validate_input(request, input_ids, request_prompt) + engine_prompt = TokensPrompt( + prompt_token_ids=text_token_prompt["prompt_token_ids"], + token_type_ids=prompt_inputs.get("token_type_ids")) + + request_prompts.append(request_prompt) + engine_prompts.append(engine_prompt) + + except ValueError as e: + logger.exception("Error in preprocessing prompt inputs") + return self.create_error_response(str(e)) + + # Schedule the request and get the result generator. + generators: List[AsyncGenerator[PoolingRequestOutput, None]] = [] + + try: + pooling_params = request.to_pooling_params() + + for i, engine_prompt in enumerate(engine_prompts): + request_id_item = f"{request_id}-{i}" + + self._log_inputs(request_id_item, + request_prompts[i], + params=pooling_params, + lora_request=lora_request, + prompt_adapter_request=prompt_adapter_request) + + trace_headers = (None if raw_request is None else await + self._get_trace_headers(raw_request.headers)) + + generator = self.engine_client.encode( + engine_prompt, + pooling_params, + request_id_item, + lora_request=lora_request, + trace_headers=trace_headers, + priority=request.priority, + ) + + generators.append(generator) + except ValueError as e: + # TODO: Use a vllm-specific Validation Error + return self.create_error_response(str(e)) + result_generator = merge_async_iterators(*generators) + + num_prompts = len(engine_prompts) + + # Non-streaming response + final_res_batch: List[Optional[PoolingRequestOutput]] + final_res_batch = [None] * num_prompts + + try: + async for i, res in result_generator: + final_res_batch[i] = res + + assert all(final_res is not None for final_res in final_res_batch) + + final_res_batch_checked = cast(List[PoolingRequestOutput], + final_res_batch) + + response = self.request_output_to_rerank_response( + final_res_batch_checked, request_id, model_name, documents, + top_n) + except asyncio.CancelledError: + return self.create_error_response("Client disconnected") + except ValueError as e: + # TODO: Use a vllm-specific Validation Error + return self.create_error_response(str(e)) + + return response + + def request_output_to_rerank_response( + self, final_res_batch: List[PoolingRequestOutput], request_id: str, + model_name: str, documents: List[str], + top_n: int) -> RerankResponse: + """ + Convert the output of do_rank to a RerankResponse + """ + results: List[RerankResult] = [] + num_prompt_tokens = 0 + for idx, final_res in enumerate(final_res_batch): + classify_res = ScoringRequestOutput.from_base(final_res) + + result = RerankResult( + index=idx, + document=RerankDocument(text=documents[idx]), + relevance_score=classify_res.outputs.score, + ) + results.append(result) + prompt_token_ids = final_res.prompt_token_ids + num_prompt_tokens += len(prompt_token_ids) + + # sort by relevance, then return the top n if set + results.sort(key=lambda x: x.relevance_score, reverse=True) + if top_n < len(documents): + results = results[:top_n] + + return RerankResponse( + 
id=request_id, + model=model_name, + results=results, + usage=RerankUsage(total_tokens=num_prompt_tokens)) diff --git a/vllm/envs.py b/vllm/envs.py index a6595cd6164d..a98448f1caa5 100644 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -19,6 +19,7 @@ VLLM_USE_ROCM_CUSTOM_PAGED_ATTN_FP8_OUT: bool = True VLLM_USE_ROCM_FP8_FLASH_ATTN: bool = False RANK: int = 0 + VLLM_FLASH_ATTN_VERSION: Optional[int] = None LOCAL_RANK: int = 0 CUDA_VISIBLE_DEVICES: Optional[str] = None VLLM_ENGINE_ITERATION_TIMEOUT_S: int = 60 @@ -78,8 +79,6 @@ VLLM_SKIP_P2P_CHECK: bool = False VLLM_DISABLED_KERNELS: List[str] = [] VLLM_USE_V1: bool = False - VLLM_SYNC_SERVER_ACCUM_REQUESTS: int = 1 - VLLM_SYNC_SERVER_ENGINE_STEPS_BETWEEN_POLLS: int = 1 VLLM_MOE_PADDING: bool = False VLLM_FP8_PADDING: bool = True VLLM_ENABLE_V1_MULTIPROCESSING: bool = True @@ -88,6 +87,8 @@ Q_SCALE_CONSTANT: int = 20 K_SCALE_CONSTANT: int = 20 V_SCALE_CONSTANT: int = 10 + VLLM_SERVER_DEV_MODE: bool = False + VLLM_V1_OUTPUT_PROC_CHUNK_SIZE: int = 128 def get_default_cache_root(): @@ -104,6 +105,12 @@ def get_default_config_root(): ) +def maybe_convert_int(value: Optional[str]) -> Optional[int]: + if value is None: + return None + return int(value) + + # The begin-* and end* here are used by the documentation generator # to extract the used env vars. @@ -232,6 +239,11 @@ def get_default_config_root(): lambda: (os.environ.get("VLLM_USE_TRITON_FLASH_ATTN", "True").lower() in ("true", "1")), + # Force vllm to use a specific flash-attention version (2 or 3), only valid + # when using the flash-attention backend. + "VLLM_FLASH_ATTN_VERSION": + lambda: maybe_convert_int(os.environ.get("VLLM_FLASH_ATTN_VERSION", None)), + # Internal flag to enable Dynamo fullgraph capture "VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE": lambda: bool( @@ -523,14 +535,6 @@ def get_default_config_root(): "VLLM_USE_V1": lambda: bool(int(os.getenv("VLLM_USE_V1", "0"))), - # Try to accumulate this many requests before proceeding - "VLLM_SYNC_SERVER_ACCUM_REQUESTS": - lambda: int(os.getenv("VLLM_SYNC_SERVER_ACCUM_REQUESTS", "1")), - - # Poll for new requests every this many steps - "VLLM_SYNC_SERVER_ENGINE_STEPS_BETWEEN_POLLS": - lambda: int(os.getenv("VLLM_SYNC_SERVER_ENGINE_STEPS_BETWEEN_POLLS", "1")), - # Pad the weight for moe kernel or not "VLLM_MOE_PADDING": lambda: bool(int(os.getenv("VLLM_MOE_PADDING", "0"))), @@ -552,7 +556,6 @@ def get_default_config_root(): # for FP8 KV Cache and attention "V_SCALE_CONSTANT": lambda: int(os.getenv("V_SCALE_CONSTANT", "10")), - # If set, enable multiprocessing in LLM for the V1 code path. "VLLM_ENABLE_V1_MULTIPROCESSING": lambda: bool(int(os.getenv("VLLM_ENABLE_V1_MULTIPROCESSING", "1"))), @@ -560,6 +563,22 @@ def get_default_config_root(): lambda: float(os.getenv("VLLM_LOG_BATCHSIZE_INTERVAL", "-1")), "VLLM_DISABLE_COMPILE_CACHE": lambda: bool(int(os.getenv("VLLM_DISABLE_COMPILE_CACHE", "0"))), + + # If set, vllm will run in development mode, which will enable + # some additional endpoints for developing and debugging, + # e.g. `/reset_prefix_cache` + "VLLM_SERVER_DEV_MODE": + lambda: bool(int(os.getenv("VLLM_SERVER_DEV_MODE", "0"))), + + # Controls the maximum number of requests to handle in a + # single asyncio task when processing per-token outputs in the + # V1 AsyncLLM interface. It is applicable when handling a high + # concurrency of streaming requests. + # Setting this too high can result in a higher variance of + # inter-message latencies. Setting it too low can negatively impact + # TTFT and overall throughput. 
+ "VLLM_V1_OUTPUT_PROC_CHUNK_SIZE": + lambda: int(os.getenv("VLLM_V1_OUTPUT_PROC_CHUNK_SIZE", "128")), } # end-env-vars-definition diff --git a/vllm/executor/executor_base.py b/vllm/executor/executor_base.py index 859e105f15d9..471d1bfac311 100644 --- a/vllm/executor/executor_base.py +++ b/vllm/executor/executor_base.py @@ -47,6 +47,7 @@ def __init__( self.prompt_adapter_config = vllm_config.prompt_adapter_config self.observability_config = vllm_config.observability_config self._init_executor() + self.is_sleeping = False @abstractmethod def _init_executor(self) -> None: @@ -193,6 +194,20 @@ def start_profile(self) -> None: def stop_profile(self) -> None: self.collective_rpc("stop_profile") + def sleep(self, level: int = 1): + if self.is_sleeping: + logger.warning("Executor is already sleeping.") + return + self.collective_rpc("sleep", kwargs=dict(level=level)) + self.is_sleeping = True + + def wake_up(self): + if not self.is_sleeping: + logger.warning("Executor is not sleeping.") + return + self.collective_rpc("wake_up") + self.is_sleeping = False + def save_sharded_state( self, path: str, diff --git a/vllm/inputs/data.py b/vllm/inputs/data.py index b8163a7acde1..57e85779dd58 100644 --- a/vllm/inputs/data.py +++ b/vllm/inputs/data.py @@ -9,7 +9,7 @@ if TYPE_CHECKING: from vllm.multimodal import (MultiModalDataDict, MultiModalKwargs, MultiModalPlaceholderDict) - from vllm.multimodal.inputs import MultiModalInputsV2 + from vllm.multimodal.inputs import MultiModalInputs class TextPrompt(TypedDict): @@ -207,7 +207,7 @@ def token_inputs( return inputs -DecoderOnlyInputs = Union[TokenInputs, "MultiModalInputsV2"] +DecoderOnlyInputs = Union[TokenInputs, "MultiModalInputs"] """ The inputs in :class:`~vllm.LLMEngine` before they are passed to the model executor. @@ -222,14 +222,14 @@ class EncoderDecoderInputs(TypedDict): This specifies the required data for encoder-decoder models. """ - encoder: Union[TokenInputs, "MultiModalInputsV2"] + encoder: Union[TokenInputs, "MultiModalInputs"] """The inputs for the encoder portion.""" - decoder: Union[TokenInputs, "MultiModalInputsV2"] + decoder: Union[TokenInputs, "MultiModalInputs"] """The inputs for the decoder portion.""" -SingletonInputs = Union[TokenInputs, "MultiModalInputsV2"] +SingletonInputs = Union[TokenInputs, "MultiModalInputs"] """ A processed :class:`SingletonPrompt` which can be passed to :class:`vllm.sequence.Sequence`. 
@@ -311,7 +311,7 @@ def multi_modal_hashes(self) -> List[str]: return inputs.get("multi_modal_hashes", []) if inputs["type"] == "multimodal": - # only the case when we use MultiModalInputsV2 + # only the case when we use MultiModalInputs return inputs.get("mm_hashes", []) # type: ignore[return-value] assert_never(inputs) # type: ignore[arg-type] diff --git a/vllm/inputs/preprocess.py b/vllm/inputs/preprocess.py index 0890883cc984..70372e0cad22 100644 --- a/vllm/inputs/preprocess.py +++ b/vllm/inputs/preprocess.py @@ -7,7 +7,7 @@ from vllm.logger import init_logger from vllm.lora.request import LoRARequest from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalRegistry -from vllm.multimodal.inputs import MultiModalDataDict, MultiModalInputsV2 +from vllm.multimodal.inputs import MultiModalDataDict, MultiModalInputs from vllm.prompt_adapter.request import PromptAdapterRequest from vllm.transformers_utils.tokenizer_group import BaseTokenizerGroup @@ -247,7 +247,7 @@ def _process_multimodal( mm_data: MultiModalDataDict, mm_processor_kwargs: Optional[Mapping[str, object]], lora_request: Optional[LoRARequest], - ) -> MultiModalInputsV2: + ) -> MultiModalInputs: """ Apply the model's multi-modal processor to a multi-modal prompt, returning the corresponding token IDs and metadata. @@ -271,7 +271,7 @@ async def _process_multimodal_async( mm_data: MultiModalDataDict, mm_processor_kwargs: Optional[Mapping[str, object]], lora_request: Optional[LoRARequest], - ) -> MultiModalInputsV2: + ) -> MultiModalInputs: """Async version of :meth:`_process_multimodal`.""" tokenizer_group = self.get_tokenizer_group() tokenizer = await tokenizer_group.get_lora_tokenizer_async(lora_request diff --git a/vllm/lora/models.py b/vllm/lora/models.py index 9809405ca9a6..b77b6b3d72ff 100644 --- a/vllm/lora/models.py +++ b/vllm/lora/models.py @@ -273,7 +273,8 @@ def from_local_checkpoint( new_embeddings_tensor_path) elif os.path.isfile(new_embeddings_bin_file_path): embeddings = torch.load(new_embeddings_bin_file_path, - map_location=device) + map_location=device, + weights_only=True) return cls.from_lora_tensors( lora_model_id=get_lora_id() diff --git a/vllm/model_executor/guided_decoding/utils.py b/vllm/model_executor/guided_decoding/utils.py index 20abaefbacc5..90dfa62ec467 100644 --- a/vllm/model_executor/guided_decoding/utils.py +++ b/vllm/model_executor/guided_decoding/utils.py @@ -20,6 +20,13 @@ def check_object(obj: dict) -> bool: ]): return True + # Check for array unsupported keywords + if obj.get("type") == "array" and any(key in obj for key in [ + "uniqueItems", "contains", "minContains", "maxContains", + "minItems", "maxItems" + ]): + return True + # Recursively check all nested objects and arrays for value in obj.values(): if isinstance(value, dict): diff --git a/vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json b/vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json index 66aa2600226d..b6f1d01f8865 100644 --- a/vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +++ b/vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json @@ -10,25 +10,25 @@ }, "2": { "BLOCK_SIZE_M": 16, - "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_N": 16, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, - "num_warps": 4, + "num_warps": 2, "num_stages": 2, "waves_per_eu": 0 }, "4": { "BLOCK_SIZE_M": 16, - "BLOCK_SIZE_N": 32, - 
"BLOCK_SIZE_K": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, - "num_warps": 2, + "num_warps": 4, "num_stages": 2, "waves_per_eu": 0 }, "8": { "BLOCK_SIZE_M": 16, - "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 2, @@ -49,24 +49,24 @@ "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, - "num_warps": 4, + "num_warps": 2, "num_stages": 2, "waves_per_eu": 0 }, "32": { "BLOCK_SIZE_M": 16, - "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 4, - "num_warps": 4, + "num_warps": 2, "num_stages": 2, "waves_per_eu": 0 }, "48": { "BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, - "BLOCK_SIZE_K": 256, - "GROUP_SIZE_M": 1, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 4, "num_warps": 4, "num_stages": 2, "waves_per_eu": 0 @@ -91,10 +91,10 @@ }, "128": { "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 4, - "num_warps": 4, + "num_warps": 8, "num_stages": 2, "waves_per_eu": 0 }, @@ -119,9 +119,9 @@ "1024": { "BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, - "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, - "num_warps": 8, + "num_warps": 4, "num_stages": 2, "waves_per_eu": 0 }, diff --git a/vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=AMD_Instinct_MI300X.json b/vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=AMD_Instinct_MI300X.json index 83be69c7e61f..022d5ece7f87 100644 --- a/vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=AMD_Instinct_MI300X.json +++ b/vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=AMD_Instinct_MI300X.json @@ -34,8 +34,8 @@ }, "8": { "BLOCK_SIZE_M": 16, - "BLOCK_SIZE_N": 32, - "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 2, "num_stages": 2, @@ -56,8 +56,8 @@ }, "24": { "BLOCK_SIZE_M": 16, - "BLOCK_SIZE_N": 64, - "BLOCK_SIZE_K": 64, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 2, "num_stages": 2, @@ -67,10 +67,10 @@ }, "32": { "BLOCK_SIZE_M": 16, - "BLOCK_SIZE_N": 64, - "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 4, - "num_warps": 4, + "num_warps": 2, "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, diff --git a/vllm/model_executor/layers/fused_moe/configs/E=8,N=16384,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json b/vllm/model_executor/layers/fused_moe/configs/E=8,N=16384,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json index 58f9e38f5221..0e5fd1eec77d 100644 --- a/vllm/model_executor/layers/fused_moe/configs/E=8,N=16384,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +++ b/vllm/model_executor/layers/fused_moe/configs/E=8,N=16384,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json @@ -101,7 +101,7 @@ "256": { "BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, - "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2, diff --git a/vllm/model_executor/layers/fused_moe/configs/E=8,N=16384,device_name=AMD_Instinct_MI300X.json b/vllm/model_executor/layers/fused_moe/configs/E=8,N=16384,device_name=AMD_Instinct_MI300X.json index 3ee1a5c267dc..d6ad63509f15 100644 --- a/vllm/model_executor/layers/fused_moe/configs/E=8,N=16384,device_name=AMD_Instinct_MI300X.json +++ b/vllm/model_executor/layers/fused_moe/configs/E=8,N=16384,device_name=AMD_Instinct_MI300X.json @@ -1,10 +1,10 @@ { "1": { "BLOCK_SIZE_M": 16, - "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_N": 16, "BLOCK_SIZE_K": 256, 
"GROUP_SIZE_M": 1, - "num_warps": 4, + "num_warps": 2, "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, @@ -24,9 +24,9 @@ "4": { "BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, - "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, - "num_warps": 4, + "num_warps": 2, "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, @@ -35,7 +35,7 @@ "8": { "BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, - "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 2, "num_stages": 2, @@ -45,14 +45,14 @@ }, "16": { "BLOCK_SIZE_M": 16, - "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_N": 16, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, - "num_warps": 2, + "num_warps": 1, "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, - "kpack": 1 + "kpack": 2 }, "24": { "BLOCK_SIZE_M": 16, @@ -78,10 +78,10 @@ }, "48": { "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 64, - "BLOCK_SIZE_K": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, - "num_warps": 2, + "num_warps": 4, "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, @@ -112,7 +112,7 @@ "128": { "BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, - "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 4, "num_warps": 8, "num_stages": 2, @@ -190,7 +190,7 @@ "BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, - "GROUP_SIZE_M": 8, + "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2, "waves_per_eu": 0, diff --git a/vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json b/vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json index 90d0e6f6ba3f..8323f512db01 100644 --- a/vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +++ b/vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json @@ -28,10 +28,10 @@ }, "8": { "BLOCK_SIZE_M": 16, - "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, - "num_warps": 4, + "num_warps": 8, "num_stages": 2, "waves_per_eu": 0 }, @@ -64,18 +64,18 @@ }, "48": { "BLOCK_SIZE_M": 16, - "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, - "num_warps": 4, + "num_warps": 8, "num_stages": 2, "waves_per_eu": 0 }, "64": { - "BLOCK_SIZE_M": 16, - "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 1, + "GROUP_SIZE_M": 4, "num_warps": 4, "num_stages": 2, "waves_per_eu": 0 @@ -92,7 +92,7 @@ "128": { "BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, - "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 4, "num_warps": 8, "num_stages": 2, diff --git a/vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=AMD_Instinct_MI300X.json b/vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=AMD_Instinct_MI300X.json index 193782905a72..de2320e4b28c 100644 --- a/vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=AMD_Instinct_MI300X.json +++ b/vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=AMD_Instinct_MI300X.json @@ -69,19 +69,19 @@ "BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 1, + "GROUP_SIZE_M": 4, "num_warps": 4, "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, - "kpack": 2 + "kpack": 1 }, "48": { "BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 4, - "num_warps": 4, + "num_warps": 2, "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 
16, @@ -100,9 +100,9 @@ }, "96": { "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 4, + "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2, "waves_per_eu": 0, @@ -113,7 +113,7 @@ "BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 1, + "GROUP_SIZE_M": 4, "num_warps": 8, "num_stages": 2, "waves_per_eu": 0, @@ -136,7 +136,7 @@ "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, - "num_warps": 4, + "num_warps": 8, "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, @@ -147,7 +147,7 @@ "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, - "num_warps": 4, + "num_warps": 8, "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, diff --git a/vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json b/vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json index 9d1b36acd64d..81bb765d3003 100644 --- a/vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +++ b/vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json @@ -1,7 +1,7 @@ { "1": { "BLOCK_SIZE_M": 16, - "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_N": 16, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, @@ -20,7 +20,7 @@ "4": { "BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, - "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2, @@ -28,10 +28,10 @@ }, "8": { "BLOCK_SIZE_M": 16, - "BLOCK_SIZE_N": 32, - "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, - "num_warps": 2, + "num_warps": 4, "num_stages": 2, "waves_per_eu": 0 }, @@ -49,7 +49,7 @@ "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, - "num_warps": 4, + "num_warps": 2, "num_stages": 2, "waves_per_eu": 0 }, @@ -58,7 +58,7 @@ "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, - "num_warps": 4, + "num_warps": 2, "num_stages": 2, "waves_per_eu": 0 }, @@ -81,8 +81,8 @@ "waves_per_eu": 0 }, "96": { - "BLOCK_SIZE_M": 16, - "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, @@ -90,18 +90,18 @@ "waves_per_eu": 0 }, "128": { - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, - "GROUP_SIZE_M": 4, - "num_warps": 8, + "GROUP_SIZE_M": 1, + "num_warps": 4, "num_stages": 2, "waves_per_eu": 0 }, "256": { "BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, - "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2, @@ -110,7 +110,7 @@ "512": { "BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, - "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2, @@ -126,11 +126,11 @@ "waves_per_eu": 0 }, "1536": { - "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, - "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, - "num_warps": 4, + "num_warps": 8, "num_stages": 2, "waves_per_eu": 0 }, diff --git a/vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=AMD_Instinct_MI300X.json b/vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=AMD_Instinct_MI300X.json index 2daaea099d09..811c77ab4109 100644 --- a/vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=AMD_Instinct_MI300X.json +++ 
b/vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=AMD_Instinct_MI300X.json @@ -8,7 +8,7 @@ "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, - "kpack": 2 + "kpack": 1 }, "2": { "BLOCK_SIZE_M": 16, @@ -23,10 +23,10 @@ }, "4": { "BLOCK_SIZE_M": 16, - "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, - "num_warps": 4, + "num_warps": 2, "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, @@ -44,11 +44,11 @@ "kpack": 2 }, "16": { - "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, - "num_warps": 4, + "num_warps": 8, "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, @@ -63,12 +63,12 @@ "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, - "kpack": 2 + "kpack": 1 }, "32": { "BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, - "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 4, "num_warps": 4, "num_stages": 2, @@ -81,7 +81,7 @@ "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, - "num_warps": 4, + "num_warps": 2, "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, @@ -92,7 +92,7 @@ "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 4, - "num_warps": 4, + "num_warps": 8, "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, @@ -100,25 +100,25 @@ }, "96": { "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 1, - "num_warps": 8, + "GROUP_SIZE_M": 4, + "num_warps": 4, "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, - "kpack": 2 + "kpack": 1 }, "128": { "BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, - "BLOCK_SIZE_K": 64, + "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 4, - "num_warps": 4, + "num_warps": 8, "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, - "kpack": 1 + "kpack": 2 }, "256": { "BLOCK_SIZE_M": 64, @@ -133,13 +133,13 @@ }, "512": { "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2, "waves_per_eu": 0, - "matrix_instr_nonkdim": 16, + "matrix_instr_nonkdim": 32, "kpack": 2 }, "1024": { diff --git a/vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json b/vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json index 9e28dade2cee..379ca107a946 100644 --- a/vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +++ b/vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json @@ -22,7 +22,7 @@ "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, - "num_warps": 2, + "num_warps": 4, "num_stages": 2, "waves_per_eu": 0 }, @@ -31,7 +31,7 @@ "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, - "num_warps": 4, + "num_warps": 2, "num_stages": 2, "waves_per_eu": 0 }, @@ -49,7 +49,7 @@ "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, - "num_warps": 2, + "num_warps": 4, "num_stages": 2, "waves_per_eu": 0 }, @@ -58,7 +58,7 @@ "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, - "num_warps": 2, + "num_warps": 4, "num_stages": 2, "waves_per_eu": 0 }, @@ -82,10 +82,10 @@ }, "96": { "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, - "num_warps": 2, + "num_warps": 4, "num_stages": 2, "waves_per_eu": 0 }, @@ -121,23 +121,23 @@ "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, 
"GROUP_SIZE_M": 1, - "num_warps": 8, + "num_warps": 4, "num_stages": 2, "waves_per_eu": 0 }, "1536": { - "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, - "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, - "num_warps": 4, + "num_warps": 8, "num_stages": 2, "waves_per_eu": 0 }, "2048": { "BLOCK_SIZE_M": 128, - "BLOCK_SIZE_N": 256, - "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2, @@ -154,10 +154,10 @@ }, "4096": { "BLOCK_SIZE_M": 128, - "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, - "num_warps": 8, + "num_warps": 4, "num_stages": 2, "waves_per_eu": 0 } diff --git a/vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=AMD_Instinct_MI300X.json b/vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=AMD_Instinct_MI300X.json index f885cd13a4ad..5a3f415d5414 100644 --- a/vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=AMD_Instinct_MI300X.json +++ b/vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=AMD_Instinct_MI300X.json @@ -1,14 +1,14 @@ { "1": { "BLOCK_SIZE_M": 16, - "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, - "num_warps": 2, + "num_warps": 4, "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, - "kpack": 1 + "kpack": 2 }, "2": { "BLOCK_SIZE_M": 16, @@ -34,10 +34,10 @@ }, "8": { "BLOCK_SIZE_M": 16, - "BLOCK_SIZE_N": 64, - "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, - "num_warps": 4, + "num_warps": 2, "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, @@ -56,8 +56,8 @@ }, "24": { "BLOCK_SIZE_M": 16, - "BLOCK_SIZE_N": 64, - "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 2, "num_stages": 2, @@ -67,21 +67,21 @@ }, "32": { "BLOCK_SIZE_M": 16, - "BLOCK_SIZE_N": 64, - "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 4, "num_warps": 2, "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, - "kpack": 2 + "kpack": 1 }, "48": { - "BLOCK_SIZE_M": 16, - "BLOCK_SIZE_N": 128, - "BLOCK_SIZE_K": 64, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, - "num_warps": 8, + "num_warps": 2, "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, @@ -107,18 +107,18 @@ "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, - "kpack": 1 + "kpack": 2 }, "128": { "BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, - "BLOCK_SIZE_K": 64, + "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, - "num_warps": 4, + "num_warps": 8, "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, - "kpack": 1 + "kpack": 2 }, "256": { "BLOCK_SIZE_M": 64, diff --git a/vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json b/vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json index a971953062cf..48bb5f2ccb8e 100644 --- a/vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +++ b/vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json @@ -10,10 +10,10 @@ }, "2": { "BLOCK_SIZE_M": 16, - "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, - "num_warps": 4, + "num_warps": 8, "num_stages": 2, "waves_per_eu": 0 }, @@ -57,35 +57,35 @@ "BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 
64, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 4, - "num_warps": 4, + "GROUP_SIZE_M": 1, + "num_warps": 2, "num_stages": 2, "waves_per_eu": 0 }, "48": { "BLOCK_SIZE_M": 16, - "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, - "num_warps": 4, + "num_warps": 8, "num_stages": 2, "waves_per_eu": 0 }, "64": { "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 128, - "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, - "num_warps": 4, + "num_warps": 2, "num_stages": 2, "waves_per_eu": 0 }, "96": { "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 64, - "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, - "num_warps": 2, + "num_warps": 4, "num_stages": 2, "waves_per_eu": 0 }, @@ -117,11 +117,11 @@ "waves_per_eu": 0 }, "1024": { - "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, - "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, - "num_warps": 4, + "num_warps": 8, "num_stages": 2, "waves_per_eu": 0 }, @@ -137,9 +137,9 @@ "2048": { "BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, - "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, - "num_warps": 8, + "num_warps": 4, "num_stages": 2, "waves_per_eu": 0 }, diff --git a/vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=AMD_Instinct_MI300X.json b/vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=AMD_Instinct_MI300X.json index 4edf28f3e7c2..a64d06c6d172 100644 --- a/vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=AMD_Instinct_MI300X.json +++ b/vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=AMD_Instinct_MI300X.json @@ -23,14 +23,14 @@ }, "4": { "BLOCK_SIZE_M": 16, - "BLOCK_SIZE_N": 64, - "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, - "kpack": 2 + "kpack": 1 }, "8": { "BLOCK_SIZE_M": 16, @@ -45,8 +45,8 @@ }, "16": { "BLOCK_SIZE_M": 16, - "BLOCK_SIZE_N": 64, - "BLOCK_SIZE_K": 64, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2, @@ -55,15 +55,15 @@ "kpack": 2 }, "24": { - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 64, - "BLOCK_SIZE_K": 64, + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, - "kpack": 2 + "kpack": 1 }, "32": { "BLOCK_SIZE_M": 16, @@ -85,13 +85,13 @@ "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, - "kpack": 1 + "kpack": 2 }, "64": { "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 64, - "BLOCK_SIZE_K": 64, - "GROUP_SIZE_M": 1, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 4, "num_warps": 4, "num_stages": 2, "waves_per_eu": 0, @@ -121,14 +121,14 @@ "kpack": 1 }, "256": { - "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, - "BLOCK_SIZE_K": 64, - "GROUP_SIZE_M": 4, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2, "waves_per_eu": 0, - "matrix_instr_nonkdim": 16, + "matrix_instr_nonkdim": 32, "kpack": 2 }, "512": { diff --git a/vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json b/vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json index 1bad9550f060..bd2c6fbc1b94 100644 --- a/vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +++ 
b/vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json @@ -28,44 +28,44 @@ }, "8": { "BLOCK_SIZE_M": 16, - "BLOCK_SIZE_N": 128, - "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2, "waves_per_eu": 0 }, "16": { - "BLOCK_SIZE_M": 16, - "BLOCK_SIZE_N": 128, - "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, - "num_warps": 4, + "num_warps": 2, "num_stages": 2, "waves_per_eu": 0 }, "24": { - "BLOCK_SIZE_M": 16, - "BLOCK_SIZE_N": 128, - "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, - "num_warps": 4, + "num_warps": 2, "num_stages": 2, "waves_per_eu": 0 }, "32": { - "BLOCK_SIZE_M": 16, - "BLOCK_SIZE_N": 128, - "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 4, - "num_warps": 4, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 2, "num_stages": 2, "waves_per_eu": 0 }, "48": { - "BLOCK_SIZE_M": 16, - "BLOCK_SIZE_N": 64, - "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2, @@ -73,25 +73,25 @@ }, "64": { "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 128, - "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 4, - "num_warps": 4, + "num_warps": 2, "num_stages": 2, "waves_per_eu": 0 }, "96": { - "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, - "num_warps": 2, + "num_warps": 4, "num_stages": 2, "waves_per_eu": 0 }, "128": { "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 4, "num_warps": 4, @@ -119,16 +119,16 @@ "1024": { "BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, - "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, - "num_warps": 8, + "num_warps": 4, "num_stages": 2, "waves_per_eu": 0 }, "1536": { - "BLOCK_SIZE_M": 256, - "BLOCK_SIZE_N": 256, - "BLOCK_SIZE_K": 64, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2, @@ -153,9 +153,9 @@ "waves_per_eu": 0 }, "4096": { - "BLOCK_SIZE_M": 256, + "BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, - "BLOCK_SIZE_K": 64, + "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2, diff --git a/vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=AMD_Instinct_MI300X.json b/vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=AMD_Instinct_MI300X.json index f6d70ae78eab..8dec5e3afaba 100644 --- a/vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=AMD_Instinct_MI300X.json +++ b/vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=AMD_Instinct_MI300X.json @@ -1,7 +1,7 @@ { "1": { "BLOCK_SIZE_M": 16, - "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 2, @@ -12,36 +12,36 @@ }, "2": { "BLOCK_SIZE_M": 16, - "BLOCK_SIZE_N": 64, - "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, - "num_warps": 4, + "num_warps": 2, "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, - "kpack": 1 + "kpack": 2 }, "4": { "BLOCK_SIZE_M": 16, - "BLOCK_SIZE_N": 64, - "BLOCK_SIZE_K": 64, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, - "num_warps": 4, + "num_warps": 2, "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, - "kpack": 1 + 
"kpack": 2 }, "8": { "BLOCK_SIZE_M": 16, - "BLOCK_SIZE_N": 64, - "BLOCK_SIZE_K": 64, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, - "num_warps": 2, + "num_warps": 1, "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, - "kpack": 1 + "kpack": 2 }, "16": { "BLOCK_SIZE_M": 16, @@ -52,14 +52,14 @@ "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, - "kpack": 2 + "kpack": 1 }, "24": { "BLOCK_SIZE_M": 16, - "BLOCK_SIZE_N": 64, - "BLOCK_SIZE_K": 64, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, - "num_warps": 2, + "num_warps": 1, "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, @@ -67,8 +67,8 @@ }, "32": { "BLOCK_SIZE_M": 16, - "BLOCK_SIZE_N": 64, - "BLOCK_SIZE_K": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 4, "num_warps": 2, "num_stages": 2, @@ -77,11 +77,11 @@ "kpack": 2 }, "48": { - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 128, - "BLOCK_SIZE_K": 64, + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, - "num_warps": 4, + "num_warps": 2, "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, @@ -96,7 +96,7 @@ "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, - "kpack": 2 + "kpack": 1 }, "96": { "BLOCK_SIZE_M": 32, @@ -114,7 +114,7 @@ "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 4, - "num_warps": 4, + "num_warps": 8, "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, @@ -187,7 +187,7 @@ "kpack": 2 }, "4096": { - "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, diff --git a/vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json b/vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json index 91260051a533..cd4fb8f11b93 100644 --- a/vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +++ b/vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json @@ -13,7 +13,7 @@ "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, - "num_warps": 8, + "num_warps": 4, "num_stages": 2, "waves_per_eu": 0 }, @@ -40,7 +40,7 @@ "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, - "num_warps": 2, + "num_warps": 1, "num_stages": 2, "waves_per_eu": 0 }, @@ -73,37 +73,37 @@ }, "64": { "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 1, - "num_warps": 2, + "GROUP_SIZE_M": 4, + "num_warps": 1, "num_stages": 2, "waves_per_eu": 0 }, "96": { - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 128, - "BLOCK_SIZE_K": 256, - "GROUP_SIZE_M": 4, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2, "waves_per_eu": 0 }, "128": { "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 4, - "num_warps": 4, + "num_warps": 2, "num_stages": 2, "waves_per_eu": 0 }, "256": { "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 128, - "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, - "num_warps": 8, + "num_warps": 4, "num_stages": 2, "waves_per_eu": 0 }, @@ -118,17 +118,17 @@ }, "1024": { "BLOCK_SIZE_M": 128, - "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, - "num_warps": 4, + "num_warps": 8, "num_stages": 2, "waves_per_eu": 0 }, "1536": { "BLOCK_SIZE_M": 128, - "BLOCK_SIZE_N": 128, 
- "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2, diff --git a/vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=AMD_Instinct_MI300X.json b/vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=AMD_Instinct_MI300X.json index d6220f55015d..cf66868e9d57 100644 --- a/vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=AMD_Instinct_MI300X.json +++ b/vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=AMD_Instinct_MI300X.json @@ -1,14 +1,14 @@ { "1": { "BLOCK_SIZE_M": 16, - "BLOCK_SIZE_N": 64, - "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, - "num_warps": 4, + "num_warps": 1, "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, - "kpack": 2 + "kpack": 1 }, "2": { "BLOCK_SIZE_M": 16, @@ -34,19 +34,19 @@ }, "8": { "BLOCK_SIZE_M": 16, - "BLOCK_SIZE_N": 64, - "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, - "num_warps": 4, + "num_warps": 1, "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, - "kpack": 2 + "kpack": 1 }, "16": { "BLOCK_SIZE_M": 16, - "BLOCK_SIZE_N": 64, - "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 2, "num_stages": 2, @@ -77,11 +77,11 @@ "kpack": 2 }, "48": { - "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, - "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, - "num_warps": 2, + "num_warps": 8, "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, @@ -117,15 +117,15 @@ "num_warps": 4, "num_stages": 2, "waves_per_eu": 0, - "matrix_instr_nonkdim": 32, + "matrix_instr_nonkdim": 16, "kpack": 2 }, "256": { "BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, - "GROUP_SIZE_M": 4, - "num_warps": 8, + "GROUP_SIZE_M": 1, + "num_warps": 4, "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py index 811f52d72aeb..542c27ebe2ea 100644 --- a/vllm/model_executor/layers/fused_moe/layer.py +++ b/vllm/model_executor/layers/fused_moe/layer.py @@ -40,7 +40,7 @@ class FusedMoEMethodBase(QuantizeMethodBase): @abstractmethod def create_weights(self, layer: torch.nn.Module, num_experts: int, - hidden_size: int, intermediate_size: int, + hidden_size: int, intermediate_size_per_partition: int, params_dtype: torch.dtype, **extra_weight_attrs): raise NotImplementedError @@ -67,22 +67,24 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp): """MoE method without quantization.""" def create_weights(self, layer: torch.nn.Module, num_experts: int, - hidden_size: int, intermediate_size: int, + hidden_size: int, intermediate_size_per_partition: int, params_dtype: torch.dtype, **extra_weight_attrs): # Fused gate_up_proj (column parallel) - w13_weight = torch.nn.Parameter(torch.empty(num_experts, - 2 * intermediate_size, - hidden_size, - dtype=params_dtype), + w13_weight = torch.nn.Parameter(torch.empty( + num_experts, + 2 * intermediate_size_per_partition, + hidden_size, + dtype=params_dtype), requires_grad=False) layer.register_parameter("w13_weight", w13_weight) set_weight_attrs(w13_weight, extra_weight_attrs) # down_proj (row parallel) - w2_weight = torch.nn.Parameter(torch.empty(num_experts, - hidden_size, - intermediate_size, - dtype=params_dtype), + w2_weight = torch.nn.Parameter(torch.empty( + num_experts, + hidden_size, + intermediate_size_per_partition, + 
dtype=params_dtype), requires_grad=False) layer.register_parameter("w2_weight", w2_weight) set_weight_attrs(w2_weight, extra_weight_attrs) @@ -302,13 +304,20 @@ def __init__( self.quant_method = quant_config.get_quant_method(self, prefix) assert self.quant_method is not None - self.quant_method.create_weights( - layer=self, - num_experts=num_experts, - hidden_size=hidden_size, - intermediate_size=self.intermediate_size_per_partition, - params_dtype=params_dtype, - weight_loader=self.weight_loader) + moe_quant_params = { + "num_experts": num_experts, + "hidden_size": hidden_size, + "intermediate_size_per_partition": + self.intermediate_size_per_partition, + "params_dtype": params_dtype, + "weight_loader": self.weight_loader, + } + # need full intermediate size pre-sharding for WNA16 act order + if (self.quant_method.__class__.__name__ == + "CompressedTensorsWNA16MoEMethod"): + moe_quant_params["intermediate_size_full"] = intermediate_size + + self.quant_method.create_weights(layer=self, **moe_quant_params) def _load_per_tensor_weight_scale(self, shard_id: str, param: torch.nn.Parameter, @@ -325,19 +334,30 @@ def _load_per_tensor_weight_scale(self, shard_id: str, elif shard_id == "w2": param_data[expert_id] = loaded_weight - def _load_model_weight_or_group_weight_scale(self, shard_dim: int, + def _load_model_weight_or_group_weight_scale(self, + shard_dim: int, expert_data: torch.Tensor, shard_id: str, loaded_weight: torch.Tensor, - tp_rank: int): - # Load grouped weight scales for group quantization - # or model weights + tp_rank: int, + load_full_w2: bool = False): + """ + Load grouped weight scales for group quantization or model weights + :param shard_dim: dimension to shard + :param expert_data: parameter for a particular expert + :param shard_id: either w1, w2, or w3 + :param loaded_weight: checkpoint weight to load into the param + :param tp_rank: tensor parallel rank + :param load_full_w2: whether or not the w2 loaded should be sharded. + """ if shard_id == "w2": - self._load_w2(shard_id=shard_id, - shard_dim=shard_dim, + # In the case where we have actorder/g_idx, we do not partition the + # w2 scales, as indicated by `load_full` argument, for all tp cases + self._load_w2(shard_dim=shard_dim, loaded_weight=loaded_weight, expert_data=expert_data, - tp_rank=tp_rank) + tp_rank=tp_rank, + load_full=load_full_w2) elif shard_id in ("w1", "w3"): self._load_w13(shard_id=shard_id, shard_dim=shard_dim, @@ -377,15 +397,21 @@ def _load_w13(self, expert_data: torch.Tensor, shard_dim: int, expert_data = expert_data.narrow(shard_dim, shard_size, shard_size) expert_data.copy_(loaded_weight) - def _load_w2(self, expert_data: torch.Tensor, shard_dim: int, - shard_id: str, loaded_weight: torch.Tensor, tp_rank: int): + def _load_w2(self, + expert_data: torch.Tensor, + shard_dim: int, + loaded_weight: torch.Tensor, + tp_rank: int, + load_full: bool = False): # Index the loaded weight for tp sharding. # down_proj: "RowParallel" so tp sharding on input_dim # Narrow parameter and load. shard_size = expert_data.shape[shard_dim] - loaded_weight = loaded_weight.narrow(shard_dim, shard_size * tp_rank, - shard_size) + if not load_full: + loaded_weight = loaded_weight.narrow(shard_dim, + shard_size * tp_rank, + shard_size) # w2, down_proj: Load into only logical weight of w2. 
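Immediately below, the (possibly narrowed) checkpoint slice is copied into the per-rank parameter. As a toy illustration of what the `narrow` call above selects for a given tensor-parallel rank; the shapes, the `tp_rank` value, and the tensors here are invented for illustration and are not taken from the patch:

```python
import torch

# Invented toy shapes: one expert's w2 is [hidden_size=8, intermediate=16],
# split across tp_size=2 ranks along the input (intermediate) dimension.
tp_rank = 1
shard_dim = 1
loaded_weight = torch.arange(8 * 16, dtype=torch.float32).reshape(8, 16)
expert_data = torch.empty(8, 8)          # this rank's shard of w2
shard_size = expert_data.shape[shard_dim]

load_full = False                        # True only for act-order w2 scales,
                                         # per the comments above
if not load_full:
    # Same call as in _load_w2: take this rank's contiguous slice.
    loaded_weight = loaded_weight.narrow(shard_dim, shard_size * tp_rank,
                                         shard_size)
expert_data.copy_(loaded_weight)
print(expert_data.shape)                 # torch.Size([8, 8])
```

With `load_full=True` the full-width tensor would be copied instead, which is why the destination parameter for act-order w2 scales is allocated at the unsharded size in the compressed-tensors MoE method further down.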
expert_data.copy_(loaded_weight) @@ -400,8 +426,7 @@ def _load_g_idx(self, shard_id: str, expert_data: torch.Tensor, shard_dim: int, loaded_weight: torch.Tensor, tp_rank: int): if shard_id == "w2": - self._load_w2(shard_id=shard_id, - shard_dim=shard_dim, + self._load_w2(shard_dim=shard_dim, loaded_weight=loaded_weight, expert_data=expert_data, tp_rank=tp_rank) @@ -429,7 +454,7 @@ def weight_loader(self, param: torch.nn.Parameter, ] # Fetch the dim to shard the parameter/loaded weight # based on the shard id. This will be whatever - # dimension intermediate_size is used. + # dimension intermediate_size_per_partition is used. SHARD_ID_TO_SHARDED_DIM = {"w1": 0, "w2": 1, "w3": 0} expert_data = param.data[expert_id] @@ -437,11 +462,11 @@ def weight_loader(self, param: torch.nn.Parameter, # is_transposed: if the dim to shard the weight # should be flipped. Required by GPTQ, compressed-tensors - # should be whatever dimension intermediate_size is + # should be whatever dimension intermediate_size_per_partition is is_transposed = getattr(param, "is_transposed", False) shard_dim = SHARD_ID_TO_SHARDED_DIM[shard_id] if is_transposed: - shard_dim = ~shard_dim + shard_dim = int(not shard_dim) # Case input scale: input_scale loading is only supported for fp8 if "input_scale" in weight_name: @@ -493,7 +518,8 @@ def weight_loader(self, param: torch.nn.Parameter, shard_dim=shard_dim, loaded_weight=loaded_weight, expert_data=expert_data, - tp_rank=tp_rank) + tp_rank=tp_rank, + load_full_w2=getattr(param, "load_full_w2", False)) elif quant_method == FusedMoeWeightScaleSupported.TENSOR.value: self._load_per_tensor_weight_scale(shard_id=shard_id, param=param, diff --git a/vllm/model_executor/layers/quantization/awq_marlin.py b/vllm/model_executor/layers/quantization/awq_marlin.py index c28fd0c6737e..0c3c9816878e 100644 --- a/vllm/model_executor/layers/quantization/awq_marlin.py +++ b/vllm/model_executor/layers/quantization/awq_marlin.py @@ -303,7 +303,7 @@ def __init__(self, quant_config: AWQMarlinConfig): self.quant_config = quant_config def create_weights(self, layer: torch.nn.Module, num_experts: int, - hidden_size: int, intermediate_size: int, + hidden_size: int, intermediate_size_per_partition: int, params_dtype: torch.dtype, **extra_weight_attrs): extra_weight_attrs.update({ "is_transposed": @@ -312,17 +312,18 @@ def create_weights(self, layer: torch.nn.Module, num_experts: int, FusedMoeWeightScaleSupported.GROUP.value, }) - w13_qweight = Parameter(torch.empty(num_experts, - hidden_size, - 2 * intermediate_size // - self.quant_config.pack_factor, - dtype=torch.int32), - requires_grad=False) + w13_qweight = Parameter( + torch.empty(num_experts, + hidden_size, + 2 * intermediate_size_per_partition // + self.quant_config.pack_factor, + dtype=torch.int32), + requires_grad=False) layer.register_parameter("w13_qweight", w13_qweight) set_weight_attrs(w13_qweight, extra_weight_attrs) w2_qweight = Parameter(torch.empty(num_experts, - intermediate_size, + intermediate_size_per_partition, hidden_size // self.quant_config.pack_factor, dtype=torch.int32), @@ -331,13 +332,14 @@ def create_weights(self, layer: torch.nn.Module, num_experts: int, set_weight_attrs(w2_qweight, extra_weight_attrs) num_groups_w13 = hidden_size // self.quant_config.group_size - num_groups_w2 = intermediate_size // self.quant_config.group_size + num_groups_w2 = (intermediate_size_per_partition // + self.quant_config.group_size) # WEIGHT_SCALES # Allocate 2 scales for w1 and w3 respectively. 
w13_scales = Parameter(torch.empty(num_experts, num_groups_w13, - intermediate_size * 2, + intermediate_size_per_partition * 2, dtype=params_dtype), requires_grad=False) layer.register_parameter("w13_scales", w13_scales) @@ -353,12 +355,13 @@ def create_weights(self, layer: torch.nn.Module, num_experts: int, # WEIGHT_ZERO_POINT # Allocate 2 zero points for w1 and w3 respectively. - w13_qzeros = Parameter(torch.empty(num_experts, - num_groups_w13, - 2 * intermediate_size // - self.quant_config.pack_factor, - dtype=torch.int32), - requires_grad=False) + w13_qzeros = Parameter( + torch.empty(num_experts, + num_groups_w13, + 2 * intermediate_size_per_partition // + self.quant_config.pack_factor, + dtype=torch.int32), + requires_grad=False) layer.register_parameter("w13_qzeros", w13_qzeros) set_weight_attrs(w13_qzeros, extra_weight_attrs) diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py index 0e564a070186..48214393381f 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py @@ -9,6 +9,7 @@ QuantizationType) from pydantic import BaseModel +from vllm.logger import init_logger from vllm.model_executor.layers.fused_moe import FusedMoE from vllm.model_executor.layers.linear import (LinearBase, LinearMethodBase, UnquantizedLinearMethod) @@ -27,6 +28,8 @@ from vllm.model_executor.layers.quantization.kv_cache import BaseKVCacheMethod from vllm.platforms import current_platform +logger = init_logger(__name__) + __all__ = ["CompressedTensorsLinearMethod"] SPARSITY_CONFIG_NAME: Literal["sparsity_config"] = "sparsity_config" @@ -79,6 +82,8 @@ def get_quant_method( return UnquantizedLinearMethod() if isinstance(layer, LinearBase): scheme = self.get_scheme(layer=layer, layer_name=prefix) + if scheme is None: + return UnquantizedLinearMethod() layer.scheme = scheme return CompressedTensorsLinearMethod(self) if isinstance(layer, Attention): @@ -340,10 +345,10 @@ def _get_scheme_from_parts( raise NotImplementedError( "No compressed-tensors compatible scheme was found.") - def get_scheme( - self, - layer: torch.nn.Module, - layer_name: Optional[str] = None) -> "CompressedTensorsScheme": + def get_scheme(self, + layer: torch.nn.Module, + layer_name: Optional[str] = None + ) -> Optional["CompressedTensorsScheme"]: """ compressed-tensors supports non uniform in the following way: @@ -353,10 +358,7 @@ def get_scheme( which can be a full layer_name, a regex for a layer_name, or an nn.Module name. - We first check whether a layer is in the ignore group and use - CompressedTensorsUnquantized (i.e. fp16/bf16) scheme for the layer - - We then detect whether a layer_name is found in any target and + Detect whether a layer_name is found in any target and use the quantization scheme corresponding to the matched target to select the CompressedTensorsScheme used for infernece. """ @@ -394,6 +396,13 @@ def get_scheme( if self.supports_cutlass_24(weight_quant=weight_quant, input_quant=input_quant, sparsity_scheme=sparsity_scheme): + # FIXME(tlrmchlsmth): layers using W16A16 CUTLASS 2:4 sparse kernels + # currently produce bad output in some cases + if weight_quant is None: + logger.warning_once( + "CompressedTensors24 scheme is disabled for the w16a16 " + "case. 
Falling back to UnquantizedLinearMethod") + return None # Have a valid sparsity scheme # Validate layer is supported by Cutlass 2:4 Kernel scheme = CompressedTensors24(quantized=weight_quant is not None diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py index 4fb8fd84e92d..e1c45f4e42e4 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py @@ -13,6 +13,7 @@ FusedMoeWeightScaleSupported) from vllm.model_executor.layers.quantization.compressed_tensors.schemes import ( WNA16_SUPPORTED_BITS) +from vllm.model_executor.layers.quantization.utils import replace_parameter from vllm.model_executor.layers.quantization.utils.w8a8_utils import ( all_close_1d, normalize_e4m3fn_to_e4m3fnuz, per_tensor_dequantize) from vllm.model_executor.utils import set_weight_attrs @@ -75,24 +76,26 @@ def __init__( self.static_input_scales = not self.input_quant.dynamic def create_weights(self, layer: torch.nn.Module, num_experts: int, - hidden_size: int, intermediate_size: int, + hidden_size: int, intermediate_size_per_partition: int, params_dtype: torch.dtype, **extra_weight_attrs): params_dtype = torch.float8_e4m3fn # WEIGHTS - w13_weight = torch.nn.Parameter(torch.empty(num_experts, - 2 * intermediate_size, - hidden_size, - dtype=params_dtype), + w13_weight = torch.nn.Parameter(torch.empty( + num_experts, + 2 * intermediate_size_per_partition, + hidden_size, + dtype=params_dtype), requires_grad=False) layer.register_parameter("w13_weight", w13_weight) set_weight_attrs(w13_weight, extra_weight_attrs) - w2_weight = torch.nn.Parameter(torch.empty(num_experts, - hidden_size, - intermediate_size, - dtype=params_dtype), + w2_weight = torch.nn.Parameter(torch.empty( + num_experts, + hidden_size, + intermediate_size_per_partition, + dtype=params_dtype), requires_grad=False) layer.register_parameter("w2_weight", w2_weight) set_weight_attrs(w2_weight, extra_weight_attrs) @@ -254,6 +257,7 @@ def __init__( self.packed_factor = 32 // config.num_bits self.strategy = config.strategy self.group_size = config.group_size + self.actorder = config.actorder assert config.symmetric, ( "Only symmetric quantization is supported for MoE") @@ -266,9 +270,16 @@ def __init__( f"{WNA16_SUPPORTED_BITS}") def create_weights(self, layer: torch.nn.Module, num_experts: int, - hidden_size: int, intermediate_size: int, + hidden_size: int, intermediate_size_per_partition: int, params_dtype: torch.dtype, **extra_weight_attrs): + assert params_dtype == torch.float16, ( + "float16 is required for MoE compressed models. Set dtype=torch.float16" # noqa: E501 + ) + + intermediate_size_full = extra_weight_attrs.pop( + "intermediate_size_full") + # Will transpose the loaded weight along the # intermediate and hidden dim sizes. 
Will # shard for TP along the transposed dims @@ -276,35 +287,45 @@ def create_weights(self, layer: torch.nn.Module, num_experts: int, "is_transposed": True, "quant_method": self.strategy }) - w13_weight = torch.nn.Parameter(torch.empty(num_experts, - hidden_size // - self.packed_factor, - 2 * intermediate_size, - dtype=torch.int32), + w13_weight = torch.nn.Parameter(torch.empty( + num_experts, + hidden_size // self.packed_factor, + 2 * intermediate_size_per_partition, + dtype=torch.int32), requires_grad=False) layer.register_parameter("w13_weight_packed", w13_weight) set_weight_attrs(w13_weight, extra_weight_attrs) - w2_weight = torch.nn.Parameter(torch.empty(num_experts, - intermediate_size // - self.packed_factor, - hidden_size, - dtype=torch.int32), + w2_weight = torch.nn.Parameter(torch.empty( + num_experts, + intermediate_size_per_partition // self.packed_factor, + hidden_size, + dtype=torch.int32), requires_grad=False) layer.register_parameter("w2_weight_packed", w2_weight) set_weight_attrs(w2_weight, extra_weight_attrs) + # In the case where we have actorder/g_idx, + # we do not partition the w2 scales + load_full_w2 = self.actorder and self.group_size != -1 + w2_scales_size = (intermediate_size_full + if load_full_w2 else intermediate_size_per_partition) + + self.is_k_full = (not self.actorder) or ( + intermediate_size_per_partition == intermediate_size_full) + if self.strategy == "channel": num_groups_w2 = num_groups_w13 = 1 self.group_size = -1 else: - num_groups_w2 = intermediate_size // self.group_size + num_groups_w2 = w2_scales_size // self.group_size num_groups_w13 = hidden_size // self.group_size - w13_scale = torch.nn.Parameter(torch.ones(num_experts, - num_groups_w13, - 2 * intermediate_size, - dtype=params_dtype), + w13_scale = torch.nn.Parameter(torch.ones( + num_experts, + num_groups_w13, + 2 * intermediate_size_per_partition, + dtype=params_dtype), requires_grad=False) layer.register_parameter("w13_weight_scale", w13_scale) set_weight_attrs(w13_scale, extra_weight_attrs) @@ -316,6 +337,7 @@ def create_weights(self, layer: torch.nn.Module, num_experts: int, requires_grad=False) layer.register_parameter("w2_weight_scale", w2_scale) set_weight_attrs(w2_scale, extra_weight_attrs) + set_weight_attrs(w2_scale, {"load_full_w2": load_full_w2}) w2_weight_shape = torch.nn.Parameter(torch.empty(num_experts, 2), requires_grad=False) @@ -335,18 +357,18 @@ def create_weights(self, layer: torch.nn.Module, num_experts: int, ), requires_grad=False, ) - layer.register_parameter("w13_g_idx", w13_g_idx) + layer.register_parameter("w13_weight_g_idx", w13_g_idx) set_weight_attrs(w13_g_idx, extra_weight_attrs) w2_g_idx = torch.nn.Parameter( torch.empty( num_experts, - intermediate_size, + intermediate_size_per_partition, dtype=torch.int32, ), requires_grad=False, ) - layer.register_parameter("w2_g_idx", w2_g_idx) + layer.register_parameter("w2_weight_g_idx", w2_g_idx) set_weight_attrs(w2_g_idx, extra_weight_attrs) w13_g_idx_sort_indices = torch.nn.Parameter( @@ -364,7 +386,7 @@ def create_weights(self, layer: torch.nn.Module, num_experts: int, w2_g_idx_sort_indices = torch.nn.Parameter( torch.empty( num_experts, - intermediate_size, + intermediate_size_per_partition, dtype=torch.int32, ), requires_grad=False, @@ -422,24 +444,55 @@ def marlin_moe_permute_scales(s: torch.Tensor, size_k: int, size_k2 = layer.w2_weight_packed.shape[2] size_k13 = layer.w13_weight_packed.shape[2] - num_experts = layer.w13_g_idx.shape[0] - device = layer.w13_g_idx.device - layer.w13_g_idx = torch.nn.Parameter( - 
torch.empty((num_experts, 0), dtype=torch.int32, device=device), - requires_grad=False, - ) - layer.w2_g_idx = torch.nn.Parameter( - torch.empty((num_experts, 0), dtype=torch.int32, device=device), - requires_grad=False, - ) - layer.w13_g_idx_sort_indices = torch.nn.Parameter( - torch.empty((num_experts, 0), dtype=torch.int32, device=device), - requires_grad=False, - ) - layer.w2_g_idx_sort_indices = torch.nn.Parameter( - torch.empty((num_experts, 0), dtype=torch.int32, device=device), - requires_grad=False, - ) + num_experts = layer.w13_weight_g_idx.shape[0] + device = layer.w13_weight_g_idx.device + + # when running models with grouped act order, + # resort to g_idx values provided in checkpoint + if self.actorder == "group": + w13_g_idx_sort_indices = torch.empty_like(layer.w13_weight_g_idx) + w2_g_idx_sort_indices = torch.empty_like(layer.w2_weight_g_idx) + w13_sorted_g_idx = torch.empty_like(layer.w13_weight_g_idx) + w2_sorted_g_idx = torch.empty_like(layer.w2_weight_g_idx) + + for e in range(num_experts): + w13_g_idx_sort_indices[e] = torch.argsort( + layer.w13_weight_g_idx[e]).to(torch.int32) + w2_g_idx_sort_indices[e] = torch.argsort( + layer.w2_weight_g_idx[e]).to(torch.int32) + w13_sorted_g_idx[e] = layer.w13_weight_g_idx[e][ + w13_g_idx_sort_indices[e]] + w2_sorted_g_idx[e] = layer.w2_weight_g_idx[e][ + w2_g_idx_sort_indices[e]] + + replace_parameter(layer, "w13_weight_g_idx", w13_sorted_g_idx) + replace_parameter(layer, "w2_weight_g_idx", w2_sorted_g_idx) + replace_parameter(layer, "w13_g_idx_sort_indices", + w13_g_idx_sort_indices) + replace_parameter(layer, "w2_g_idx_sort_indices", + w2_g_idx_sort_indices) + + else: + layer.w13_weight_g_idx = torch.nn.Parameter( + torch.empty((num_experts, 0), dtype=torch.int32, + device=device), + requires_grad=False, + ) + layer.w2_weight_g_idx = torch.nn.Parameter( + torch.empty((num_experts, 0), dtype=torch.int32, + device=device), + requires_grad=False, + ) + layer.w13_g_idx_sort_indices = torch.nn.Parameter( + torch.empty((num_experts, 0), dtype=torch.int32, + device=device), + requires_grad=False, + ) + layer.w2_g_idx_sort_indices = torch.nn.Parameter( + torch.empty((num_experts, 0), dtype=torch.int32, + device=device), + requires_grad=False, + ) marlin_w13_qweight = ops.gptq_marlin_moe_repack( layer.w13_weight_packed, @@ -511,9 +564,9 @@ def apply( router_logits, topk_weights, topk_ids, - g_idx1=layer.w13_g_idx, - g_idx2=layer.w2_g_idx, + g_idx1=layer.w13_weight_g_idx, + g_idx2=layer.w2_weight_g_idx, sort_indices1=layer.w13_g_idx_sort_indices, sort_indices2=layer.w2_g_idx_sort_indices, num_bits=self.num_bits, - ) + is_k_full=self.is_k_full) diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a16_24.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a16_24.py index 61d1c911cd1a..2e1b5e3c2d3b 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a16_24.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a16_24.py @@ -62,7 +62,7 @@ def create_weights(self, layer: torch.nn.Module, input_size: int, **kwargs): assert params_dtype == torch.float16, ( - "float16 is required for marlin24 compressd models. Set dtype=torch.float16" # noqa: E501 + "float16 is required for marlin24 compressed models. 
Set dtype=torch.float16" # noqa: E501 ) pack_factor = 32 // self.quant_type.size_bits diff --git a/vllm/model_executor/layers/quantization/experts_int8.py b/vllm/model_executor/layers/quantization/experts_int8.py index 209f12c6dfec..100cbfa4c959 100644 --- a/vllm/model_executor/layers/quantization/experts_int8.py +++ b/vllm/model_executor/layers/quantization/experts_int8.py @@ -52,7 +52,7 @@ def __init__(self, quant_config: ExpertsInt8Config): self.quant_config = quant_config def create_weights(self, layer: torch.nn.Module, num_experts: int, - hidden_size: int, intermediate_size: int, + hidden_size: int, intermediate_size_per_partition: int, params_dtype: torch.dtype, **extra_weight_attrs): int8_dtype = torch.int8 @@ -64,26 +64,29 @@ def create_weights(self, layer: torch.nn.Module, num_experts: int, extra_weight_attrs['weight_loader'] = wrapped_weight_loader # Fused gate_up_proj (column parallel) - w13_weight = torch.nn.Parameter(torch.empty(num_experts, - 2 * intermediate_size, - hidden_size, - dtype=int8_dtype), + w13_weight = torch.nn.Parameter(torch.empty( + num_experts, + 2 * intermediate_size_per_partition, + hidden_size, + dtype=int8_dtype), requires_grad=False) layer.register_parameter("w13_weight", w13_weight) set_weight_attrs(w13_weight, extra_weight_attrs) # down_proj (row parallel) - w2_weight = torch.nn.Parameter(torch.empty(num_experts, - hidden_size, - intermediate_size, - dtype=int8_dtype), + w2_weight = torch.nn.Parameter(torch.empty( + num_experts, + hidden_size, + intermediate_size_per_partition, + dtype=int8_dtype), requires_grad=False) layer.register_parameter("w2_weight", w2_weight) set_weight_attrs(w2_weight, extra_weight_attrs) - w13_scale = torch.nn.Parameter(torch.zeros(num_experts, - 2 * intermediate_size, - dtype=torch.float32), + w13_scale = torch.nn.Parameter(torch.zeros( + num_experts, + 2 * intermediate_size_per_partition, + dtype=torch.float32), requires_grad=False) layer.register_parameter("w13_scale", w13_scale) diff --git a/vllm/model_executor/layers/quantization/fp8.py b/vllm/model_executor/layers/quantization/fp8.py index 8608be330b80..5ba574d34010 100644 --- a/vllm/model_executor/layers/quantization/fp8.py +++ b/vllm/model_executor/layers/quantization/fp8.py @@ -165,8 +165,6 @@ def __init__(self, quant_config: Fp8Config): if current_platform.is_rocm(): self.use_marlin = False - self.default_scale = torch.finfo(torch.float32).min - self.block_quant = self.quant_config.weight_block_size is not None if self.block_quant: # Marlin doesn't support block-wise fp8 @@ -218,9 +216,8 @@ def create_weights( layer.output_size_per_partition = output_size_per_partition layer.orig_dtype = params_dtype - fp8_dtype = torch.float8_e4m3fn # WEIGHT - weight_dtype = (fp8_dtype + weight_dtype = (torch.float8_e4m3fn if self.quant_config.is_checkpoint_fp8_serialized else params_dtype) @@ -243,7 +240,7 @@ def create_weights( dtype=torch.float32), weight_loader=weight_loader, ) - scale[:] = self.default_scale + scale[:] = torch.finfo(torch.float32).min layer.register_parameter("weight_scale", scale) else: assert self.quant_config.activation_scheme == "dynamic" @@ -257,7 +254,7 @@ def create_weights( output_dim=0, weight_loader=weight_loader, ) - scale[:] = self.default_scale + scale[:] = torch.finfo(torch.float32).min # The weight_scale_inv name is intentional for deepseekv3 layer.register_parameter("weight_scale_inv", scale) @@ -267,7 +264,7 @@ def create_weights( len(output_partition_sizes), dtype=torch.float32), weight_loader=weight_loader) - scale[:] = self.default_scale + 
scale[:] = torch.finfo(torch.float32).min layer.register_parameter("input_scale", scale) else: layer.register_parameter("input_scale", None) @@ -308,13 +305,13 @@ def process_weights_after_loading(self, layer: Module) -> None: # If checkpoint is fp8, handle that there are N scales for N # shards in a fused module else: - layer.weight_scale.data[layer.weight_scale.data == - self.default_scale] = 1 + layer.weight_scale.data[layer.weight_scale.data == torch.finfo( + torch.float32).min] = 1 layer.weight_scale = torch.nn.Parameter(layer.weight_scale.data, requires_grad=False) if self.quant_config.activation_scheme == "static": - layer.input_scale.data[layer.input_scale.data == - self.default_scale] = 1 + layer.input_scale.data[layer.input_scale.data == torch.finfo( + torch.float32).min] = 1 layer.input_scale = torch.nn.Parameter(layer.input_scale.data, requires_grad=False) # If using marlin (w8a16), kernel uses channelwise weights, @@ -427,8 +424,8 @@ def __init__(self, quant_config: Fp8Config): self.block_quant = self.quant_config.weight_block_size is not None def create_weights(self, layer: Module, num_experts: int, hidden_size: int, - intermediate_size: int, params_dtype: torch.dtype, - **extra_weight_attrs): + intermediate_size_per_partition: int, + params_dtype: torch.dtype, **extra_weight_attrs): if self.quant_config.is_checkpoint_fp8_serialized: params_dtype = torch.float8_e4m3fn @@ -443,30 +440,34 @@ def create_weights(self, layer: Module, num_experts: int, hidden_size: int, # scales, the output_size of the weights for both the gate and up # layers must be divisible by block_n. # Required by column parallel or enabling merged weights - if intermediate_size % block_n != 0: + if intermediate_size_per_partition % block_n != 0: raise ValueError( f"The output_size of gate's and up's weight = " - f"{intermediate_size} is not divisible by " + f"{intermediate_size_per_partition} is not divisible by " f"weight quantization block_n = {block_n}.") - if (tp_size > 1 and intermediate_size % block_k != 0): + if (tp_size > 1 + and intermediate_size_per_partition % block_k != 0): # Required by row parallel - raise ValueError(f"The input_size of down's weight = " - f"{intermediate_size} is not divisible by " - f"weight quantization block_k = {block_k}.") + raise ValueError( + f"The input_size of down's weight = " + f"{intermediate_size_per_partition} is not divisible by " + f"weight quantization block_k = {block_k}.") # WEIGHTS - w13_weight = torch.nn.Parameter(torch.empty(num_experts, - 2 * intermediate_size, - hidden_size, - dtype=params_dtype), + w13_weight = torch.nn.Parameter(torch.empty( + num_experts, + 2 * intermediate_size_per_partition, + hidden_size, + dtype=params_dtype), requires_grad=False) layer.register_parameter("w13_weight", w13_weight) set_weight_attrs(w13_weight, extra_weight_attrs) - w2_weight = torch.nn.Parameter(torch.empty(num_experts, - hidden_size, - intermediate_size, - dtype=params_dtype), + w2_weight = torch.nn.Parameter(torch.empty( + num_experts, + hidden_size, + intermediate_size_per_partition, + dtype=params_dtype), requires_grad=False) layer.register_parameter("w2_weight", w2_weight) set_weight_attrs(w2_weight, extra_weight_attrs) @@ -487,7 +488,8 @@ def create_weights(self, layer: Module, num_experts: int, hidden_size: int, w13_weight_scale = torch.nn.Parameter( torch.ones( num_experts, - 2 * ((intermediate_size + block_n - 1) // block_n), + 2 * ((intermediate_size_per_partition + block_n - 1) // + block_n), (hidden_size + block_k - 1) // block_k, dtype=torch.float32, 
), @@ -497,7 +499,7 @@ def create_weights(self, layer: Module, num_experts: int, hidden_size: int, torch.ones( num_experts, (hidden_size + block_n - 1) // block_n, - (intermediate_size + block_k - 1) // block_k, + (intermediate_size_per_partition + block_k - 1) // block_k, dtype=torch.float32, ), requires_grad=False, diff --git a/vllm/model_executor/layers/quantization/gptq_marlin.py b/vllm/model_executor/layers/quantization/gptq_marlin.py index 2dbfca9b0769..4dc4b052b041 100644 --- a/vllm/model_executor/layers/quantization/gptq_marlin.py +++ b/vllm/model_executor/layers/quantization/gptq_marlin.py @@ -317,7 +317,7 @@ def create_weights( layer: torch.nn.Module, num_experts: int, hidden_size: int, - intermediate_size: int, + intermediate_size_per_partition: int, params_dtype: torch.dtype, **extra_weight_attrs, ): @@ -326,7 +326,8 @@ def create_weights( # Supports only sym for now (no zp) if self.quant_config.group_size != -1: scales_size13 = hidden_size // self.quant_config.group_size - scales_size2 = intermediate_size // self.quant_config.group_size + scales_size2 = (intermediate_size_per_partition // + self.quant_config.group_size) strategy = FusedMoeWeightScaleSupported.GROUP.value else: scales_size13 = 1 @@ -342,7 +343,7 @@ def create_weights( torch.empty( num_experts, hidden_size // self.quant_config.pack_factor, - 2 * intermediate_size, + 2 * intermediate_size_per_partition, dtype=torch.int32, ), requires_grad=False, @@ -353,7 +354,8 @@ def create_weights( w2_qweight = torch.nn.Parameter( torch.empty( num_experts, - intermediate_size // self.quant_config.pack_factor, + intermediate_size_per_partition // + self.quant_config.pack_factor, hidden_size, dtype=torch.int32, ), @@ -365,7 +367,7 @@ def create_weights( w13_scales = torch.nn.Parameter( torch.empty(num_experts, scales_size13, - 2 * intermediate_size, + 2 * intermediate_size_per_partition, dtype=torch.half), requires_grad=False, ) @@ -385,7 +387,8 @@ def create_weights( w13_qzeros = torch.nn.Parameter( torch.empty(num_experts, scales_size13, - 2 * intermediate_size // self.quant_config.pack_factor, + 2 * intermediate_size_per_partition // + self.quant_config.pack_factor, dtype=params_dtype), requires_grad=False, ) @@ -414,7 +417,7 @@ def create_weights( w2_g_idx = torch.nn.Parameter( torch.empty( num_experts, - intermediate_size, + intermediate_size_per_partition, dtype=torch.int32, ), requires_grad=False, @@ -435,7 +438,7 @@ def create_weights( w2_g_idx_sort_indices = torch.nn.Parameter( torch.empty( num_experts, - intermediate_size, + intermediate_size_per_partition, dtype=torch.int32, ), requires_grad=False, diff --git a/vllm/model_executor/layers/quantization/kernels/scaled_mm/__init__.py b/vllm/model_executor/layers/quantization/kernels/scaled_mm/__init__.py index 586752d3d34e..4824a1180416 100644 --- a/vllm/model_executor/layers/quantization/kernels/scaled_mm/__init__.py +++ b/vllm/model_executor/layers/quantization/kernels/scaled_mm/__init__.py @@ -5,8 +5,8 @@ CutlassScaledMMLinearKernel) from vllm.model_executor.layers.quantization.kernels.scaled_mm.ScaledMMLinearKernel import ( # noqa: E501 ScaledMMLinearKernel, ScaledMMLinearLayerConfig) -# from vllm.model_executor.layers.quantization.kernels.scaled_mm.triton import ( -# TritonScaledMMLinear) +from vllm.model_executor.layers.quantization.kernels.scaled_mm.triton import ( + TritonScaledMMLinearKernel) from vllm.model_executor.layers.quantization.kernels.scaled_mm.xla import ( XLAScaledMMLinearKernel) from vllm.platforms import PlatformEnum, current_platform @@ -15,9 +15,7 
@@ _POSSIBLE_KERNELS: Dict[PlatformEnum, List[Type[ScaledMMLinearKernel]]] = { PlatformEnum.CPU: [CutlassScaledMMLinearKernel], PlatformEnum.CUDA: [CutlassScaledMMLinearKernel], - # TODO(rob): Create TritonScaledMMLinear kernel. ROCM will - # incorrectly attempt to run AZP models if prompted to. - PlatformEnum.ROCM: [CutlassScaledMMLinearKernel], + PlatformEnum.ROCM: [TritonScaledMMLinearKernel], PlatformEnum.TPU: [XLAScaledMMLinearKernel], } diff --git a/vllm/model_executor/layers/quantization/kernels/scaled_mm/triton.py b/vllm/model_executor/layers/quantization/kernels/scaled_mm/triton.py new file mode 100644 index 000000000000..97ec8cb0500d --- /dev/null +++ b/vllm/model_executor/layers/quantization/kernels/scaled_mm/triton.py @@ -0,0 +1,38 @@ +from typing import Optional, Tuple + +import torch + +from vllm.platforms import current_platform + +from .cutlass import CutlassScaledMMLinearKernel +from .ScaledMMLinearKernel import ScaledMMLinearLayerConfig + + +class TritonScaledMMLinearKernel(CutlassScaledMMLinearKernel): + + @classmethod + def get_min_capability(cls) -> int: + return 75 + + @classmethod + def can_implement( + cls, c: ScaledMMLinearLayerConfig) -> Tuple[bool, Optional[str]]: + if current_platform.is_cpu(): + return ( + False, + "TritonScaledMMLinearKernel requires Triton which is not " + + "currently supported on CPU.") + if not c.input_symmetric: + return (False, + "TritonScaledMMLinearKernel only supports symmetric " + + "quantization.") + return True, None + + def process_weights_after_loading(self, layer: torch.nn.Module) -> None: + super().process_weights_after_loading(layer) + + def apply_weights(self, + layer: torch.nn.Module, + x: torch.Tensor, + bias: Optional[torch.Tensor] = None) -> torch.Tensor: + return super().apply_weights(layer, x, bias) diff --git a/vllm/model_executor/layers/quantization/kv_cache.py b/vllm/model_executor/layers/quantization/kv_cache.py index 6a3b02da2350..ef54c25a0557 100644 --- a/vllm/model_executor/layers/quantization/kv_cache.py +++ b/vllm/model_executor/layers/quantization/kv_cache.py @@ -79,6 +79,8 @@ def process_weights_after_loading(self, layer: torch.nn.Module) -> None: # These are used in the final Attention.forward() layer._k_scale.copy_(k_scale) layer._v_scale.copy_(v_scale) + layer._k_scale_float = k_scale + layer._v_scale_float = v_scale if (k_scale == 1.0 and v_scale == 1.0 and (layer.kv_cache_dtype != "auto" or envs.VLLM_USE_ROCM_FP8_FLASH_ATTN) diff --git a/vllm/model_executor/layers/quantization/quark/quark_moe.py b/vllm/model_executor/layers/quantization/quark/quark_moe.py index 3e1924730080..68a395454076 100644 --- a/vllm/model_executor/layers/quantization/quark/quark_moe.py +++ b/vllm/model_executor/layers/quantization/quark/quark_moe.py @@ -60,24 +60,26 @@ def __init__(self, weight_config: Dict[str, Any], input_config: Dict[str, self.static_input_scales = not self.input_quant.get("is_dynamic") def create_weights(self, layer: torch.nn.Module, num_experts: int, - hidden_size: int, intermediate_size: int, + hidden_size: int, intermediate_size_per_partition: int, params_dtype: torch.dtype, **extra_weight_attrs): params_dtype = torch.float8_e4m3fn # WEIGHTS - w13_weight = torch.nn.Parameter(torch.empty(num_experts, - 2 * intermediate_size, - hidden_size, - dtype=params_dtype), + w13_weight = torch.nn.Parameter(torch.empty( + num_experts, + 2 * intermediate_size_per_partition, + hidden_size, + dtype=params_dtype), requires_grad=False) layer.register_parameter("w13_weight", w13_weight) set_weight_attrs(w13_weight, 
extra_weight_attrs) - w2_weight = torch.nn.Parameter(torch.empty(num_experts, - hidden_size, - intermediate_size, - dtype=params_dtype), + w2_weight = torch.nn.Parameter(torch.empty( + num_experts, + hidden_size, + intermediate_size_per_partition, + dtype=params_dtype), requires_grad=False) layer.register_parameter("w2_weight", w2_weight) set_weight_attrs(w2_weight, extra_weight_attrs) diff --git a/vllm/model_executor/layers/vocab_parallel_embedding.py b/vllm/model_executor/layers/vocab_parallel_embedding.py index 4afa367b387d..996d9b2505d3 100644 --- a/vllm/model_executor/layers/vocab_parallel_embedding.py +++ b/vllm/model_executor/layers/vocab_parallel_embedding.py @@ -356,7 +356,7 @@ def weight_loader(self, param: Parameter, loaded_weight: torch.Tensor): elif isinstance(param, UninitializedParameter): shape = list(loaded_weight.shape) if output_dim is not None: - shape[output_dim] = shape[output_dim] // self.tp_size + shape[output_dim] = self.num_embeddings_per_partition param.materialize(tuple(shape), dtype=loaded_weight.dtype) # If parameter does not have output dim, then it should @@ -382,7 +382,7 @@ def weight_loader(self, param: Parameter, loaded_weight: torch.Tensor): else: assert loaded_weight.shape[output_dim] == self.org_vocab_size - # Copy the data. + # Copy the data. Select chunk corresponding to current shard. loaded_weight = loaded_weight.narrow(output_dim, start_idx, shard_size) if current_platform.is_hpu(): diff --git a/vllm/model_executor/model_loader/loader.py b/vllm/model_executor/model_loader/loader.py index f697c3245f09..527b4307f367 100644 --- a/vllm/model_executor/model_loader/loader.py +++ b/vllm/model_executor/model_loader/loader.py @@ -114,7 +114,7 @@ def _initialize_model( all_params = [param.name for param in signatures.parameters.values()] if "vllm_config" in all_params and "prefix" in all_params: # new-style model class - with set_current_vllm_config(vllm_config): + with set_current_vllm_config(vllm_config, check_compile=True): return model_class(vllm_config=vllm_config, prefix=prefix) msg = ("vLLM model class should accept `vllm_config` and `prefix` as " @@ -142,7 +142,7 @@ def _initialize_model( kwargs["lora_config"] = vllm_config.lora_config if "scheduler_config" in all_params: kwargs["scheduler_config"] = vllm_config.scheduler_config - with set_current_vllm_config(vllm_config): + with set_current_vllm_config(vllm_config, check_compile=True): return model_class(**kwargs) @@ -1076,8 +1076,8 @@ def _load_weights(self, model_config: ModelConfig, # weight tensor. So TP does not work with pre_quantized bnb models. if pre_quant and get_tensor_model_parallel_world_size() > 1: raise ValueError( - "Prequant BitsAndBytes models with TP is not supported." - "Please try with PP.") + "Prequant BitsAndBytes models with tensor parallelism is not " + "supported. Please try with pipeline parallelism.") load_8bit = False if pre_quant: diff --git a/vllm/model_executor/model_loader/tensorizer.py b/vllm/model_executor/model_loader/tensorizer.py index 5b4757072353..e359aef9dcb7 100644 --- a/vllm/model_executor/model_loader/tensorizer.py +++ b/vllm/model_executor/model_loader/tensorizer.py @@ -288,7 +288,8 @@ def _init_model(self): model_args.torch_dtype = self.tensorizer_config.dtype assert self.tensorizer_config.model_class is not None # TODO: Do we need to consider old-style model class? 
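In the VocabParallelEmbedding hunk above, the materialized shard shape now comes from `num_embeddings_per_partition` rather than dividing the checkpoint's vocab dimension by `tp_size`. The two differ whenever the vocabulary is padded before being split across tensor-parallel ranks. A small arithmetic sketch, assuming a padding multiple of 64 (the exact padding policy lives in vocab_parallel_embedding.py; the numbers here are only illustrative):

def pad_vocab(n: int, multiple: int = 64) -> int:
    # Round the vocab size up to a multiple of `multiple` before sharding.
    return ((n + multiple - 1) // multiple) * multiple

org_vocab_size = 50257                                 # e.g. GPT-2's unpadded vocab
tp_size = 4
per_partition = pad_vocab(org_vocab_size) // tp_size   # 12576
naive = org_vocab_size // tp_size                      # 12564
print(per_partition, naive)  # the naive split materializes the wrong shard shape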
- with no_init_or_tensor(), set_current_vllm_config(self.vllm_config): + with no_init_or_tensor(), set_current_vllm_config(self.vllm_config, + check_compile=True): return self.tensorizer_config.model_class( vllm_config=self.vllm_config, ) diff --git a/vllm/model_executor/model_loader/weight_utils.py b/vllm/model_executor/model_loader/weight_utils.py index b70407221312..b764a940b174 100644 --- a/vllm/model_executor/model_loader/weight_utils.py +++ b/vllm/model_executor/model_loader/weight_utils.py @@ -93,7 +93,7 @@ def convert_bin_to_safetensor_file( pt_filename: str, sf_filename: str, ) -> None: - loaded = torch.load(pt_filename, map_location="cpu") + loaded = torch.load(pt_filename, map_location="cpu", weights_only=True) if "state_dict" in loaded: loaded = loaded["state_dict"] shared = _shared_pointers(loaded) @@ -381,7 +381,9 @@ def np_cache_weights_iterator( disable=not enable_tqdm, bar_format=_BAR_FORMAT, ): - state = torch.load(bin_file, map_location="cpu") + state = torch.load(bin_file, + map_location="cpu", + weights_only=True) for name, param in state.items(): param_path = os.path.join(np_folder, name) with open(param_path, "wb") as f: @@ -447,7 +449,7 @@ def pt_weights_iterator( disable=not enable_tqdm, bar_format=_BAR_FORMAT, ): - state = torch.load(bin_file, map_location="cpu") + state = torch.load(bin_file, map_location="cpu", weights_only=True) yield from state.items() del state torch.cuda.empty_cache() diff --git a/vllm/model_executor/models/aria.py b/vllm/model_executor/models/aria.py index 503d1a38d9ee..8c6873de1362 100644 --- a/vllm/model_executor/models/aria.py +++ b/vllm/model_executor/models/aria.py @@ -30,6 +30,7 @@ from vllm.sequence import IntermediateTensors # yapf: disable +from .idefics2_vision_model import Idefics2VisionConfig from .idefics2_vision_model import ( Idefics2VisionTransformer as Idefics3VisionTransformer) # yapf: enable @@ -50,6 +51,53 @@ class AriaImagePixelInputs(TypedDict): """ +class AriaVisionTransformer(Idefics3VisionTransformer): + + def __init__( + self, + config: Idefics2VisionConfig, + quant_config: Optional[QuantizationConfig] = None, + prefix: str = "", + ) -> None: + super().__init__(config, quant_config, prefix) + # Unlike Idefics3VisionTransformer which uses LayerNorm after the + # final layer, Aria omits this normalization, so we replace it with an + # Identity layer + self.post_layernorm = nn.Identity() + + def load_weights(self, weights: Iterable[Tuple[str, + torch.Tensor]]) -> Set[str]: + stacked_params_mapping = [ + # (param_name, shard_name, shard_id) + ("qkv_proj", "q_proj", "q"), + ("qkv_proj", "k_proj", "k"), + ("qkv_proj", "v_proj", "v"), + ] + params_dict = dict(self.named_parameters()) + loaded_params: Set[str] = set() + for name, loaded_weight in weights: + + # NOTE: post_layernorm is not used in Aria + if "post_layernorm" in name: + continue + + for param_name, weight_name, shard_id in stacked_params_mapping: + if weight_name not in name: + continue + name = name.replace(weight_name, param_name) + param = params_dict[name] + weight_loader = param.weight_loader + weight_loader(param, loaded_weight, shard_id) + break + else: + param = params_dict[name] + weight_loader = getattr(param, "weight_loader", + default_weight_loader) + weight_loader(param, loaded_weight) + loaded_params.add(name) + return loaded_params + + class AriaProjectorMLP(nn.Module): def __init__( @@ -228,8 +276,10 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: router_output = torch.nn.functional.linear(hidden_states, self.router_weight) 
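The weight_utils.py hunks above add `weights_only=True` to every `torch.load` call, which restricts unpickling to tensors and plain containers so a crafted checkpoint cannot run arbitrary code at load time. A self-contained sketch of the same pattern (the tiny state dict and temp path are only for illustration):

import os
import tempfile

import torch

# Save a tiny state dict, then reload it the way pt_weights_iterator does.
state = {"linear.weight": torch.randn(4, 4), "linear.bias": torch.zeros(4)}
path = os.path.join(tempfile.mkdtemp(), "model.bin")
torch.save(state, path)

loaded = torch.load(path, map_location="cpu", weights_only=True)
for name, tensor in loaded.items():
    print(name, tuple(tensor.shape))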
+ hidden_states_copy = hidden_states.clone() + # NOTE: hidden_states will be modified inplace by `FusedMoE` sparse_expert_output = self.experts(hidden_states, router_output) - shared_expert_output = self.shared_experts(hidden_states) + shared_expert_output = self.shared_experts(hidden_states_copy) return sparse_expert_output + shared_expert_output @@ -342,13 +392,7 @@ def get_vision_config(self): return self.get_hf_config().vision_config def get_hf_processor(self): - processor = self.ctx.get_hf_processor(AriaProcessor) - - # Patch for https://github.com/huggingface/transformers/issues/35768 - processor.tokenizer.image_token = "<|img|>" - processor.image_token = "<|img|>" - - return processor + return self.ctx.get_hf_processor(AriaProcessor) def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]: return {"image": None} @@ -381,7 +425,7 @@ def get_dummy_processor_inputs( } hf_processor = self.info.get_hf_processor() - image_token: str = hf_processor.image_token # type: ignore + image_token: str = hf_processor.tokenizer.image_token # type: ignore return ProcessorInputs( prompt_text=image_token * num_images, @@ -451,7 +495,7 @@ def __init__( quant_config = vllm_config.quant_config self.config = config - self.vision_tower = Idefics3VisionTransformer( + self.vision_tower = AriaVisionTransformer( config.vision_config, quant_config, prefix=f"{prefix}.vision_tower", diff --git a/vllm/model_executor/models/blip2.py b/vllm/model_executor/models/blip2.py index 917b88e80207..b559ac677a74 100644 --- a/vllm/model_executor/models/blip2.py +++ b/vllm/model_executor/models/blip2.py @@ -14,12 +14,12 @@ from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY -from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig, - MultiModalInputsV2, MultiModalKwargs, - NestedTensors, PlaceholderRange) +from vllm.multimodal.inputs import (MultiModalFieldConfig, MultiModalKwargs, + NestedTensors) from vllm.multimodal.parse import MultiModalDataItems from vllm.multimodal.processing import (BaseMultiModalProcessor, - BaseProcessingInfo, PromptReplacement) + BaseProcessingInfo, PromptReplacement, + PromptReplacementDetails) from vllm.multimodal.profiling import BaseDummyInputsBuilder, ProcessorInputs from vllm.sequence import IntermediateTensors @@ -475,36 +475,27 @@ def _get_prompt_replacements( hf_processor_mm_kwargs: Mapping[str, object], out_mm_kwargs: MultiModalKwargs, ) -> list[PromptReplacement]: + tokenizer = self.info.get_tokenizer() + vocab = tokenizer.get_vocab() + + bos_token_id = tokenizer.bos_token_id + assert isinstance(bos_token_id, int) + + image_token_id = vocab[""] num_image_tokens = self.info.get_num_image_tokens() + image_tokens = [image_token_id] * num_image_tokens return [ PromptReplacement( modality="image", - target="", - replacement="" * num_image_tokens + "", + target=[bos_token_id], + replacement=PromptReplacementDetails( + full=image_tokens + [bos_token_id], + features=image_tokens, + ), ) ] - def apply( - self, - prompt: Union[str, list[int]], - mm_data: MultiModalDataDict, - hf_processor_mm_kwargs: Mapping[str, object], - ) -> MultiModalInputsV2: - result = super().apply(prompt, mm_data, hf_processor_mm_kwargs) - - # Only tokens should be considered as placeholders, - # so we ignore the trailing bos_token - result["mm_placeholders"] = { - modality: [ - PlaceholderRange(offset=p["offset"], length=p["length"] - 1) - for p in ps 
- ] - for modality, ps in result["mm_placeholders"].items() - } - - return result - @MULTIMODAL_REGISTRY.register_processor(Blip2MultiModalProcessor, info=Blip2ProcessingInfo, diff --git a/vllm/model_executor/models/chameleon.py b/vllm/model_executor/models/chameleon.py index a6634204699c..e834c9004f14 100644 --- a/vllm/model_executor/models/chameleon.py +++ b/vllm/model_executor/models/chameleon.py @@ -28,12 +28,12 @@ from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.model_executor.utils import set_weight_attrs from vllm.multimodal import MULTIMODAL_REGISTRY -from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig, - MultiModalInputsV2, MultiModalKwargs, - NestedTensors, PlaceholderRange) +from vllm.multimodal.inputs import (MultiModalFieldConfig, MultiModalKwargs, + NestedTensors) from vllm.multimodal.parse import MultiModalDataItems from vllm.multimodal.processing import (BaseMultiModalProcessor, - BaseProcessingInfo, PromptReplacement) + BaseProcessingInfo, PromptReplacement, + PromptReplacementDetails) from vllm.multimodal.profiling import BaseDummyInputsBuilder, ProcessorInputs from vllm.sequence import IntermediateTensors @@ -122,8 +122,9 @@ def _apply_hf_processor_tokens_only( ) -> list[int]: # HF processor adds sep token for chat mode tokenizer = self.info.get_tokenizer() - sep_token_id: int = \ - tokenizer.vocab[tokenizer.sep_token] # type: ignore + vocab = tokenizer.get_vocab() + + sep_token_id = vocab[tokenizer.sep_token] # type: ignore return prompt_tokens + [sep_token_id] @@ -141,39 +142,27 @@ def _get_prompt_replacements( out_mm_kwargs: MultiModalKwargs, ) -> list[PromptReplacement]: processor = self.info.get_hf_processor(**hf_processor_mm_kwargs) + tokenizer = self.info.get_tokenizer() + vocab = tokenizer.get_vocab() + + image_start_id = vocab[processor.image_start_token] + image_token_id = vocab[processor.image_token] + image_end_id = vocab[processor.image_end_token] + + num_image_tokens = self.info.get_num_image_tokens() + image_tokens = [image_token_id] * num_image_tokens return [ PromptReplacement( modality="image", - target="", - replacement="".join([ - processor.image_start_token, - processor.image_token * self.info.get_num_image_tokens(), - processor.image_end_token, - ]), + target=[image_token_id], + replacement=PromptReplacementDetails( + full=([image_start_id] + image_tokens + [image_end_id]), + features=image_tokens, + ), ) ] - def apply( - self, - prompt: Union[str, list[int]], - mm_data: MultiModalDataDict, - hf_processor_mm_kwargs: Mapping[str, object], - ) -> MultiModalInputsV2: - result = super().apply(prompt, mm_data, hf_processor_mm_kwargs) - - # Only tokens should be considered as placeholders, - # so we ignore the image_start_token and image_end_token - result["mm_placeholders"] = { - modality: [ - PlaceholderRange(offset=p["offset"] + 1, - length=p["length"] - 2) for p in ps - ] - for modality, ps in result["mm_placeholders"].items() - } - - return result - class ChameleonLayerNorm(nn.LayerNorm): diff --git a/vllm/model_executor/models/deepseek_vl2.py b/vllm/model_executor/models/deepseek_vl2.py index 4d3d1c329a2c..344832d8b33e 100644 --- a/vllm/model_executor/models/deepseek_vl2.py +++ b/vllm/model_executor/models/deepseek_vl2.py @@ -249,8 +249,10 @@ def _get_prompt_replacements( hf_processor_mm_kwargs: Mapping[str, object], out_mm_kwargs: MultiModalKwargs, ) -> list[PromptReplacement]: - hf_processor = self.info.get_hf_processor() - image_token_id: int = hf_processor.image_token_id + hf_processor = 
self.info.get_hf_processor(**hf_processor_mm_kwargs) + + image_token_id = hf_processor.image_token_id + assert isinstance(image_token_id, int) def get_replacement_deepseek_vl2(item_idx: int): images = mm_items.get_items( diff --git a/vllm/model_executor/models/fuyu.py b/vllm/model_executor/models/fuyu.py index 63e7147f84e0..dbf9da50cc9d 100644 --- a/vllm/model_executor/models/fuyu.py +++ b/vllm/model_executor/models/fuyu.py @@ -16,7 +16,7 @@ """ PyTorch Fuyu model.""" import math from typing import (Iterable, List, Literal, Mapping, Optional, Set, Tuple, - TypedDict, Union) + TypedDict) import torch import torch.nn as nn @@ -30,13 +30,13 @@ from vllm.model_executor.models.persimmon import PersimmonForCausalLM from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY -from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig, - MultiModalInputsV2, MultiModalKwargs, - NestedTensors, PlaceholderRange) +from vllm.multimodal.inputs import (MultiModalFieldConfig, MultiModalKwargs, + NestedTensors) from vllm.multimodal.parse import (ImageProcessorItems, ImageSize, MultiModalDataItems) from vllm.multimodal.processing import (BaseMultiModalProcessor, - BaseProcessingInfo, PromptReplacement) + BaseProcessingInfo, PromptReplacement, + PromptReplacementDetails) from vllm.multimodal.profiling import BaseDummyInputsBuilder, ProcessorInputs from vllm.sequence import IntermediateTensors @@ -183,7 +183,9 @@ def _apply_hf_processor_tokens_only( ) -> list[int]: # HF processor adds boa_token_id tokenizer = self.info.get_tokenizer() - boa_token_id: int = tokenizer.vocab["<0x04>"] # type: ignore + vocab = tokenizer.get_vocab() + + boa_token_id = vocab["<0x04>"] return prompt_tokens + [boa_token_id] @@ -202,6 +204,7 @@ def _get_prompt_replacements( ) -> list[PromptReplacement]: hf_config = self.info.get_hf_config() bos_token_id = hf_config.bos_token_id + assert isinstance(bos_token_id, int) tokenizer = self.info.get_tokenizer() eot_token_id = tokenizer.bos_token_id @@ -215,9 +218,13 @@ def get_replacement_fuyu(item_idx: int): image_width=image_size.width, image_height=image_size.height, ) + image_tokens = ([_IMAGE_TOKEN_ID] * ncols + + [_NEWLINE_TOKEN_ID]) * nrows - return (([_IMAGE_TOKEN_ID] * ncols + [_NEWLINE_TOKEN_ID]) * nrows + - [bos_token_id]) + return PromptReplacementDetails( + full=image_tokens + [bos_token_id], + features=image_tokens, + ) return [ PromptReplacement( @@ -227,26 +234,6 @@ def get_replacement_fuyu(item_idx: int): ) ] - def apply( - self, - prompt: Union[str, list[int]], - mm_data: MultiModalDataDict, - hf_processor_mm_kwargs: Mapping[str, object], - ) -> MultiModalInputsV2: - result = super().apply(prompt, mm_data, hf_processor_mm_kwargs) - - # Only |SPEAKER| (image) tokens should be considered as placeholders, - # so we ignore the trailing bos_token_id - result["mm_placeholders"] = { - modality: [ - PlaceholderRange(offset=p["offset"], length=p["length"] - 1) - for p in ps - ] - for modality, ps in result["mm_placeholders"].items() - } - - return result - @MULTIMODAL_REGISTRY.register_processor(FuyuMultiModalProcessor, info=FuyuProcessingInfo, diff --git a/vllm/model_executor/models/gpt2.py b/vllm/model_executor/models/gpt2.py index 1656a3cc9e46..2f1aa2d68653 100644 --- a/vllm/model_executor/models/gpt2.py +++ b/vllm/model_executor/models/gpt2.py @@ -258,13 +258,13 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.transformer = GPT2Model(vllm_config=vllm_config, prefix=maybe_prefix( 
prefix, "transformer")) + self.lm_head = ParallelLMHead(self.config.vocab_size, + self.config.hidden_size, + quant_config=quant_config, + prefix=f"{prefix}.lm_head") if self.config.tie_word_embeddings: - self.lm_head = self.transformer.wte - else: - self.lm_head = ParallelLMHead(self.config.vocab_size, - self.config.hidden_size, - quant_config=quant_config, - prefix=f"{prefix}.lm_head") + self.lm_head = self.lm_head.tie_weights(self.transformer.wte) + self.logits_processor = LogitsProcessor(config.vocab_size) self.sampler = get_sampler() self.make_empty_intermediate_tensors = ( @@ -309,15 +309,12 @@ def load_weights(self, weights: Iterable[Tuple[str, params_dict = dict(self.named_parameters(remove_duplicate=False)) loaded_params: Set[str] = set() for name, loaded_weight in weights: - if name.startswith("lm_head"): - # GPT-2 ties the weights of the embedding layer and the final - # linear layer. - continue if ".attn.bias" in name or ".attn.masked_bias" in name: # Skip attention mask. # NOTE: "c_attn.bias" should not be skipped. continue - if not name.startswith("transformer."): + if not name.startswith("transformer.") and not name.startswith( + "lm_head"): name = "transformer." + name if is_pp_missing_parameter(name, self): diff --git a/vllm/model_executor/models/granitemoe.py b/vllm/model_executor/models/granitemoe.py index 51296ef0cc08..b518a0a6cbde 100644 --- a/vllm/model_executor/models/granitemoe.py +++ b/vllm/model_executor/models/granitemoe.py @@ -348,6 +348,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.config = config self.lora_config = lora_config + self.quant_config = quant_config # Required by MixtralForCausalLM self.model = GraniteMoeModel(vllm_config=vllm_config, prefix=maybe_prefix(prefix, "model")) diff --git a/vllm/model_executor/models/interfaces_base.py b/vllm/model_executor/models/interfaces_base.py index 4c353ae6ffc1..37b91a803d71 100644 --- a/vllm/model_executor/models/interfaces_base.py +++ b/vllm/model_executor/models/interfaces_base.py @@ -3,7 +3,6 @@ import torch import torch.nn as nn -from transformers import PretrainedConfig from typing_extensions import TypeIs, TypeVar from vllm.logger import init_logger @@ -19,9 +18,6 @@ logger = init_logger(__name__) -# The type of HF config -C_co = TypeVar("C_co", bound=PretrainedConfig, covariant=True) - # The type of hidden states # Currently, T = torch.Tensor for all models except for Medusa # which has T = List[torch.Tensor] @@ -34,7 +30,7 @@ @runtime_checkable -class VllmModel(Protocol[C_co, T_co]): +class VllmModel(Protocol[T_co]): """The interface required for all models in vLLM.""" def __init__( @@ -97,7 +93,7 @@ def is_vllm_model( @runtime_checkable -class VllmModelForTextGeneration(VllmModel[C_co, T], Protocol[C_co, T]): +class VllmModelForTextGeneration(VllmModel[T], Protocol[T]): """The interface required for all generative models in vLLM.""" def compute_logits( @@ -143,7 +139,7 @@ def is_text_generation_model( @runtime_checkable -class VllmModelForPooling(VllmModel[C_co, T], Protocol[C_co, T]): +class VllmModelForPooling(VllmModel[T], Protocol[T]): """The interface required for all pooling models in vLLM.""" def pooler( diff --git a/vllm/model_executor/models/llava.py b/vllm/model_executor/models/llava.py index 6cceded43a79..296af2aac566 100644 --- a/vllm/model_executor/models/llava.py +++ b/vllm/model_executor/models/llava.py @@ -24,7 +24,7 @@ from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.inputs 
import (MultiModalDataDict, MultiModalFieldConfig, - MultiModalInputsV2, MultiModalKwargs, + MultiModalInputs, MultiModalKwargs, NestedTensors) from vllm.multimodal.parse import (ImageEmbeddingItems, ImageProcessorItems, ImageSize, MultiModalDataItems) @@ -315,13 +315,14 @@ def _get_prompt_replacements( hf_processor_mm_kwargs: Mapping[str, object], out_mm_kwargs: MultiModalKwargs, ) -> list[PromptReplacement]: + processor = self.info.get_hf_processor(**hf_processor_mm_kwargs) hf_config = self.info.get_hf_config() - image_token_id = hf_config.image_token_index + tokenizer = self.info.get_tokenizer() + vocab = tokenizer.get_vocab() - processor = self.info.get_hf_processor() - image_token = processor.image_token - image_break_token = processor.image_break_token - image_end_token = processor.image_end_token + image_break_id = vocab[processor.image_break_token] + image_token_id = hf_config.image_token_index + image_end_id = vocab[processor.image_end_token] vision_config = hf_config.vision_config assert isinstance(vision_config, PixtralVisionConfig) @@ -336,10 +337,10 @@ def get_replacement(item_idx: int): image_height=image_size.height, ) - tokens = ([image_token] * ncols + [image_break_token]) * nrows - tokens[-1] = image_end_token + tokens = ([image_token_id] * ncols + [image_break_id]) * nrows + tokens[-1] = image_end_id - return "".join(tokens) + return tokens return [ PromptReplacement( @@ -746,7 +747,7 @@ def apply( prompt: Union[str, list[int]], mm_data: MultiModalDataDict, hf_processor_mm_kwargs: Mapping[str, object], - ) -> MultiModalInputsV2: + ) -> MultiModalInputs: hf_config = self.info.get_hf_config() image_token_id = hf_config.image_token_index @@ -805,7 +806,7 @@ def get_replacement_mantis(item_idx: int): for modality, placeholders in mm_placeholders.items() } - return MultiModalInputsV2( + return MultiModalInputs( type="multimodal", prompt=prompt, prompt_token_ids=prompt_ids, diff --git a/vllm/model_executor/models/llava_onevision.py b/vllm/model_executor/models/llava_onevision.py index 6faa79f65d8d..5b0f35b08646 100644 --- a/vllm/model_executor/models/llava_onevision.py +++ b/vllm/model_executor/models/llava_onevision.py @@ -816,7 +816,7 @@ def apply_pooling(self, image_features, stride=2): return image_feature def get_multimodal_embeddings( - self, **kwargs) -> Optional[List[Tuple[NestedTensors, str]]]: + self, **kwargs) -> Optional[tuple[torch.Tensor, ...]]: modalities = self._parse_and_validate_multimodal_inputs(**kwargs) if not modalities: return None @@ -842,8 +842,7 @@ def get_multimodal_embeddings( def get_input_embeddings( self, input_ids: torch.Tensor, - multimodal_embeddings: Optional[List[Tuple[NestedTensors, - str]]] = None, + multimodal_embeddings: Optional[tuple[torch.Tensor, ...]] = None, ) -> torch.Tensor: inputs_embeds = self.language_model.get_input_embeddings(input_ids) if multimodal_embeddings is not None: @@ -852,6 +851,34 @@ def get_input_embeddings( [self.config.image_token_index, self.config.video_token_index]) return inputs_embeds + def get_input_embeddings_v0( + self, + input_ids: torch.Tensor, + image_input: Optional[NestedTensors] = None, + video_input: Optional[NestedTensors] = None, + ) -> torch.Tensor: + + inputs_embeds = self.get_input_embeddings(input_ids) + if image_input is not None: + image_embeds = self._process_image_input(image_input) + inputs_embeds = merge_multimodal_embeddings( + input_ids, + inputs_embeds, + image_embeds, + placeholder_token_id=self.config.image_token_index, + ) + + if video_input is not None: + video_embeds = 
self._process_video_pixels(video_input) + inputs_embeds = merge_multimodal_embeddings( + input_ids, + inputs_embeds, + video_embeds, + placeholder_token_id=self.config.video_token_index, + ) + + return inputs_embeds + def forward( self, input_ids: torch.Tensor, @@ -871,13 +898,21 @@ def forward( if intermediate_tensors is not None: inputs_embeds = None - # NOTE: In v1, inputs_embeds is always generated at model runner, this - # condition is for v0 compatibility. + # NOTE: In v1, inputs_embeds is always generated at model runner from + # `get_multimodal_embeddings` and `get_input_embeddings`, this + # condition is only for v0 compatibility. elif inputs_embeds is None: - multimodal_embeddings = self.get_multimodal_embeddings(**kwargs) - inputs_embeds = self.get_input_embeddings(input_ids, - multimodal_embeddings) - input_ids = None + image_input = self._parse_and_validate_image_input(**kwargs) + video_input = self._parse_and_validate_video_input(**kwargs) + + if image_input is None and video_input is None: + inputs_embeds = None + else: + inputs_embeds = self.get_input_embeddings_v0( + input_ids, + image_input=image_input, + video_input=video_input) + input_ids = None hidden_states = self.language_model.model(input_ids, positions, diff --git a/vllm/model_executor/models/paligemma.py b/vllm/model_executor/models/paligemma.py index f9ad0c67adab..5a28b1ffbb7b 100644 --- a/vllm/model_executor/models/paligemma.py +++ b/vllm/model_executor/models/paligemma.py @@ -136,6 +136,17 @@ def forward(self, image_features: torch.Tensor) -> torch.Tensor: @INPUT_REGISTRY.register_input_processor(input_processor_for_paligemma) class PaliGemmaForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP): + packed_modules_mapping = { + "qkv_proj": [ + "q_proj", + "k_proj", + "v_proj", + ], + "gate_up_proj": [ + "gate_proj", + "up_proj", + ], + } def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() diff --git a/vllm/model_executor/models/phi3v.py b/vllm/model_executor/models/phi3v.py index 7a230e5beb36..0fcda81da280 100644 --- a/vllm/model_executor/models/phi3v.py +++ b/vllm/model_executor/models/phi3v.py @@ -30,15 +30,19 @@ VocabParallelEmbedding) from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY -from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig, - MultiModalInputsV2, MultiModalKwargs, - NestedTensors, PlaceholderRange) +from vllm.multimodal.inputs import (MultiModalFieldConfig, MultiModalKwargs, + NestedTensors) from vllm.multimodal.parse import (ImageEmbeddingItems, ImageProcessorItems, ImageSize, MultiModalDataItems) +# yapf conflicts with isort for this block +# yapf: disable from vllm.multimodal.processing import (BaseMultiModalProcessor, BaseProcessingInfo, BoundPromptReplacement, - PlaceholderInfo, PromptReplacement) + PlaceholderFeaturesInfo, + PromptReplacement, + PromptReplacementDetails) +# yapf: enable from vllm.multimodal.profiling import BaseDummyInputsBuilder, ProcessorInputs from vllm.sequence import IntermediateTensors from vllm.utils import is_list_of @@ -437,7 +441,12 @@ def get_replacement_phi3v(item_idx: int): processor=hf_processor, ) - return [_IMAGE_TOKEN_ID] * num_image_tokens + [bos_token_id] + image_tokens = [_IMAGE_TOKEN_ID] * num_image_tokens + + return PromptReplacementDetails( + full=image_tokens + [bos_token_id], + features=image_tokens, + ) num_images = mm_items.get_count("image", strict=False) @@ -454,7 +463,7 @@ def _apply_prompt_replacements( 
token_ids: list[int], mm_prompt_repls: Mapping[str, Sequence[BoundPromptReplacement]], mm_item_counts: Mapping[str, int], - ) -> tuple[list[int], str, Mapping[str, list[PlaceholderInfo]]]: + ) -> tuple[list[int], str, Mapping[str, list[PlaceholderFeaturesInfo]]]: token_ids, text, placeholders = super()._apply_prompt_replacements( token_ids=token_ids, mm_prompt_repls=mm_prompt_repls, @@ -467,11 +476,11 @@ def _apply_prompt_replacements( token_ids = [token_ids[0], *token_ids[2:]] placeholders = { modality: [ - PlaceholderInfo( + PlaceholderFeaturesInfo( modality=p.modality, item_idx=p.item_idx, start_idx=p.start_idx - 1, - replacement=p.replacement, + tokens=p.tokens, ) for p in ps ] for modality, ps in placeholders.items() @@ -479,26 +488,6 @@ def _apply_prompt_replacements( return token_ids, text, placeholders - def apply( - self, - prompt: Union[str, list[int]], - mm_data: MultiModalDataDict, - hf_processor_mm_kwargs: Mapping[str, object], - ) -> MultiModalInputsV2: - result = super().apply(prompt, mm_data, hf_processor_mm_kwargs) - - # Only <|image|> tokens should be considered as placeholders, - # so we ignore the trailing bos_token_id - result["mm_placeholders"] = { - modality: [ - PlaceholderRange(offset=p["offset"], length=p["length"] - 1) - for p in ps - ] - for modality, ps in result["mm_placeholders"].items() - } - - return result - @MULTIMODAL_REGISTRY.register_processor(Phi3VMultiModalProcessor, info=Phi3VProcessingInfo, diff --git a/vllm/model_executor/models/qwen2_audio.py b/vllm/model_executor/models/qwen2_audio.py index 47d56175261e..fc5aed5c94ab 100644 --- a/vllm/model_executor/models/qwen2_audio.py +++ b/vllm/model_executor/models/qwen2_audio.py @@ -36,13 +36,13 @@ from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY -from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig, - MultiModalInputsV2, MultiModalKwargs, - NestedTensors, PlaceholderRange) +from vllm.multimodal.inputs import (MultiModalFieldConfig, MultiModalKwargs, + NestedTensors) from vllm.multimodal.parse import (AudioProcessorItems, MultiModalDataItems, MultiModalDataParser) from vllm.multimodal.processing import (BaseMultiModalProcessor, - BaseProcessingInfo, PromptReplacement) + BaseProcessingInfo, PromptReplacement, + PromptReplacementDetails) from vllm.multimodal.profiling import BaseDummyInputsBuilder, ProcessorInputs from vllm.sequence import IntermediateTensors @@ -188,7 +188,9 @@ def _get_prompt_replacements( hf_processor_mm_kwargs: Mapping[str, object], out_mm_kwargs: MultiModalKwargs, ) -> list[PromptReplacement]: - processor = self.info.get_hf_processor() + processor = self.info.get_hf_processor(**hf_processor_mm_kwargs) + tokenizer = self.info.get_tokenizer() + vocab = tokenizer.get_vocab() # Use getattr with default to be compatible with transformers<4.48 audio_token = getattr(processor, "audio_token", "<|AUDIO|>") @@ -197,6 +199,10 @@ def _get_prompt_replacements( audio_eos_token = getattr(processor, "audio_eos_token", "<|audio_eos|>") + audio_token_id = vocab[audio_token] + audio_bos_id = vocab[audio_bos_token] + audio_eos_id = vocab[audio_eos_token] + feature_attention_mask = out_mm_kwargs.get("feature_attention_mask") if feature_attention_mask is None: audio_output_lengths = [] @@ -208,19 +214,20 @@ def _get_prompt_replacements( audio_output_lengths = audio_output_lens.tolist() def get_replacement_qwen2_audio(item_idx: int): - 
num_placeholders = audio_output_lengths[item_idx] - if num_placeholders == 0: + num_features = audio_output_lengths[item_idx] + if num_features == 0: audios = mm_items.get_items("audio", AudioProcessorItems) audio = audios.get(item_idx) raise ValueError( f"The audio {audio} (len={len(audio)}) is too short " "to be represented inside the model") - return "".join([ - audio_bos_token, - audio_token * num_placeholders, - audio_eos_token, - ]) + audio_tokens = [audio_token_id] * num_features + + return PromptReplacementDetails( + full=[audio_bos_id] + audio_tokens + [audio_eos_id], + features=audio_tokens, + ) return [ PromptReplacement( @@ -240,26 +247,6 @@ def _always_apply_prompt_replacements(self) -> bool: # tokens than the number of audio items) return not hasattr(self.info.get_hf_processor(), "audio_token") - def apply( - self, - prompt: Union[str, list[int]], - mm_data: MultiModalDataDict, - hf_processor_mm_kwargs: Mapping[str, object], - ) -> MultiModalInputsV2: - result = super().apply(prompt, mm_data, hf_processor_mm_kwargs) - - # Only <|AUDIO|> tokens should be considered as placeholders, - # so we ignore the audio_bos_token and audio_eos_token - result["mm_placeholders"] = { - modality: [ - PlaceholderRange(offset=p["offset"] + 1, - length=p["length"] - 2) for p in ps - ] - for modality, ps in result["mm_placeholders"].items() - } - - return result - @MULTIMODAL_REGISTRY.register_processor( Qwen2AudioMultiModalProcessor, diff --git a/vllm/model_executor/models/qwen2_vl.py b/vllm/model_executor/models/qwen2_vl.py index 34d5c8ad089a..a2778ee73810 100644 --- a/vllm/model_executor/models/qwen2_vl.py +++ b/vllm/model_executor/models/qwen2_vl.py @@ -55,7 +55,7 @@ from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.inputs import (ImageItem, ModalityData, MultiModalFieldConfig, MultiModalKwargs, - NestedTensors, VideoItem) + VideoItem) from vllm.multimodal.parse import (ImageSize, ModalityDataItems, MultiModalDataItems, MultiModalDataParser) from vllm.multimodal.processing import (BaseMultiModalProcessor, @@ -953,12 +953,14 @@ def _get_prompt_replacements( hf_processor = self.info.get_hf_processor(**hf_processor_mm_kwargs) image_processor = self.info.get_image_processor( **hf_processor_mm_kwargs) + tokenizer = self.info.get_tokenizer() + vocab = tokenizer.get_vocab() # NOTE: Only Qwen2VLProcessor in transformers 4.47.0 has # image_token and video_token registered placeholder = { - "image": hf_processor.image_token, - "video": hf_processor.video_token, + "image": vocab[hf_processor.image_token], + "video": vocab[hf_processor.video_token], } merge_length = image_processor.merge_size**2 @@ -967,13 +969,13 @@ def get_replacement_qwen2vl(item_idx: int, modality: str): grid_thw = out_mm_kwargs[f"{modality}_grid_thw"][item_idx] assert isinstance(grid_thw, torch.Tensor) - num_tokens = grid_thw.prod().item() // merge_length - return placeholder[modality] * num_tokens + num_tokens = int(grid_thw.prod()) // merge_length + return [placeholder[modality]] * num_tokens return [ PromptReplacement( modality=modality, - target=placeholder[modality], + target=[placeholder[modality]], replacement=partial(get_replacement_qwen2vl, modality=modality), ) for modality in ("image", "video") @@ -1231,7 +1233,7 @@ def _parse_and_validate_multimodal_inputs(self, **kwargs: object) -> dict: return modalities def get_multimodal_embeddings( - self, **kwargs) -> Optional[List[Tuple[NestedTensors, str]]]: + self, **kwargs) -> Optional[tuple[torch.Tensor, ...]]: modalities = 
self._parse_and_validate_multimodal_inputs(**kwargs) if not modalities: @@ -1258,8 +1260,7 @@ def get_multimodal_embeddings( def get_input_embeddings( self, input_ids: torch.Tensor, - multimodal_embeddings: Optional[List[Tuple[NestedTensors, - str]]] = None, + multimodal_embeddings: Optional[tuple[torch.Tensor, ...]] = None, ) -> torch.Tensor: inputs_embeds = self.language_model.get_input_embeddings(input_ids) if multimodal_embeddings is not None: @@ -1268,6 +1269,33 @@ def get_input_embeddings( [self.config.image_token_id, self.config.video_token_id]) return inputs_embeds + def get_input_embeddings_v0( + self, + input_ids: torch.Tensor, + image_input: Optional[tuple[torch.Tensor, ...]] = None, + video_input: Optional[tuple[torch.Tensor, ...]] = None, + ) -> torch.Tensor: + + inputs_embeds = self.get_input_embeddings(input_ids) + if image_input is not None: + image_embeds = self._process_image_input(image_input) + inputs_embeds = merge_multimodal_embeddings( + input_ids, + inputs_embeds, + image_embeds, + placeholder_token_id=self.config.image_token_id, + ) + + if video_input is not None: + video_embeds = self._process_video_input(video_input) + inputs_embeds = merge_multimodal_embeddings( + input_ids, + inputs_embeds, + video_embeds, + placeholder_token_id=self.config.video_token_id, + ) + return inputs_embeds + def forward( self, input_ids: torch.Tensor, @@ -1301,22 +1329,25 @@ def forward( if intermediate_tensors is not None: inputs_embeds = None - # NOTE: In v1, inputs_embeds is always generated at model runner, this - # condition is for v0 compatibility. + # NOTE: In v1, inputs_embeds is always generated at model runner from + # `get_multimodal_embeddings` and `get_input_embeddings`, this + # condition is only for v0 compatibility. elif inputs_embeds is None: - multimodal_embeddings = self.get_multimodal_embeddings(**kwargs) - - # We need to check for usage of mrope here in case there is - # multimodal data. - # TODO (ywang96): move this to model runner in V1. 
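[Editor's illustration] The effect of `merge_multimodal_embeddings` inside `get_input_embeddings_v0` is easiest to see in isolation: embedding rows sitting at placeholder-token positions are overwritten with the encoder features. This is a simplified stand-in with hypothetical tensors and a hypothetical placeholder id, not the vLLM implementation:

.. code-block:: python

    import torch

    def merge_by_placeholder(inputs_embeds: torch.Tensor,
                             input_ids: torch.Tensor,
                             mm_embeds: torch.Tensor,
                             placeholder_token_id: int) -> torch.Tensor:
        # Overwrite the embedding rows at placeholder positions with the
        # multimodal features; the number of placeholders must match the
        # number of feature rows.
        mask = input_ids == placeholder_token_id
        assert int(mask.sum()) == mm_embeds.shape[0]
        inputs_embeds = inputs_embeds.clone()
        inputs_embeds[mask] = mm_embeds.to(inputs_embeds.dtype)
        return inputs_embeds

    # Hypothetical prompt: 6 tokens, 2 image placeholders with id 151655.
    embeds = torch.zeros(6, 8)
    ids = torch.tensor([1, 151655, 151655, 2, 3, 4])
    image_feats = torch.ones(2, 8)
    merged = merge_by_placeholder(embeds, ids, image_feats, 151655)
    assert merged[1].sum() == 8 and merged[0].sum() == 0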
- if multimodal_embeddings is not None and uses_mrope(self.config): - assert positions.ndim == 2 and positions.size(0) == 3, ( - "multimodal section rotary embedding requires " - f"(3, seq_len) positions, but got {positions.size()}") - - inputs_embeds = self.get_input_embeddings(input_ids, - multimodal_embeddings) - input_ids = None + image_input = self._parse_and_validate_image_input(**kwargs) + video_input = self._parse_and_validate_video_input(**kwargs) + + if image_input is None and video_input is None: + inputs_embeds = None + else: + if uses_mrope(self.config): + assert positions.ndim == 2 and positions.size(0) == 3, ( + "multimodal section rotary embedding requires " + f"(3, seq_len) positions, but got {positions.size()}") + inputs_embeds = self.get_input_embeddings_v0( + input_ids, + image_input=image_input, + video_input=video_input) + input_ids = None hidden_states = self.language_model.model( input_ids=input_ids, diff --git a/vllm/model_executor/models/siglip.py b/vllm/model_executor/models/siglip.py index cca42842bc06..1e51018973e8 100644 --- a/vllm/model_executor/models/siglip.py +++ b/vllm/model_executor/models/siglip.py @@ -344,10 +344,14 @@ def __init__( self.config = config self.activation_fn = get_act_fn(config.hidden_act) - - # For quantization, we require the hidden size to be a multiple of 64 - quantizable = (config.hidden_size % 64 == 0 - and config.intermediate_size % 64 == 0) + # Special handling for BNB quantization + if quant_config and quant_config.get_name() == "bitsandbytes": + quantizable = True + else: + # For other quantization, we require the hidden size to be a + # multiple of 64 + quantizable = (config.hidden_size % 64 == 0 + and config.intermediate_size % 64 == 0) self.fc1 = ColumnParallelLinear( config.hidden_size, config.intermediate_size, diff --git a/vllm/model_executor/models/ultravox.py b/vllm/model_executor/models/ultravox.py index 930142238369..d577e545a473 100644 --- a/vllm/model_executor/models/ultravox.py +++ b/vllm/model_executor/models/ultravox.py @@ -205,11 +205,15 @@ def _get_prompt_replacements( out_mm_kwargs: MultiModalKwargs, ) -> list[PromptReplacement]: hf_processor = self.info.get_hf_processor(**hf_processor_mm_kwargs) - placeholder = hf_processor.audio_token_replacement # type: ignore + tokenizer = self.info.get_tokenizer() + vocab = tokenizer.get_vocab() + + replacement_id = vocab[ + hf_processor.audio_token_replacement] # type: ignore def get_replacement_ultravox(item_idx: int): audio_token_len = out_mm_kwargs["audio_token_len"][item_idx] - return placeholder * audio_token_len + return [replacement_id] * int(audio_token_len) # type: ignore return [ PromptReplacement( diff --git a/vllm/model_executor/models/vision.py b/vllm/model_executor/models/vision.py index a1395982af44..57166f05cd9b 100644 --- a/vllm/model_executor/models/vision.py +++ b/vllm/model_executor/models/vision.py @@ -82,23 +82,25 @@ def get_vit_attn_backend(support_fa: bool = False) -> _Backend: if backend_by_env_var is not None: selected_backend = backend_name_to_enum(backend_by_env_var) if selected_backend is None: - # For Volta and Turing GPUs, use xformers instead. 
- device_available = current_platform.has_device_capability(80) - if device_available and support_fa: - from transformers.utils import is_flash_attn_2_available - if is_flash_attn_2_available(): - selected_backend = _Backend.FLASH_ATTN + if current_platform.is_cuda(): + device_available = current_platform.has_device_capability(80) + if device_available and support_fa: + from transformers.utils import is_flash_attn_2_available + if is_flash_attn_2_available(): + selected_backend = _Backend.FLASH_ATTN + else: + logger.warning_once( + "Current `vllm-flash-attn` has a bug inside vision " + "module, so we use xformers backend instead. You can " + "run `pip install flash-attn` to use flash-attention " + "backend.") + selected_backend = _Backend.XFORMERS else: - logger.warning_once( - "Current `vllm-flash-attn` has a bug inside vision module, " - "so we use xformers backend instead. You can run " - "`pip install flash-attn` to use flash-attention backend.") + # For Volta and Turing GPUs, use xformers instead. selected_backend = _Backend.XFORMERS - elif current_platform.is_cpu() or current_platform.is_rocm(): - # ROCM doesn't support xformers - selected_backend = _Backend.TORCH_SDPA else: - selected_backend = _Backend.XFORMERS + # Default to torch SDPA for other non-GPU platforms. + selected_backend = _Backend.TORCH_SDPA return selected_backend diff --git a/vllm/model_executor/models/whisper.py b/vllm/model_executor/models/whisper.py index c1f3bb0ca33c..b8512b735da9 100644 --- a/vllm/model_executor/models/whisper.py +++ b/vllm/model_executor/models/whisper.py @@ -729,7 +729,22 @@ def sample( def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]) -> Set[str]: loader = AutoWeightsLoader(self, skip_prefixes=["proj_out."]) - loaded_weights = [(name, loaded_weight) - for name, loaded_weight in weights] mapper = WeightsMapper({".fc1.": ".mlp.fc1.", ".fc2.": ".mlp.fc2."}) - return loader.load_weights(loaded_weights, mapper=mapper) + # add fake zeros bias for k_proj to state_dict + weights = _create_fake_bias_for_k_proj(weights) + return loader.load_weights(weights, mapper=mapper) + + +def _create_fake_bias_for_k_proj( + weights: Iterable[Tuple[str, torch.Tensor]] +) -> Iterable[Tuple[str, torch.Tensor]]: + """ + Create full zeros bias for k_proj weight in self-attention layers. + So that the bias for k_proj in qkv_proj can be initialized with zeros. 
+ """ + for name, weight in weights: + if ".self_attn.k_proj.weight" in name: + bias = torch.zeros(weight.size(0)) + bias_name = name.replace("weight", "bias") + yield from [(name, weight), (bias_name, bias)] + yield name, weight diff --git a/vllm/multimodal/inputs.py b/vllm/multimodal/inputs.py index 4b6370358521..b35184f6855a 100644 --- a/vllm/multimodal/inputs.py +++ b/vllm/multimodal/inputs.py @@ -491,7 +491,7 @@ def get_items(self, modality: str) -> Sequence[MultiModalKwargsItem]: """ -class MultiModalInputsV2(TypedDict): +class MultiModalInputs(TypedDict): """ Represents the outputs of :class:`vllm.multimodal.processing.BaseMultiModalProcessor`, diff --git a/vllm/multimodal/processing.py b/vllm/multimodal/processing.py index fa199a07b4cf..750646ac6e43 100644 --- a/vllm/multimodal/processing.py +++ b/vllm/multimodal/processing.py @@ -1,7 +1,8 @@ import re from abc import ABC, abstractmethod from collections import defaultdict -from collections.abc import Callable, ItemsView, Iterable, Mapping, Sequence +from collections.abc import (Callable, Generator, ItemsView, Iterable, Mapping, + Sequence) from dataclasses import dataclass, field from functools import lru_cache from typing import (TYPE_CHECKING, Generic, NamedTuple, Optional, Protocol, @@ -18,8 +19,8 @@ from .hasher import MultiModalHasher from .inputs import (MultiModalDataDict, MultiModalFieldConfig, - MultiModalInputsV2, MultiModalKwargs, - MultiModalKwargsItem, PlaceholderRange) + MultiModalInputs, MultiModalKwargs, MultiModalKwargsItem, + PlaceholderRange) from .parse import MultiModalDataItems, MultiModalDataParser if TYPE_CHECKING: @@ -28,23 +29,101 @@ logger = init_logger(__name__) _S = TypeVar("_S", str, list[int]) -_PromptSeq = Union[str, list[int]] + +PromptSeq = Union[str, list[int]] +"""A token sequence (list of token IDs) or text.""" + + +@dataclass +class PromptReplacementDetails: + """Details about the replacement token sequence or text.""" + + full: PromptSeq + """The full replacement.""" + + features: PromptSeq + """ + The part of the replacement that corresponds to feature placeholders; + this will be replaced by the output of the vision encoder during model + inference. + """ + + @staticmethod + def from_seq(seq: PromptSeq) -> "PromptReplacementDetails": + return PromptReplacementDetails(full=seq, features=seq) + + +PromptRepl = Union[PromptSeq, PromptReplacementDetails] +""" +The replacement token sequence or text. + +If only part of the replacement corresponds to feature placeholders, you can +use :class:`PromptReplacementDetails` to specify which part. +""" @dataclass class PromptReplacement: """ Defines how to replace portions of an input prompt with placeholder tokens. + + Example: + + For each image, replace one ```` input placeholder in the prompt + with a number of ```` feature placeholders + equal to the feature size of the vision encoder: + + .. code-block:: python + + PromptReplacement( + modality="image", + target="", + replacement="" * image_feature_size, + ) + + As above, but further pad the feature placeholders with ```` + and ```, which are not supposed to be passed to the vision + encoder: + + .. code-block:: python + + PromptReplacement( + modality="image", + target="", + replacement=PromptReplacementDetails( + full="".join([ + "", + "" * image_feature_size, + "", + ]), + features="" * image_feature_size, + ), + ) + + To avoid unnecessary tokenization during prompt replacement, + we recommended passing token sequences instead of text: + + .. 
code-block:: python + + PromptReplacement( + modality="image", + target=[image_token_id], + replacement=PromptReplacementDetails( + full=([image_bos_id] + [image_token_id] * image_feature_size + + [image_eos_id]), + features=[image_token_id] * image_feature_size, + ), + ) """ modality: str """The modality for which the replacement is made.""" - target: _PromptSeq + target: PromptSeq """The token sequence (or text) to find and replace.""" - replacement: Union[Callable[[int], _PromptSeq], - _PromptSeq] = field(repr=False) + replacement: Union[Callable[[int], PromptRepl], + PromptRepl] = field(repr=False) """ Given the index of the processed item within :attr:`modality`, output the replacement token sequence (or text). @@ -107,11 +186,26 @@ def full_groupby_modality(values: Iterable[_M]) -> ItemsView[str, list[_M]]: @dataclass class _BoundPromptSequence: + """ + A :data:`_PromptSeq` bound to a tokenizer to automatically + convert between token sequence and text representations. + """ tokenizer: AnyTokenizer = field(repr=False) _text: Optional[str] _token_ids: Optional[list[int]] + @staticmethod + def from_seq( + tokenizer: AnyTokenizer, + seq: PromptSeq, + ) -> "_BoundPromptSequence": + return _BoundPromptSequence( + tokenizer=tokenizer, + _text=seq if isinstance(seq, str) else None, + _token_ids=seq if isinstance(seq, list) else None, + ) + def __post_init__(self) -> None: if self._text is None and self._token_ids is None: raise ValueError("At least one of 'text' and 'token_ids' must be " @@ -134,6 +228,12 @@ def token_ids(self) -> list[int]: return self._token_ids +@dataclass +class _BoundPromptReplacementGroup: + full: _BoundPromptSequence + features: _BoundPromptSequence + + @dataclass class BoundPromptReplacement: """ @@ -144,25 +244,19 @@ class BoundPromptReplacement: tokenizer: AnyTokenizer = field(repr=False) modality: str - _target: _PromptSeq - _replacement: Union[Callable[[int], _PromptSeq], - _PromptSeq] = field(repr=False) + _target: PromptSeq + _replacement: Union[Callable[[int], PromptRepl], + PromptRepl] = field(repr=False) def __post_init__(self) -> None: - self._replacement_cache = dict[int, _BoundPromptSequence]() + self._replacement_cache = dict[int, _BoundPromptReplacementGroup]() @property def target(self) -> _BoundPromptSequence: """The token sequence (or text) to find and replace.""" - target = self._target + return _BoundPromptSequence.from_seq(self.tokenizer, self._target) - return _BoundPromptSequence( - tokenizer=self.tokenizer, - _text=target if isinstance(target, str) else None, - _token_ids=target if isinstance(target, list) else None, - ) - - def get_replacement(self, item_idx: int) -> _BoundPromptSequence: + def get_replacement(self, item_idx: int) -> _BoundPromptReplacementGroup: """ Given the index of the processed item within :attr:`modality`, output the replacement token sequence (or text). 
@@ -177,10 +271,16 @@ def get_replacement(self, item_idx: int) -> _BoundPromptSequence: else: cache_key = None - bound_replacement = _BoundPromptSequence( - tokenizer=self.tokenizer, - _text=replacement if isinstance(replacement, str) else None, - _token_ids=replacement if isinstance(replacement, list) else None, + if not isinstance(replacement, PromptReplacementDetails): + replacement = PromptReplacementDetails.from_seq(replacement) + + bound_full = _BoundPromptSequence.from_seq(self.tokenizer, + replacement.full) + bound_features = _BoundPromptSequence.from_seq(self.tokenizer, + replacement.features) + bound_replacement = _BoundPromptReplacementGroup( + full=bound_full, + features=bound_features, ) if cache_key is not None: @@ -197,7 +297,7 @@ class _TokenMatch(NamedTuple): def iter_token_matches( token_ids: list[int], match_ids: list[int], -) -> Iterable[_TokenMatch]: +) -> Generator[_TokenMatch]: """ Yield each occurrence of :code:`match_ids` in :code:`token_ids`. @@ -272,15 +372,15 @@ def end_idx(self) -> int: @dataclass -class PlaceholderInfo: +class PlaceholderFeaturesInfo: modality: str item_idx: int start_idx: int - replacement: list[int] + tokens: list[int] @property def length(self) -> int: - return len(self.replacement) + return len(self.tokens) def to_range(self) -> PlaceholderRange: return PlaceholderRange( @@ -314,7 +414,7 @@ def find_text_matches( def _resolve_matches( - prompt: _PromptSeq, + prompt: PromptSeq, mm_matches: Mapping[str, Sequence[_PromptReplacementMatch]], ) -> list[_PromptReplacementMatch]: """ @@ -362,10 +462,10 @@ def _replace_matches( replacement = repl_info.get_replacement(item_idx) if isinstance(prompt, str): - repl_seq = replacement.text + repl_seq = replacement.full.text out_seqs.append(prompt[prev_end_idx:start_idx] + repl_seq) else: - repl_seq = replacement.token_ids + repl_seq = replacement.full.token_ids out_seqs.append(prompt[prev_end_idx:start_idx] + repl_seq) prev_end_idx = end_idx @@ -408,7 +508,7 @@ def _iter_placeholders( mm_prompt_repls: Mapping[str, Sequence[BoundPromptReplacement]], prompt: list[int], mm_item_counts: Mapping[str, int], -) -> Iterable[PlaceholderInfo]: +) -> Iterable[PlaceholderFeaturesInfo]: """ Yield each set of placeholder tokens found in :code:`prompt`. 
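[Editor's illustration] The feature sub-span is located inside the full replacement by a plain subsequence search; a simplified sketch with hypothetical token ids (the real `iter_token_matches` additionally skips overlapping matches):

.. code-block:: python

    from typing import Iterator, NamedTuple

    class TokenMatch(NamedTuple):
        start_idx: int
        end_idx: int

    def iter_matches(token_ids: list[int],
                     match_ids: list[int]) -> Iterator[TokenMatch]:
        # Yield each occurrence of match_ids inside token_ids (simplified).
        n = len(match_ids)
        for i in range(len(token_ids) - n + 1):
            if token_ids[i:i + n] == match_ids:
                yield TokenMatch(i, i + n)

    # <audio_bos> <AUDIO> <AUDIO> <AUDIO> <audio_eos>  (hypothetical ids)
    full = [9001, 9002, 9002, 9002, 9003]
    features = [9002, 9002, 9002]
    match = next(iter_matches(full, features))
    # The feature placeholders start one token after the full replacement;
    # this offset is what ends up in PlaceholderFeaturesInfo.start_idx.
    assert match.start_idx == 1 and match.end_idx == 4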
@@ -432,23 +532,33 @@ def _iter_placeholders( for repl_info in modality_repls: replacement = repl_info.get_replacement(item_idx) - repl_tokens = replacement.token_ids - repl_len = len(repl_tokens) - end_idx = start_idx + repl_len + repl_tokens_full = replacement.full.token_ids + repl_len_full = len(repl_tokens_full) + end_idx_full = start_idx + repl_len_full - if repl_len == 0 or end_idx > prompt_len: + if repl_len_full == 0 or end_idx_full > prompt_len: continue - if prompt[start_idx:end_idx] == repl_tokens: - yield PlaceholderInfo( - modality=modality, - item_idx=item_idx, - start_idx=start_idx, - replacement=repl_tokens, - ) + if prompt[start_idx:end_idx_full] == repl_tokens_full: + repl_tokens_feat = replacement.features.token_ids + + try: + match = next( + iter_token_matches(repl_tokens_full, + repl_tokens_feat)) + yield PlaceholderFeaturesInfo( + modality=modality, + item_idx=item_idx, + start_idx=start_idx + match.start_idx, + tokens=repl_tokens_feat, + ) + except StopIteration: + raise AssertionError( + f"{repl_tokens_feat=} should be a " + f"subsequence of {repl_tokens_full=}") from None # Exclude overlapping matches - start_idx = end_idx + start_idx = end_idx_full item_idx_by_modality[modality] += 1 found = True break @@ -464,7 +574,7 @@ def find_mm_placeholders( mm_prompt_repls: Mapping[str, Sequence[BoundPromptReplacement]], prompt: list[int], mm_item_counts: Mapping[str, int], -) -> Mapping[str, list[PlaceholderInfo]]: +) -> Mapping[str, list[PlaceholderFeaturesInfo]]: it = _iter_placeholders(mm_prompt_repls, prompt, mm_item_counts) return dict(full_groupby_modality(it)) @@ -609,7 +719,7 @@ def __call__( prompt: str, mm_data: MultiModalDataDict, hf_processor_mm_kwargs: Mapping[str, object], - ) -> MultiModalInputsV2: + ) -> MultiModalInputs: return self.apply(prompt, mm_data, hf_processor_mm_kwargs) def _get_data_parser(self) -> MultiModalDataParser: @@ -679,7 +789,7 @@ def _find_mm_placeholders( mm_prompt_repls: Mapping[str, Sequence[BoundPromptReplacement]], new_token_ids: list[int], mm_item_counts: Mapping[str, int], - ) -> Mapping[str, list[PlaceholderInfo]]: + ) -> Mapping[str, list[PlaceholderFeaturesInfo]]: return find_mm_placeholders(mm_prompt_repls, new_token_ids, mm_item_counts) @@ -948,7 +1058,7 @@ def _apply_prompt_replacements( token_ids: list[int], mm_prompt_repls: Mapping[str, Sequence[BoundPromptReplacement]], mm_item_counts: Mapping[str, int], - ) -> tuple[list[int], str, Mapping[str, list[PlaceholderInfo]]]: + ) -> tuple[list[int], str, Mapping[str, list[PlaceholderFeaturesInfo]]]: tokenizer = self.info.get_tokenizer() mm_token_matches = { @@ -1037,7 +1147,7 @@ def _validate_mm_kwargs( def _validate_mm_placeholders( self, - mm_placeholders: Mapping[str, list[PlaceholderInfo]], + mm_placeholders: Mapping[str, list[PlaceholderFeaturesInfo]], mm_item_counts: Mapping[str, int], *, allow_missing: bool = False, @@ -1067,7 +1177,7 @@ def apply( prompt: Union[str, list[int]], mm_data: MultiModalDataDict, hf_processor_mm_kwargs: Mapping[str, object], - ) -> MultiModalInputsV2: + ) -> MultiModalInputs: """ Process multi-modal inputs to be used in vLLM. 
@@ -1169,7 +1279,7 @@ def apply( for modality, placeholders in mm_placeholders.items() } - return MultiModalInputsV2( + return MultiModalInputs( type="multimodal", prompt=prompt, prompt_token_ids=prompt_ids, diff --git a/vllm/multimodal/profiling.py b/vllm/multimodal/profiling.py index ec580cd6ecdd..c68edaff8016 100644 --- a/vllm/multimodal/profiling.py +++ b/vllm/multimodal/profiling.py @@ -11,7 +11,7 @@ from vllm.inputs import DummyData from vllm.logger import init_logger -from .inputs import MultiModalDataDict, MultiModalInputsV2 +from .inputs import MultiModalDataDict, MultiModalInputs from .processing import BaseMultiModalProcessor, BaseProcessingInfo logger = init_logger(__name__) @@ -106,7 +106,7 @@ def processing_info(self) -> BaseProcessingInfo: def dummy_inputs(self) -> BaseDummyInputsBuilder[_I]: return self.processor.dummy_inputs - def _get_mm_limits(self) -> Mapping[str, int]: + def get_mm_limits(self) -> Mapping[str, int]: mm_config = self.processing_info.ctx.get_mm_config() mm_limit_per_prompt = mm_config.limit_per_prompt @@ -131,7 +131,7 @@ def _get_dummy_mm_inputs( self, seq_len: int, mm_counts: Mapping[str, int], - ) -> MultiModalInputsV2: + ) -> MultiModalInputs: factory = self.dummy_inputs processor_inputs = factory.get_dummy_processor_inputs( seq_len, mm_counts) @@ -146,7 +146,7 @@ def get_dummy_data(self, seq_len: int) -> DummyData: # Avoid circular import from vllm.sequence import SequenceData - mm_counts = self._get_mm_limits() + mm_counts = self.get_mm_limits() info = self.processing_info mm_max_tokens_per_item = info.get_mm_max_tokens_per_item(seq_len) diff --git a/vllm/multimodal/registry.py b/vllm/multimodal/registry.py index aaf7ff34ca57..7a4b85385cac 100644 --- a/vllm/multimodal/registry.py +++ b/vllm/multimodal/registry.py @@ -17,7 +17,7 @@ from .inputs import MultiModalDataDict, MultiModalKwargs, NestedTensors from .processing import (BaseMultiModalProcessor, BaseProcessingInfo, ProcessingCache) -from .profiling import BaseDummyInputsBuilder +from .profiling import BaseDummyInputsBuilder, MultiModalProfiler from .utils import cached_get_tokenizer from .video import VideoPlugin @@ -282,13 +282,13 @@ def get_max_tokens_per_item_by_nonzero_modality( This is currently directly used only in V1 for profiling the memory usage of a model. """ - limits_per_plugin = self._limits_by_model[model_config] + mm_limits = self.get_mm_limits_per_prompt(model_config) return { key: max_tokens_per_mm_item for key, max_tokens_per_mm_item in self.get_max_tokens_per_item_by_modality(model_config).items() - if limits_per_plugin[key] > 0 + if mm_limits[key] > 0 } def get_max_tokens_by_modality( @@ -304,10 +304,10 @@ def get_max_tokens_by_modality( Note: This should be called after :meth:`init_mm_limits_per_prompt`. """ - limits_per_plugin = self._limits_by_model[model_config] + mm_limits = self.get_mm_limits_per_prompt(model_config) return { - key: limits_per_plugin[key] * max_tokens_per_mm_item + key: mm_limits[key] * max_tokens_per_mm_item for key, max_tokens_per_mm_item in self.get_max_tokens_per_item_by_modality(model_config).items() } @@ -371,6 +371,15 @@ def get_mm_limits_per_prompt( Note: This should be called after :meth:`init_mm_limits_per_prompt`. 
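[Editor's illustration] The two registry methods touched above reduce to a per-modality multiply-and-filter over the prompt limits; with hypothetical numbers:

.. code-block:: python

    # Hypothetical limits and per-item token counts.
    mm_limits = {"image": 4, "audio": 0}
    max_tokens_per_item = {"image": 576, "audio": 750}

    # get_max_tokens_per_item_by_nonzero_modality(): drop zero-limit modalities.
    nonzero = {k: v for k, v in max_tokens_per_item.items() if mm_limits[k] > 0}

    # get_max_tokens_by_modality(): total profiling budget per modality.
    totals = {k: mm_limits[k] * v for k, v in max_tokens_per_item.items()}

    assert nonzero == {"image": 576}
    assert totals == {"image": 2304, "audio": 0}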
""" + if self.has_processor(model_config): + tokenizer = cached_get_tokenizer( + model_config.tokenizer, + trust_remote_code=model_config.trust_remote_code, + ) + processor = self.create_processor(model_config, tokenizer) + profiler = MultiModalProfiler(processor) + return profiler.get_mm_limits() + return self._limits_by_model[model_config] def register_processor( diff --git a/vllm/multimodal/utils.py b/vllm/multimodal/utils.py index 1c6bbf77b926..900bed5929b3 100644 --- a/vllm/multimodal/utils.py +++ b/vllm/multimodal/utils.py @@ -1,4 +1,5 @@ from functools import lru_cache +from itertools import groupby from pathlib import Path from typing import TYPE_CHECKING, Optional, TypeVar, Union from urllib.parse import ParseResult, urlparse @@ -26,7 +27,7 @@ if TYPE_CHECKING: from .hasher import MultiModalHashDict - from .inputs import MultiModalPlaceholderDict + from .inputs import MultiModalKwargs, MultiModalPlaceholderDict class MediaConnector: @@ -477,3 +478,39 @@ def merge_and_sort_multimodal_metadata( merged_hashes = None return sorted_modalities, merged_placeholders, merged_hashes + + +def group_mm_inputs_by_modality( + mm_inputs: list["MultiModalKwargs"]) -> list[list["MultiModalKwargs"]]: + """Group consecutive MultiModalKwargs from mm_inputs with the same modality + together into the same list for batching purpose. For MultiModalKwargs with + multiple modalities, put them into their own list. + + Args: + mm_inputs: List of MultiModalKwargs. + + Returns: + list[list[MultiModalKwargs]]: List of list of MultiModalKwargs, each + inner list contains consecutive MultiModalKwargs with same modality, or + one with multimodal modalities. + """ + if not mm_inputs: + return [] + + def modality_group_func(mm_input: "MultiModalKwargs") -> Union[str, int]: + # If the input has multiple modalities, return a id as the unique key + # for the mm_input input. + if len(mm_input.modalities) > 1: + return id(mm_input) + + elif len(mm_input.modalities) == 1: + return list(mm_input.modalities)[0] + + # FIXME(Isotr0py): Modality of mm_input from legacy pipeline is empty, + # this is used to make InternVL with legacy pipeline still work with v1. 
+ else: + return "" + + return [ + list(group) for _, group in groupby(mm_inputs, key=modality_group_func) + ] diff --git a/vllm/outputs.py b/vllm/outputs.py index b519c159b153..25b2265285d1 100644 --- a/vllm/outputs.py +++ b/vllm/outputs.py @@ -1,6 +1,6 @@ import time from dataclasses import dataclass -from typing import Dict, Generic, List, Optional +from typing import Dict, Generic, List, MutableSequence, Optional from typing import Sequence as GenericSequence from typing import Union @@ -162,6 +162,26 @@ def new( finished=finished, ) + def add(self, next_output: "RequestOutput") -> None: + """Merge subsequent RequestOutput into this one""" + + self.prompt = next_output.prompt + self.prompt_token_ids = next_output.prompt_token_ids + self.prompt_logprobs = next_output.prompt_logprobs + self.finished |= next_output.finished + + #TODO assuming n == 1 for now + completion = self.outputs[0] + next_completion = next_output.outputs[0] + completion.text += next_completion.text + if not isinstance(completion.token_ids, MutableSequence): + completion.token_ids = list(completion.token_ids) + completion.token_ids.extend(next_completion.token_ids) + if next_completion.logprobs: + assert completion.logprobs is not None + completion.logprobs.extend(next_completion.logprobs) + completion.cumulative_logprob = next_completion.cumulative_logprob + @classmethod def from_seq_group( cls, seq_group: SequenceGroup, use_cache: bool, @@ -172,9 +192,9 @@ def from_seq_group( if seq_group.request_id in seq_id_to_seq_group: group: SequenceGroupBase = seq_id_to_seq_group[ seq_group.request_id] + assembled_seq_group = group.maybe_assemble_group(seq_group) if finished: group.finish_seq(seq_group) - assembled_seq_group = group.maybe_assemble_group(seq_group) if assembled_seq_group is None: return None return cls.from_seq_group(assembled_seq_group, use_cache, diff --git a/vllm/platforms/__init__.py b/vllm/platforms/__init__.py index 6ca95b41dbb0..ddbdc43ca571 100644 --- a/vllm/platforms/__init__.py +++ b/vllm/platforms/__init__.py @@ -101,6 +101,10 @@ def cpu_platform_plugin() -> Optional[str]: try: from importlib.metadata import version is_cpu = "cpu" in version("vllm") + if not is_cpu: + import platform + is_cpu = platform.machine().lower().startswith("arm") + except Exception: pass @@ -213,8 +217,11 @@ def __getattr__(name: str): global _init_trace _init_trace = "".join(traceback.format_stack()) return _current_platform - else: + elif name in globals(): return globals()[name] + else: + raise AttributeError( + f"No attribute named '{name}' exists in {__name__}.") __all__ = [ diff --git a/vllm/platforms/hpu.py b/vllm/platforms/hpu.py index 242c2c127979..a32c262c84ef 100644 --- a/vllm/platforms/hpu.py +++ b/vllm/platforms/hpu.py @@ -1,7 +1,9 @@ +import os from typing import TYPE_CHECKING, Optional import torch +from vllm import envs from vllm.logger import init_logger from .interface import Platform, PlatformEnum, _Backend @@ -58,6 +60,22 @@ def check_and_update_config(cls, vllm_config: VllmConfig) -> None: cache_config = vllm_config.cache_config if cache_config and cache_config.block_size is None: cache_config.block_size = 128 + if (parallel_config.distributed_executor_backend == 'mp' + and envs.VLLM_WORKER_MULTIPROC_METHOD == 'fork'): + if os.environ.get("VLLM_WORKER_MULTIPROC_METHOD", + None) is not None: + logger.warning("On HPU, VLLM_WORKER_MULTIPROC_METHOD=fork " + "might cause application hangs on exit. 
Using " + "VLLM_WORKER_MULTIPROC_METHOD=fork anyway, " + "as it was explicitly requested.") + else: + logger.warning( + "On HPU, VLLM_WORKER_MULTIPROC_METHOD=fork " + "might cause application hangs on exit. Setting " + "VLLM_WORKER_MULTIPROC_METHOD to 'spawn'. " + "To override that behavior, please set " + "VLLM_WORKER_MULTIPROC_METHOD=fork explicitly.") + os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn" @classmethod def is_pin_memory_available(cls): diff --git a/vllm/prompt_adapter/utils.py b/vllm/prompt_adapter/utils.py index 473b87c89c21..8b2732923c4e 100644 --- a/vllm/prompt_adapter/utils.py +++ b/vllm/prompt_adapter/utils.py @@ -89,6 +89,7 @@ def load_peft_weights(model_id: str, adapters_weights = safe_load_file(filename, device=device) else: adapters_weights = torch.load(filename, - map_location=torch.device(device)) + map_location=torch.device(device), + weights_only=True) return adapters_weights diff --git a/vllm/sequence.py b/vllm/sequence.py index 5857f656dfc1..74320db709f9 100644 --- a/vllm/sequence.py +++ b/vllm/sequence.py @@ -815,7 +815,9 @@ def set_finished_time(self, time: Optional[float]) -> None: def get_max_num_running_seqs(self) -> int: """The maximum number of sequences running in parallel in the remaining lifetime of the request.""" - return 0 if self.first_seq.is_finished() else 1 + if self.is_single_seq: + return 0 if self.first_seq.is_finished() else 1 + return self.num_seqs() - self.num_finished_seqs() def get_seqs( self, @@ -824,7 +826,10 @@ def get_seqs( if status is None: return self.seqs - return self.seqs if self.first_seq.status == status else [] + if self.is_single_seq: + return self.seqs if self.first_seq.status == status else [] + + return [seq for seq in self.seqs if seq.status == status] def is_encoder_decoder(self) -> bool: return self.encoder_seq is not None @@ -833,19 +838,22 @@ def get_encoder_seq(self) -> Optional[Sequence]: return self.encoder_seq def get_finished_seqs(self) -> List[Sequence]: - return self.seqs if self.first_seq.is_finished() else [] + if self.is_single_seq: + return self.seqs if self.first_seq.is_finished() else [] + + return [seq for seq in self.seqs if seq.is_finished()] def update_num_computed_tokens(self, num_new_computed_tokens: int): """Update number of tokens computed so far.""" - seq = self.first_seq - if not seq.is_finished(): - seq.data.update_num_computed_tokens(num_new_computed_tokens) + for seq in self.seqs: + if not seq.is_finished(): + seq.data.update_num_computed_tokens(num_new_computed_tokens) def get_num_uncomputed_tokens(self) -> int: num_uncomputed_tokens = 0 - seq = self.first_seq - if not seq.is_finished(): - num_uncomputed_tokens += seq.data.get_num_uncomputed_tokens() + for seq in self.seqs: + if not seq.is_finished(): + num_uncomputed_tokens += seq.data.get_num_uncomputed_tokens() return num_uncomputed_tokens def num_seqs(self, status: Optional[SequenceStatus] = None) -> int: @@ -860,10 +868,14 @@ def num_seqs(self, status: Optional[SequenceStatus] = None) -> int: return len(self.get_seqs(status)) def num_finished_seqs(self) -> int: - return 1 if self.first_seq.is_finished() else 0 + if self.is_single_seq: + return 1 if self.seqs[0].is_finished() else 0 + return len(self.get_finished_seqs()) def is_finished(self) -> bool: - return self.first_seq.is_finished() + if self.is_single_seq: + return self.first_seq.is_finished() + return all(seq.is_finished() for seq in self.seqs) def is_prefill(self) -> bool: return self.first_seq.is_prefill() @@ -1391,13 +1403,15 @@ class 
ParallelSampleSequenceGroup(SequenceGroupBase): @staticmethod def add_request(request_id: str, engine, params, **kwargs): original_params = params - params = original_params.clone() - params.n = 1 group = ParallelSampleSequenceGroup(request_id) seqs = [] for i in range(original_params.n): request_id_i = f"{request_id}_parallel_sample_{i}" group.seq_id_to_index[request_id_i] = i + params = copy.deepcopy(original_params) + params.n = 1 + if params.seed is not None: + params.seed += i seq_group = engine._add_processed_request( request_id_i, params=params, @@ -1432,33 +1446,34 @@ def maybe_assemble_group( self, seq_group: SequenceGroup) -> Optional[SequenceGroup]: # in the streaming mode, we will return the assembled sequence - # for the first sequence, and then return None for the rest of - # sequences + # for the first remaining sequence, and then return None for the + # rest of sequences if self.streaming: - if self.seq_id_to_index[seq_group.request_id] == 0: + first_remaining_id = next(iter(self.to_be_finished)) + if seq_group.request_id == first_remaining_id: return self.assembled_seq_group return None # in the non-streaming mode, we will return the assembled sequence - # once after all sequences finish, and then return None for the + # when the last sequences finishes, and then return None for the # rest of the time - - if len(self.to_be_finished) > 0: - return None - - assert self.assembled_seq_group is not None - params = self.assembled_seq_group.sampling_params - assert isinstance(params, SamplingParams) - if not self.output_produced: - self.output_produced = True - if params._real_n is not None: - # Get the top-n sequences. - n = params._real_n or params.n - seqs = self.assembled_seq_group.seqs - sorting_key = lambda seq: seq.get_cumulative_logprob() - sorted_seqs = sorted(seqs, key=sorting_key, reverse=True) - top_n_seqs = sorted_seqs[:n] - self.assembled_seq_group.seqs = top_n_seqs - return self.assembled_seq_group - if self.output_produced: - return None + if (len(self.to_be_finished) == 1 + and seq_group.request_id in self.to_be_finished + and seq_group.is_finished()): + assert self.assembled_seq_group is not None + params = self.assembled_seq_group.sampling_params + assert isinstance(params, SamplingParams) + if not self.output_produced: + self.output_produced = True + if params._real_n is not None: + # Get the top-n sequences. 
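[Editor's illustration] The net effect of the `ParallelSampleSequenceGroup.add_request` change is easiest to see in isolation: each of the n samples gets its own deep-copied params with n=1 and, when a seed is set, a distinct seed. A sketch with a stand-in for `SamplingParams`:

.. code-block:: python

    import copy
    from dataclasses import dataclass
    from typing import Optional

    @dataclass
    class FakeSamplingParams:  # stand-in for vllm.SamplingParams
        n: int = 1
        seed: Optional[int] = None

    def fan_out(request_id: str, original: FakeSamplingParams):
        children = []
        for i in range(original.n):
            params = copy.deepcopy(original)
            params.n = 1
            if params.seed is not None:
                params.seed += i
            children.append((f"{request_id}_parallel_sample_{i}", params))
        return children

    children = fan_out("req-0", FakeSamplingParams(n=3, seed=42))
    assert [p.seed for _, p in children] == [42, 43, 44]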
+ n = params._real_n or params.n + seqs = self.assembled_seq_group.seqs + sorting_key = lambda seq: seq.get_cumulative_logprob() + sorted_seqs = sorted(seqs, key=sorting_key, reverse=True) + top_n_seqs = sorted_seqs[:n] + self.assembled_seq_group.seqs = top_n_seqs + return self.assembled_seq_group + if self.output_produced: + return None + return None diff --git a/vllm/tracing.py b/vllm/tracing.py index 50068d8cf9c2..72a3f85118d3 100644 --- a/vllm/tracing.py +++ b/vllm/tracing.py @@ -16,7 +16,6 @@ OTEL_EXPORTER_OTLP_TRACES_PROTOCOL) from opentelemetry.sdk.trace import TracerProvider from opentelemetry.sdk.trace.export import BatchSpanProcessor - from opentelemetry.semconv_ai import SpanAttributes as BaseSpanAttributes from opentelemetry.trace import SpanKind, Tracer, set_tracer_provider from opentelemetry.trace.propagation.tracecontext import ( TraceContextTextMapPropagator) @@ -92,21 +91,30 @@ def extract_trace_headers(headers: Mapping[str, str]) -> Mapping[str, str]: return {h: headers[h] for h in TRACE_HEADERS if h in headers} -class SpanAttributes(BaseSpanAttributes): - # The following span attribute names are added here because they are missing - # from the Semantic Conventions for LLM. - LLM_REQUEST_ID = "gen_ai.request.id" - LLM_REQUEST_N = "gen_ai.request.n" - LLM_USAGE_NUM_SEQUENCES = "gen_ai.usage.num_sequences" - LLM_LATENCY_TIME_IN_QUEUE = "gen_ai.latency.time_in_queue" - LLM_LATENCY_TIME_TO_FIRST_TOKEN = "gen_ai.latency.time_to_first_token" - LLM_LATENCY_E2E = "gen_ai.latency.e2e" - LLM_LATENCY_TIME_IN_SCHEDULER = "gen_ai.latency.time_in_scheduler" +class SpanAttributes: + # Attribute names copied from here to avoid version conflicts: + # https://github.com/open-telemetry/semantic-conventions/blob/main/docs/gen-ai/gen-ai-spans.md + GEN_AI_USAGE_COMPLETION_TOKENS = "gen_ai.usage.completion_tokens" + GEN_AI_USAGE_PROMPT_TOKENS = "gen_ai.usage.prompt_tokens" + GEN_AI_REQUEST_MAX_TOKENS = "gen_ai.request.max_tokens" + GEN_AI_REQUEST_TOP_P = "gen_ai.request.top_p" + GEN_AI_REQUEST_TEMPERATURE = "gen_ai.request.temperature" + GEN_AI_RESPONSE_MODEL = "gen_ai.response.model" + # Attribute names added until they are added to the semantic conventions: + GEN_AI_REQUEST_ID = "gen_ai.request.id" + GEN_AI_REQUEST_N = "gen_ai.request.n" + GEN_AI_USAGE_NUM_SEQUENCES = "gen_ai.usage.num_sequences" + GEN_AI_LATENCY_TIME_IN_QUEUE = "gen_ai.latency.time_in_queue" + GEN_AI_LATENCY_TIME_TO_FIRST_TOKEN = "gen_ai.latency.time_to_first_token" + GEN_AI_LATENCY_E2E = "gen_ai.latency.e2e" + GEN_AI_LATENCY_TIME_IN_SCHEDULER = "gen_ai.latency.time_in_scheduler" # Time taken in the forward pass for this across all workers - LLM_LATENCY_TIME_IN_MODEL_FORWARD = "gen_ai.latency.time_in_model_forward" + GEN_AI_LATENCY_TIME_IN_MODEL_FORWARD = ( + "gen_ai.latency.time_in_model_forward") # Time taken in the model execute function. This will include model # forward, block/sync across workers, cpu-gpu sync time and sampling time. 
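[Editor's illustration] Since `SpanAttributes` is now just a namespace of string constants, emitting them needs only the core OpenTelemetry API. A minimal sketch, assuming the `opentelemetry-api` package is installed; exporter setup is omitted and the attribute values are hypothetical:

.. code-block:: python

    from opentelemetry import trace

    from vllm.tracing import SpanAttributes

    tracer = trace.get_tracer("vllm.example")

    with tracer.start_as_current_span("llm_request",
                                      kind=trace.SpanKind.SERVER) as span:
        span.set_attribute(SpanAttributes.GEN_AI_REQUEST_ID, "req-123")
        span.set_attribute(SpanAttributes.GEN_AI_USAGE_PROMPT_TOKENS, 17)
        span.set_attribute(SpanAttributes.GEN_AI_LATENCY_TIME_TO_FIRST_TOKEN,
                           0.042)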
- LLM_LATENCY_TIME_IN_MODEL_EXECUTE = "gen_ai.latency.time_in_model_execute" + GEN_AI_LATENCY_TIME_IN_MODEL_EXECUTE = ( + "gen_ai.latency.time_in_model_execute") def contains_trace_headers(headers: Mapping[str, str]) -> bool: diff --git a/vllm/transformers_utils/s3_utils.py b/vllm/transformers_utils/s3_utils.py index 6ae68161bbd9..74a56cbf57ec 100644 --- a/vllm/transformers_utils/s3_utils.py +++ b/vllm/transformers_utils/s3_utils.py @@ -145,7 +145,8 @@ def pull_files(self, return for file in files: - destination_file = self.dir + file.removeprefix(base_dir) + destination_file = os.path.join(self.dir, + file.removeprefix(base_dir)) local_dir = Path(destination_file).parent os.makedirs(local_dir, exist_ok=True) self.s3.download_file(bucket_name, file, destination_file) diff --git a/vllm/transformers_utils/tokenizer.py b/vllm/transformers_utils/tokenizer.py index 294262484f2f..1f1d67fabb24 100644 --- a/vllm/transformers_utils/tokenizer.py +++ b/vllm/transformers_utils/tokenizer.py @@ -67,9 +67,10 @@ def get_cached_tokenizer(tokenizer: AnyTokenizer) -> AnyTokenizer: tokenizer_all_special_tokens_extended = ( tokenizer.all_special_tokens_extended) tokenizer_all_special_tokens = set(tokenizer.all_special_tokens) + tokenizer_vocab = tokenizer.get_vocab() tokenizer_len = len(tokenizer) - max_token_id = max(tokenizer.get_vocab().values()) + max_token_id = max(tokenizer_vocab.values()) # Some tokenizers (e.g., QwenTokenizer) have special tokens that # are added and included in the implementation of the vocab_size # property, but not in get_vocab(); if there is an implementation @@ -96,6 +97,9 @@ def all_special_tokens_extended(self): def max_token_id(self): return max_token_id + def get_vocab(self): + return tokenizer_vocab + def __len__(self): return tokenizer_len diff --git a/vllm/usage/usage_lib.py b/vllm/usage/usage_lib.py index 841df3994fba..7f5cc906382a 100644 --- a/vllm/usage/usage_lib.py +++ b/vllm/usage/usage_lib.py @@ -130,6 +130,7 @@ def __init__(self) -> None: self.total_memory: Optional[int] = None self.architecture: Optional[str] = None self.platform: Optional[str] = None + self.cuda_runtime: Optional[str] = None self.gpu_count: Optional[int] = None self.gpu_type: Optional[str] = None self.gpu_memory_per_device: Optional[int] = None @@ -169,6 +170,8 @@ def _report_usage_once(self, model_architecture: str, self.gpu_count = torch.cuda.device_count() self.gpu_type = device_property.name self.gpu_memory_per_device = device_property.total_memory + if current_platform.is_cuda(): + self.cuda_runtime = torch.version.cuda self.provider = _detect_cloud_provider() self.architecture = platform.machine() self.platform = platform.platform() diff --git a/vllm/v1/attention/backends/flash_attn.py b/vllm/v1/attention/backends/flash_attn.py old mode 100644 new mode 100755 index fd36ea8d8806..ce83b1fac6c0 --- a/vllm/v1/attention/backends/flash_attn.py +++ b/vllm/v1/attention/backends/flash_attn.py @@ -9,8 +9,15 @@ from vllm.attention.backends.abstract import (AttentionBackend, AttentionImpl, AttentionMetadata, AttentionType) +from vllm.envs import VLLM_FLASH_ATTN_VERSION +from vllm.logger import init_logger +from vllm.platforms import current_platform from vllm.utils import cdiv -from vllm.vllm_flash_attn import flash_attn_varlen_func +from vllm.vllm_flash_attn import (fa_version_unsupported_reason, + flash_attn_varlen_func, + is_fa_version_supported) + +logger = init_logger(__name__) class FlashAttentionBackend(AttentionBackend): @@ -63,7 +70,7 @@ class FlashAttentionMetadata: max_query_len: int 
query_start_loc: torch.Tensor max_seq_len: int - seq_start_loc: torch.Tensor + seq_lens: torch.Tensor block_table: torch.Tensor slot_mapping: torch.Tensor @@ -71,8 +78,8 @@ class FlashAttentionMetadata: use_cascade: bool common_prefix_len: int cu_prefix_query_lens: Optional[torch.Tensor] - cu_prefix_kv_lens: Optional[torch.Tensor] - cu_suffix_kv_lens: Optional[torch.Tensor] + prefix_kv_lens: Optional[torch.Tensor] + suffix_kv_lens: Optional[torch.Tensor] # For logging. num_input_tokens: int = 0 # Number of tokens including padding. @@ -128,6 +135,25 @@ def __init__( "are not implemented for " "FlashAttentionImpl") + # if hopper default to FA3, otherwise stick to FA2 for now + # TODO(lucas): profile FA3 on ampere to see if it makes sense to + # use FA3 as default for both + if current_platform.get_device_capability()[0] >= 9: + self.fa_version = 3 if is_fa_version_supported(3) else 2 + else: + self.fa_version = 2 + + if VLLM_FLASH_ATTN_VERSION is not None: + assert VLLM_FLASH_ATTN_VERSION in [2, 3] + self.fa_version = VLLM_FLASH_ATTN_VERSION + + if not is_fa_version_supported(self.fa_version): + logger.error("Cannot use FA version %d is not supported due to %s", + self.fa_version, + fa_version_unsupported_reason(self.fa_version)) + + assert is_fa_version_supported(self.fa_version) + def forward( self, layer: torch.nn.Module, @@ -149,10 +175,6 @@ def forward( Returns: shape = [num_tokens, num_heads * head_size] """ - # NOTE(woosuk): FlashAttention does not support FP8 KV cache. - assert layer._k_scale == 1.0 and layer._v_scale == 1.0, ( - "key/v_scale is not supported in FlashAttention.") - assert output is not None, "Output tensor must be provided." if attn_metadata is None: @@ -196,7 +218,7 @@ def forward( out=output[:num_actual_tokens], cu_seqlens_q=attn_metadata.query_start_loc, max_seqlen_q=attn_metadata.max_query_len, - cu_seqlens_k=attn_metadata.seq_start_loc, + seqused_k=attn_metadata.seq_lens, max_seqlen_k=attn_metadata.max_seq_len, softmax_scale=self.scale, causal=True, @@ -204,6 +226,7 @@ def forward( window_size=self.sliding_window, block_table=attn_metadata.block_table, softcap=self.logits_soft_cap, + fa_version=self.fa_version, ) return output @@ -216,8 +239,8 @@ def forward( cu_query_lens=attn_metadata.query_start_loc, max_query_len=attn_metadata.max_query_len, cu_prefix_query_lens=attn_metadata.cu_prefix_query_lens, - cu_prefix_kv_lens=attn_metadata.cu_prefix_kv_lens, - cu_suffix_kv_lens=attn_metadata.cu_suffix_kv_lens, + prefix_kv_lens=attn_metadata.prefix_kv_lens, + suffix_kv_lens=attn_metadata.suffix_kv_lens, max_kv_len=attn_metadata.max_seq_len, softmax_scale=self.scale, alibi_slopes=self.alibi_slopes, @@ -225,6 +248,7 @@ def forward( logits_soft_cap=self.logits_soft_cap, block_table=attn_metadata.block_table, common_prefix_len=attn_metadata.common_prefix_len, + fa_version=self.fa_version, ) return output @@ -305,8 +329,8 @@ def cascade_attention( cu_query_lens: torch.Tensor, max_query_len: int, cu_prefix_query_lens: torch.Tensor, - cu_prefix_kv_lens: torch.Tensor, - cu_suffix_kv_lens: torch.Tensor, + prefix_kv_lens: torch.Tensor, + suffix_kv_lens: torch.Tensor, max_kv_len: int, softmax_scale: float, alibi_slopes: Optional[torch.Tensor], @@ -314,6 +338,7 @@ def cascade_attention( logits_soft_cap: float, block_table: torch.Tensor, common_prefix_len: int, + fa_version: int, ) -> torch.Tensor: assert alibi_slopes is None, ("Cascade attention does not support ALiBi.") # TODO: Support sliding window. 
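[Editor's illustration] The version selection added to `FlashAttentionImpl.__init__` boils down to a small precedence rule: default by device capability, with an explicit `VLLM_FLASH_ATTN_VERSION` override winning. A condensed restatement, not the actual implementation:

.. code-block:: python

    from typing import Callable, Optional

    def pick_fa_version(device_major: int,
                        env_override: Optional[int],
                        is_supported: Callable[[int], bool]) -> int:
        # Default to FA3 on Hopper (SM90+) when available, else FA2.
        version = 3 if device_major >= 9 and is_supported(3) else 2
        # An explicit VLLM_FLASH_ATTN_VERSION takes precedence.
        if env_override is not None:
            assert env_override in (2, 3)
            version = env_override
        assert is_supported(version)
        return version

    assert pick_fa_version(9, None, lambda v: True) == 3
    assert pick_fa_version(8, None, lambda v: True) == 2
    assert pick_fa_version(9, 2, lambda v: True) == 2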
@@ -332,7 +357,7 @@ def cascade_attention( k=key_cache, v=value_cache, cu_seqlens_q=cu_prefix_query_lens, - cu_seqlens_k=cu_prefix_kv_lens, + seqused_k=prefix_kv_lens, max_seqlen_q=num_tokens, max_seqlen_k=common_prefix_len, softmax_scale=softmax_scale, @@ -341,6 +366,7 @@ def cascade_attention( block_table=block_table[:1], softcap=logits_soft_cap, return_softmax_lse=True, + fa_version=fa_version, ) # Process suffix per query. @@ -349,7 +375,7 @@ def cascade_attention( k=key_cache, v=value_cache, cu_seqlens_q=cu_query_lens, - cu_seqlens_k=cu_suffix_kv_lens, + seqused_k=suffix_kv_lens, max_seqlen_q=max_query_len, max_seqlen_k=max_kv_len - common_prefix_len, softmax_scale=softmax_scale, @@ -358,6 +384,7 @@ def cascade_attention( block_table=block_table[:, num_common_kv_blocks:], softcap=logits_soft_cap, return_softmax_lse=True, + fa_version=fa_version, ) # Merge prefix and suffix outputs, and store the result in output. diff --git a/vllm/v1/core/kv_cache_manager.py b/vllm/v1/core/kv_cache_manager.py index bac77443c856..18fdfdfe4a01 100644 --- a/vllm/v1/core/kv_cache_manager.py +++ b/vllm/v1/core/kv_cache_manager.py @@ -285,6 +285,56 @@ def free(self, request: Request) -> None: if block.ref_cnt == 0: self.free_block_queue.append(block) + def uncache_blocks(self, request: Request) -> int: + """Uncache the blocks that are no longer full based on the + num_computed_tokens in the given request. This happens when + the blocks were full and cached due to speculative tokens, but the + speculative tokens are not accepted. + + Args: + request: The request. + + Returns: + The number of uncached blocks. + """ + blocks = self.req_to_blocks[request.request_id] + num_computed_tokens = request.num_computed_tokens + num_full_blocks = num_computed_tokens // self.block_size + num_uncached_blocks = 0 + for block in blocks[num_full_blocks:]: + # If the block is not cached, the following blocks are not cached. + if not self._maybe_evict_cached_block(block): + break + num_uncached_blocks += 1 + return num_uncached_blocks + + def reset_prefix_cache(self) -> bool: + """Reset prefix cache. This function may be used in RLHF + flows to invalid prefix caching after the weights are updated, + or used for resetting prefix caching status for benchmarking. + + Returns: + bool: True if the prefix cache is successfully reset, + False otherwise. + """ + num_used_blocks = (self.num_gpu_blocks - + self.free_block_queue.num_free_blocks) + if num_used_blocks > 0: + logger.warning( + "Failed to reset prefix cache because some " + "blocks (%d) are not freed yet", num_used_blocks) + return False + + # Remove all hashes so that no new blocks will hit. + self.cached_block_hash_to_block = defaultdict(dict) + + # Remove all hashes from all blocks. + for block in self.block_pool: + block.reset_hash() + + logger.info("Successfully reset prefix cache") + return True + def get_num_common_prefix_blocks( self, request: Request, @@ -359,7 +409,7 @@ def _get_new_blocks(self, num_blocks: int) -> List[KVCacheBlock]: # If the block is cached, evict it. if self.enable_caching: - self._evict_cached_block(curr_block) + self._maybe_evict_cached_block(curr_block) curr_block.incr_ref() ret.append(curr_block) @@ -367,13 +417,16 @@ def _get_new_blocks(self, num_blocks: int) -> List[KVCacheBlock]: return ret - def _evict_cached_block(self, block: KVCacheBlock) -> None: + def _maybe_evict_cached_block(self, block: KVCacheBlock) -> bool: """ If a block is cached in `cached_block_hash_to_block`, we reset its hash metadata and evict it from the cache. 
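[Editor's illustration] The guard in `reset_prefix_cache` is worth spelling out: the reset only proceeds when every block has been returned to the free queue. A toy restatement of that check:

.. code-block:: python

    def can_reset(num_gpu_blocks: int, num_free_blocks: int) -> bool:
        # Mirrors the guard in KVCacheManager.reset_prefix_cache(): refuse to
        # drop cached hashes while any request still holds KV cache blocks.
        num_used_blocks = num_gpu_blocks - num_free_blocks
        return num_used_blocks == 0

    assert can_reset(num_gpu_blocks=16, num_free_blocks=16)
    assert not can_reset(num_gpu_blocks=16, num_free_blocks=12)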
Args: block: The block to evict. + + Returns: + True if the block is evicted, False otherwise. """ block_hash = block.block_hash if block_hash and block_hash in self.cached_block_hash_to_block: @@ -383,6 +436,9 @@ def _evict_cached_block(self, block: KVCacheBlock) -> None: if len(self.cached_block_hash_to_block[block_hash]) == 0: del self.cached_block_hash_to_block[block_hash] + return True + return False + def _get_cached_block(self, block_hash: BlockHashType) -> Optional[KVCacheBlock]: """Get a cached block by the block hash, or None if cache miss. diff --git a/vllm/v1/core/scheduler.py b/vllm/v1/core/scheduler.py index 64df21d59fef..de7fb1a698df 100644 --- a/vllm/v1/core/scheduler.py +++ b/vllm/v1/core/scheduler.py @@ -411,6 +411,10 @@ def update_from_output( num_scheduled_tokens = scheduler_output.num_scheduled_tokens new_running: List[Request] = [] outputs: List[EngineCoreOutput] = [] + + # NOTE(woosuk): As len(self.running) can be up to 1K or more, the below + # loop can be a performance bottleneck. We should do our best to avoid + # expensive operations inside the loop. for request in self.running: req_id = request.request_id request.num_computed_tokens += num_scheduled_tokens[req_id] @@ -421,13 +425,15 @@ def update_from_output( cached_encoder_input_ids = ( self.encoder_cache_manager.get_cached_input_ids(request)) - for input_id in list(cached_encoder_input_ids): - start_pos = request.mm_positions[input_id]["offset"] - num_tokens = request.mm_positions[input_id]["length"] - if start_pos + num_tokens <= request.num_computed_tokens: - # The encoder output is already processed and stored - # in the decoder's KV cache. - self.encoder_cache_manager.free(request, input_id) + # OPTIMIZATION: Avoid list(set) if the set is empty. + if cached_encoder_input_ids: + for input_id in list(cached_encoder_input_ids): + start_pos = request.mm_positions[input_id]["offset"] + num_tokens = request.mm_positions[input_id]["length"] + if start_pos + num_tokens <= request.num_computed_tokens: + # The encoder output is already processed and stored + # in the decoder's KV cache. 
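[Editor's illustration] The encoder-cache bookkeeping in `update_from_output` frees an input only once its whole placeholder span falls inside the computed prefix; with hypothetical offsets:

.. code-block:: python

    # Hypothetical placeholder positions for two multimodal inputs.
    num_computed_tokens = 40
    mm_positions = [{"offset": 5, "length": 16}, {"offset": 30, "length": 16}]

    freeable = [
        input_id for input_id, pos in enumerate(mm_positions)
        if pos["offset"] + pos["length"] <= num_computed_tokens
    ]
    # 5 + 16 = 21 <= 40 can be freed; 30 + 16 = 46 > 40 must stay cached.
    assert freeable == [0]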
+ self.encoder_cache_manager.free(request, input_id) if request.num_computed_tokens == request.num_tokens: req_index = model_runner_output.req_id_to_index[req_id] @@ -529,6 +535,9 @@ def get_num_unfinished_requests(self) -> int: def has_unfinished_requests(self) -> bool: return self.get_num_unfinished_requests() > 0 + def reset_prefix_cache(self) -> bool: + return self.kv_cache_manager.reset_prefix_cache() + def make_stats(self) -> SchedulerStats: return SchedulerStats( num_running_reqs=len(self.running), diff --git a/vllm/v1/engine/__init__.py b/vllm/v1/engine/__init__.py index 6d90c38c72cf..abe4952c4baf 100644 --- a/vllm/v1/engine/__init__.py +++ b/vllm/v1/engine/__init__.py @@ -66,6 +66,11 @@ class EngineCoreProfile: is_start: bool +@dataclass +class EngineCoreResetPrefixCache: + pass + + class EngineCoreRequestType(enum.Enum): """ Request types defined as hex byte strings, so it can be sent over sockets @@ -74,6 +79,8 @@ class EngineCoreRequestType(enum.Enum): ADD = b'\x00' ABORT = b'\x01' PROFILE = b'\x02' + RESET_PREFIX_CACHE = b'\x03' -EngineCoreRequestUnion = Union[EngineCoreRequest, EngineCoreProfile, List[str]] +EngineCoreRequestUnion = Union[EngineCoreRequest, EngineCoreProfile, + EngineCoreResetPrefixCache, List[str]] diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index a74699f7513e..917d52d3220b 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -2,9 +2,12 @@ import os from typing import AsyncGenerator, List, Mapping, Optional, Type, Union +import numpy as np + from vllm.config import ModelConfig, VllmConfig from vllm.engine.arg_utils import AsyncEngineArgs from vllm.engine.protocol import EngineClient +from vllm.envs import VLLM_V1_OUTPUT_PROC_CHUNK_SIZE from vllm.inputs import INPUT_REGISTRY, InputRegistry, PromptType from vllm.inputs.preprocess import InputPreprocessor from vllm.logger import init_logger @@ -12,16 +15,17 @@ from vllm.outputs import RequestOutput from vllm.pooling_params import PoolingParams from vllm.prompt_adapter.request import PromptAdapterRequest -from vllm.sampling_params import SamplingParams +from vllm.sampling_params import RequestOutputKind, SamplingParams from vllm.transformers_utils.tokenizer import AnyTokenizer from vllm.transformers_utils.tokenizer_group import init_tokenizer_from_configs from vllm.usage.usage_lib import UsageContext -from vllm.utils import kill_process_tree +from vllm.utils import cdiv, kill_process_tree from vllm.v1.engine.core_client import EngineCoreClient from vllm.v1.engine.output_processor import OutputProcessor from vllm.v1.engine.processor import Processor from vllm.v1.executor.abstract import Executor -from vllm.v1.metrics.loggers import LoggingStatLogger, StatLoggerBase +from vllm.v1.metrics.loggers import (LoggingStatLogger, PrometheusStatLogger, + StatLoggerBase) from vllm.v1.metrics.stats import IterationStats, SchedulerStats logger = init_logger(__name__) @@ -43,13 +47,15 @@ def __init__( assert start_engine_loop + self.model_config = vllm_config.model_config + self.log_requests = log_requests self.log_stats = log_stats self.stat_loggers: List[StatLoggerBase] = [ LoggingStatLogger(), - # TODO(rob): PrometheusStatLogger(), + PrometheusStatLogger(labels=dict( + model_name=self.model_config.served_model_name)), ] - self.model_config = vllm_config.model_config # Tokenizer (+ ensure liveness if running in another process). self.tokenizer = init_tokenizer_from_configs( @@ -205,17 +211,23 @@ async def generate( # The output_handler task pushes items into the queue. 
# This task pulls from the queue and yields to caller. - while True: + finished = False + while not finished: # Note: drain queue without await if possible (avoids # task switching under load which helps performance). - out = q.get_nowait() if q.qsize() > 0 else await q.get() + out = q.get_nowait() if not q.empty() else await q.get() + + # Coalesce any additional queued outputs + while not q.empty(): + next_out = q.get_nowait() + if sampling_params.output_kind == RequestOutputKind.DELTA: + out.add(next_out) + else: + out = next_out # Note: both OutputProcessor and EngineCore handle their # own request cleanup based on finished. - if out.finished: - yield out - break - + finished = out.finished yield out # If the request is disconnected by the client, the @@ -233,22 +245,41 @@ async def _run_output_handler(self): # 1) Pull EngineCoreOutputs from the EngineCore. outputs = await self.engine_core.get_output_async() - # 2) Process EngineCoreOutputs. - processed_outputs = self.output_processor.process_outputs( - outputs.outputs) - # NOTE: RequestOutputs are pushed to their queues. - assert len(processed_outputs.request_outputs) == 0 - - # 3) Abort any reqs that finished due to stop strings. - await self.engine_core.abort_requests_async( - processed_outputs.reqs_to_abort) + # Split outputs into chunks of at most + # VLLM_V1_OUTPUT_PROC_CHUNK_SIZE, so that we don't block the + # event loop for too long. + num_outputs = len(outputs.outputs) + if num_outputs <= VLLM_V1_OUTPUT_PROC_CHUNK_SIZE: + slices = (outputs.outputs, ) + else: + slices = np.array_split( + outputs.outputs, + cdiv(num_outputs, VLLM_V1_OUTPUT_PROC_CHUNK_SIZE)) + + iteration_stats = None + for i, outputs_slice in enumerate(slices): + # 2) Process EngineCoreOutputs. + processed_outputs = self.output_processor.process_outputs( + outputs_slice, iteration_stats) + # NOTE: RequestOutputs are pushed to their queues. + assert not processed_outputs.request_outputs + iteration_stats = processed_outputs.iteration_stats + + # Allow other asyncio tasks to run between chunks + if i + 1 < len(slices): + await asyncio.sleep(0) + + # 3) Abort any reqs that finished due to stop strings. + await self.engine_core.abort_requests_async( + processed_outputs.reqs_to_abort) # 4) Logging. # TODO(rob): make into a coroutine and launch it in - # background thread once we add Prometheus. + # background thread once Prometheus overhead is non-trivial. 
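[Editor's illustration] The chunking in `_run_output_handler` keeps each event-loop turn bounded. A standalone sketch with a hypothetical chunk size standing in for `VLLM_V1_OUTPUT_PROC_CHUNK_SIZE`:

.. code-block:: python

    import numpy as np

    def cdiv(a: int, b: int) -> int:
        # Ceiling division, as in vllm.utils.cdiv.
        return -(-a // b)

    CHUNK_SIZE = 128            # stand-in for VLLM_V1_OUTPUT_PROC_CHUNK_SIZE
    outputs = list(range(300))  # stand-in for the EngineCoreOutput list

    if len(outputs) <= CHUNK_SIZE:
        slices = (outputs, )
    else:
        slices = np.array_split(outputs, cdiv(len(outputs), CHUNK_SIZE))

    # 300 outputs -> 3 slices of 100 each, none larger than CHUNK_SIZE, with
    # an `await asyncio.sleep(0)` between slices to yield the event loop.
    assert len(slices) == 3
    assert all(len(s) <= CHUNK_SIZE for s in slices)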
+ assert iteration_stats is not None self._log_stats( scheduler_stats=outputs.scheduler_stats, - iteration_stats=processed_outputs.iteration_stats, + iteration_stats=iteration_stats, ) except Exception as e: @@ -321,6 +352,9 @@ async def start_profile(self) -> None: async def stop_profile(self) -> None: await self.engine_core.profile_async(False) + async def reset_prefix_cache(self) -> None: + await self.engine_core.reset_prefix_cache_async() + @property def is_running(self) -> bool: return True diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py index 26ebc7edcf03..cf94033a38d9 100644 --- a/vllm/v1/engine/core.py +++ b/vllm/v1/engine/core.py @@ -20,7 +20,7 @@ from vllm.v1.core.scheduler import Scheduler from vllm.v1.engine import (EngineCoreOutputs, EngineCoreProfile, EngineCoreRequest, EngineCoreRequestType, - EngineCoreRequestUnion) + EngineCoreRequestUnion, EngineCoreResetPrefixCache) from vllm.v1.engine.mm_input_mapper import MMInputMapperServer from vllm.v1.executor.abstract import Executor from vllm.v1.request import Request, RequestStatus @@ -135,6 +135,9 @@ def shutdown(self): def profile(self, is_start: bool = True): self.model_executor.profile(is_start) + def reset_prefix_cache(self): + self.scheduler.reset_prefix_cache() + class EngineCoreProc(EngineCore): """ZMQ-wrapper for running EngineCore in background process.""" @@ -247,6 +250,8 @@ def _handle_client_request(self, request: EngineCoreRequestUnion) -> None: self.add_request(request) elif isinstance(request, EngineCoreProfile): self.model_executor.profile(request.is_start) + elif isinstance(request, EngineCoreResetPrefixCache): + self.reset_prefix_cache() else: # TODO: make an EngineCoreAbort wrapper assert isinstance(request, list) @@ -271,7 +276,9 @@ def process_input_socket(self, input_path: str): request = decoder_add_req.decode(request_data) elif request_type == EngineCoreRequestType.ABORT.value: request = decoder_abort_req.decode(request_data) - elif request_type == EngineCoreRequestType.PROFILE.value: + elif request_type in ( + EngineCoreRequestType.PROFILE.value, + EngineCoreRequestType.RESET_PREFIX_CACHE.value): request = pickle.loads(request_data) else: raise ValueError(f"Unknown RequestType: {request_type}") diff --git a/vllm/v1/engine/core_client.py b/vllm/v1/engine/core_client.py index ac0f0f14bf1a..f3b992d6873e 100644 --- a/vllm/v1/engine/core_client.py +++ b/vllm/v1/engine/core_client.py @@ -1,8 +1,9 @@ +import asyncio import os import signal import weakref from abc import ABC, abstractmethod -from typing import List, Type +from typing import List, Optional, Type import msgspec import zmq @@ -14,7 +15,7 @@ make_zmq_socket) from vllm.v1.engine import (EngineCoreOutputs, EngineCoreProfile, EngineCoreRequest, EngineCoreRequestType, - EngineCoreRequestUnion) + EngineCoreRequestUnion, EngineCoreResetPrefixCache) from vllm.v1.engine.core import EngineCore, EngineCoreProc from vllm.v1.executor.abstract import Executor from vllm.v1.serial_utils import PickleEncoder @@ -69,6 +70,9 @@ def add_request(self, request: EngineCoreRequest) -> None: def profile(self, is_start: bool = True) -> None: raise NotImplementedError + def reset_prefix_cache(self) -> None: + raise NotImplementedError + def abort_requests(self, request_ids: List[str]) -> None: raise NotImplementedError @@ -81,6 +85,9 @@ async def add_request_async(self, request: EngineCoreRequest) -> None: async def profile_async(self, is_start: bool = True) -> None: raise NotImplementedError + async def reset_prefix_cache_async(self) -> None: + raise 
NotImplementedError + async def abort_requests_async(self, request_ids: List[str]) -> None: raise NotImplementedError @@ -108,12 +115,15 @@ def abort_requests(self, request_ids: List[str]) -> None: if len(request_ids) > 0: self.engine_core.abort_requests(request_ids) - def shutdown(self): + def shutdown(self) -> None: self.engine_core.shutdown() def profile(self, is_start: bool = True) -> None: self.engine_core.profile(is_start) + def reset_prefix_cache(self) -> None: + self.engine_core.reset_prefix_cache() + class MPClient(EngineCoreClient): """ @@ -229,6 +239,10 @@ def profile(self, is_start: bool = True) -> None: self._send_input(EngineCoreRequestType.PROFILE, EngineCoreProfile(is_start)) + def reset_prefix_cache(self) -> None: + self._send_input(EngineCoreRequestType.RESET_PREFIX_CACHE, + EngineCoreResetPrefixCache()) + class AsyncMPClient(MPClient): """Asyncio-compatible client for multi-proc EngineCore.""" @@ -242,10 +256,24 @@ def __init__(self, vllm_config: VllmConfig, log_stats=True, ) + self.outputs_queue: Optional[asyncio.Queue[bytes]] = None + self.queue_task: Optional[asyncio.Task] = None + async def get_output_async(self) -> EngineCoreOutputs: + if self.outputs_queue is None: + # Perform IO in separate task to parallelize as much as possible + self.outputs_queue = asyncio.Queue() + + async def process_outputs_socket(): + assert self.outputs_queue is not None + while True: + (frame, ) = await self.output_socket.recv_multipart( + copy=False) + self.outputs_queue.put_nowait(frame.buffer) - frames = await self.output_socket.recv_multipart(copy=False) - return self.decoder.decode(frames[0].buffer) + self.queue_task = asyncio.create_task(process_outputs_socket()) + + return self.decoder.decode(await self.outputs_queue.get()) async def _send_input(self, request_type: EngineCoreRequestType, request: EngineCoreRequestUnion) -> None: @@ -266,3 +294,7 @@ async def abort_requests_async(self, request_ids: List[str]) -> None: async def profile_async(self, is_start: bool = True) -> None: await self._send_input(EngineCoreRequestType.PROFILE, EngineCoreProfile(is_start)) + + async def reset_prefix_cache_async(self) -> None: + await self._send_input(EngineCoreRequestType.RESET_PREFIX_CACHE, + EngineCoreResetPrefixCache()) diff --git a/vllm/v1/engine/llm_engine.py b/vllm/v1/engine/llm_engine.py index f5999ccda644..55d314ebeb95 100644 --- a/vllm/v1/engine/llm_engine.py +++ b/vllm/v1/engine/llm_engine.py @@ -162,6 +162,9 @@ def start_profile(self): def stop_profile(self): self.engine_core.profile(False) + def reset_prefix_cache(self): + self.engine_core.reset_prefix_cache() + def get_tokenizer_group( self, group_type: Type[_G] = BaseTokenizerGroup, diff --git a/vllm/v1/engine/output_processor.py b/vllm/v1/engine/output_processor.py index 749f4f5043c9..564eab51bd3a 100644 --- a/vllm/v1/engine/output_processor.py +++ b/vllm/v1/engine/output_processor.py @@ -101,6 +101,7 @@ def add_request( def process_outputs( self, engine_core_outputs: List[EngineCoreOutput], + iteration_stats: Optional[IterationStats] = None, ) -> OutputProcessorOutput: """ Process the EngineCoreOutputs: @@ -133,7 +134,8 @@ def process_outputs( request_outputs: List[RequestOutput] = [] reqs_to_abort: List[str] = [] - iteration_stats = IterationStats(self.log_stats) + if not iteration_stats: + iteration_stats = IterationStats(self.log_stats) for engine_core_output in engine_core_outputs: req_id = engine_core_output.request_id req_state = self.request_states.get(req_id) @@ -175,8 +177,8 @@ def process_outputs( 
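The AsyncMPClient change above starts a background task on first use that drains the output socket into an asyncio.Queue, so decoding never waits on a pending recv. A simplified sketch of that lazy background-reader pattern, with a generic async frame source standing in for the ZMQ socket; the names here are illustrative.

import asyncio
from typing import AsyncIterator, Optional


class QueuedReceiver:
    """Decouple receiving frames from consuming them."""

    def __init__(self, frames: AsyncIterator[bytes]) -> None:
        self._frames = frames
        self._queue: Optional[asyncio.Queue] = None
        self._task: Optional[asyncio.Task] = None

    async def get(self) -> bytes:
        if self._queue is None:
            # Created lazily so the reader task starts on the running loop.
            self._queue = asyncio.Queue()

            async def _reader() -> None:
                assert self._queue is not None
                async for frame in self._frames:
                    self._queue.put_nowait(frame)

            self._task = asyncio.create_task(_reader())
        return await self._queue.get()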
iteration_stats=iteration_stats, ) + @staticmethod def _make_request_output( - self, request_state: RequestState, detokenizer_output: Optional[DetokenizerOutput], ) -> Optional[RequestOutput]: diff --git a/vllm/v1/metrics/loggers.py b/vllm/v1/metrics/loggers.py index 8feeef17542e..b84f03fa3267 100644 --- a/vllm/v1/metrics/loggers.py +++ b/vllm/v1/metrics/loggers.py @@ -1,5 +1,8 @@ import time from abc import ABC, abstractmethod +from typing import Dict + +import prometheus_client from vllm.logger import init_logger from vllm.v1.metrics.stats import SchedulerStats @@ -36,3 +39,36 @@ def log(self, scheduler_stats: SchedulerStats): scheduler_stats.num_running_reqs, scheduler_stats.num_waiting_reqs, ) + + +class PrometheusStatLogger(StatLoggerBase): + + def __init__(self, labels: Dict[str, str]): + self.labels = labels + + labelnames = self.labels.keys() + labelvalues = self.labels.values() + + self._unregister_vllm_metrics() + + self.gauge_scheduler_running = prometheus_client.Gauge( + name="vllm:num_requests_running", + documentation="Number of requests in model execution batches.", + labelnames=labelnames).labels(*labelvalues) + + self.gauge_scheduler_waiting = prometheus_client.Gauge( + name="vllm:num_requests_waiting", + documentation="Number of requests waiting to be processed.", + labelnames=labelnames).labels(*labelvalues) + + def log(self, scheduler_stats: SchedulerStats): + """Log to prometheus.""" + self.gauge_scheduler_running.set(scheduler_stats.num_running_reqs) + self.gauge_scheduler_waiting.set(scheduler_stats.num_waiting_reqs) + + @staticmethod + def _unregister_vllm_metrics(): + # Unregister any existing vLLM collectors (for CI/CD + for collector in list(prometheus_client.REGISTRY._collector_to_names): + if hasattr(collector, "_name") and "vllm" in collector._name: + prometheus_client.REGISTRY.unregister(collector) diff --git a/vllm/v1/outputs.py b/vllm/v1/outputs.py index acc3a944e21b..32aee44e3f37 100644 --- a/vllm/v1/outputs.py +++ b/vllm/v1/outputs.py @@ -8,7 +8,7 @@ class SamplerOutput: # [num_reqs] - sampled_token_ids: List[int] + sampled_token_ids: torch.Tensor # [num_reqs, max_num_logprobs + 1] logprob_token_ids: Optional[torch.Tensor] diff --git a/vllm/v1/request.py b/vllm/v1/request.py index 45450165eaef..2cfcd8b63ccb 100644 --- a/vllm/v1/request.py +++ b/vllm/v1/request.py @@ -58,12 +58,19 @@ def __init__( # Sanity check assert len(self.mm_inputs) == len(self.mm_positions) - assert len(self.mm_inputs) == len(self.mm_hashes) + if self.mm_hashes: + assert len(self.mm_inputs) == len(self.mm_hashes) # Cache the computed kv block hashes of the request to avoid # recomputing. self._kv_block_hashes: List[BlockHashType] = [] + # Read-only views + # Prevent directly appending to the these lists since + # they should also be updated simultaneously. + self.output_token_ids = ConstantList(self._output_token_ids) + self.all_token_ids = ConstantList(self._all_token_ids) + @classmethod def from_engine_core_request(cls, request: EngineCoreRequest) -> "Request": return cls( @@ -79,18 +86,6 @@ def from_engine_core_request(cls, request: EngineCoreRequest) -> "Request": lora_request=request.lora_request, ) - @property - def output_token_ids(self) -> ConstantList[int]: - # Prevent directly appending to the output_token_ids since - # all_token_ids should also be updated simultaneously. 
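In the PrometheusStatLogger above, each gauge is bound to a fixed label set once at construction, so every log() call is just a cheap set(). A hedged usage sketch of that labelled-gauge pattern with prometheus_client; the metric and label names below are examples, not the names vLLM exports.

import prometheus_client

labels = {"model_name": "example-model"}

# Bind the label values once; .set() afterwards is just an atomic store.
gauge_running = prometheus_client.Gauge(
    name="example:num_requests_running",
    documentation="Number of requests in model execution batches.",
    labelnames=labels.keys()).labels(*labels.values())

gauge_running.set(3)

# The default registry can then be scraped / rendered in text format.
print(prometheus_client.generate_latest().decode())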
- return ConstantList(self._output_token_ids) - - @property - def all_token_ids(self) -> ConstantList[int]: - # Prevent directly appending to the all_token_ids since - # output_token_ids should also be updated simultaneously - return ConstantList(self._all_token_ids) - def append_output_token_ids( self, token_ids: Union[int, List[int]], diff --git a/vllm/v1/sample/sampler.py b/vllm/v1/sample/sampler.py index 7cd42ca211a2..9ad665a64894 100644 --- a/vllm/v1/sample/sampler.py +++ b/vllm/v1/sample/sampler.py @@ -50,9 +50,8 @@ def forward( # Use int32 to reduce the tensor size. sampled = sampled.to(torch.int32) - # NOTE: CPU-GPU synchronization happens here. sampler_output = SamplerOutput( - sampled_token_ids=sampled.tolist(), + sampled_token_ids=sampled, logprob_token_ids=topk_indices, logprobs=topk_logprobs, prompt_logprob_token_ids=None, diff --git a/vllm/v1/stats/__init__.py b/vllm/v1/stats/__init__.py new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/vllm/v1/stats/common.py b/vllm/v1/stats/common.py new file mode 100644 index 000000000000..500bc356fc17 --- /dev/null +++ b/vllm/v1/stats/common.py @@ -0,0 +1,451 @@ +import time +from dataclasses import dataclass +from dataclasses import field as dataclass_field +from enum import IntEnum +from typing import ClassVar, Dict, List, Optional, Set + +import msgspec +from msgspec import field as msgspec_field + +from vllm.sampling_params import SamplingParams + + +class RequestStatsUpdate( + msgspec.Struct, # type: ignore + array_like=True, + omit_defaults=True, + gc=False): + """ + An update to the request stats. + + This represents a stats update at a specific timestamp with metadata + associated with the update. + + NOTE: since there might be multiple processes generating updates at + different parts of the engine (e.g. input processor, scheduler, engine core, + etc.), we use the monotonic timestamp to record the update to compute any + intervals, and explicit wall-clock timestamp should be used for timestamps. + + WARNING: This assumes stats are generated in a single machine. If there are + potentially multiple machines, one should always generate the stats updates + on one single machine or use something else. + """ + + class Type(IntEnum): + """See `RequestStats` for the lifecycle of a request.""" + + # Request arrived at the engine frontend. + ARRIVED = 0 + # Input processed by the input processor. + INPUT_PROCESSED = 1 + # Queued on the engine core. + QUEUED = 2 + # Scheduled running prefill by the scheduler. + # A request could be running a new prefill on the prompt tokens or + # a resumed prefill on the original prefill tokens + generated output + # tokens before preemption. + PREFILLING = 3 + # Preempted by the scheduler. + PREEMPTED = 4 + # Output token is generated by the engine core. + DECODING = 5 + # Token detokenized by the detokenizer. + # We will record the timestamp for each output token, as well as the + # finish reason. + DETOKENIZED = 6 + # Request finishes (or aborts). 
+ FINISHED = 7 + + """ + Valid state updates: + ARRIVED + │ + ├──────► INPUT_PROCESSED ──────► QUEUED ──────► PREFILLING ◄────┐ + │ │ │ │ │ + │ │ │ ▼ │ + │ │ │ -──► DECODING │ + │ │ │ | │ │ + │ │ │ | ▼ │ + │ │ │ └─ DETOKENIZED │ + │ │ │ │ │ + │ │ │ ▼ │ + │ ▼ ▼ PREEMPTED ◄──────┘ + │ │ │ │ + └──────────────┴───────────────────┴──────────────┴ + │ + ▼ + FINISHED (All could go to FINISHED) + """ + _VALID_TRANSITIONS: ClassVar[Dict[Type, Set[Type]]] = { + Type.ARRIVED: { + Type.INPUT_PROCESSED, + Type.FINISHED, + }, + Type.INPUT_PROCESSED: { + Type.QUEUED, + Type.FINISHED, + }, + Type.QUEUED: { + Type.PREFILLING, + Type.FINISHED, + }, + Type.PREFILLING: { + Type.DECODING, + Type.PREEMPTED, + Type.FINISHED, + }, + Type.DECODING: { + Type.DETOKENIZED, + Type.FINISHED, + }, + Type.DETOKENIZED: { + Type.DECODING, + Type.PREEMPTED, + Type.FINISHED, + }, + Type.PREEMPTED: {Type.PREFILLING, Type.FINISHED}, + Type.FINISHED: set(), + } + + request_id: str + + type: Type + + # Timestamp when the update is recorded. This is used to record time + # intervals between events rather than wall clock time. + monotonic_ts_s: float = msgspec_field( + default_factory=lambda: time.monotonic()) + + ############################################################ + # Metadata associated with the update. + ############################################################ + # For input_processed. Metadata needed for stats logging. + num_prompt_tokens: Optional[int] = None + sampling_params: Optional[SamplingParams] = None + + # For running. + # Number of tokens computed when scheduled to run. + num_computed_tokens: Optional[int] = None + # Number of cached tokens when scheduled to run. + num_cached_tokens: Optional[int] = None + + # For decoded. + # The number of new output tokens generated. + num_new_tokens: Optional[int] = None + + # For both detokenized and decoded. + # Finished reason. + finish_reason: Optional[str] = None + + # Non-optional fields for each update type. 
+ _REQUIRED_FIELDS: ClassVar[Dict[Type, List[str]]] = { + Type.INPUT_PROCESSED: ["num_prompt_tokens", "sampling_params"], + Type.PREFILLING: ["num_computed_tokens", "num_cached_tokens"], + Type.DETOKENIZED: ["num_new_tokens"], + Type.FINISHED: ["finish_reason"], + } + + def __post_init__(self): + required_fields = self._REQUIRED_FIELDS.get(self.type, []) + for field in required_fields: + if getattr(self, field) is None: + raise ValueError( + f"Field {field} is required for update type {self.type}.") + + @staticmethod + def check_valid_update( + update: "RequestStatsUpdate", + last_update_type: Optional[Type], + last_updated_ts_s: Optional[float], + ): + if last_update_type is None: + assert update.type == RequestStatsUpdate.Type.ARRIVED + else: + valid_cur_update_types = RequestStatsUpdate._VALID_TRANSITIONS[ + last_update_type] + assert update.type in valid_cur_update_types, ( + f"Invalid update type: {update.type} for last_update_type: " + f"{last_update_type}.") + + if last_updated_ts_s is not None: + assert update.monotonic_ts_s >= last_updated_ts_s, ( + "Update timestamp must be monotonically increasing, but " + f"last_updated_ts_s={last_updated_ts_s} and " + f"update.monotonic_ts_s={update.monotonic_ts_s}.") + + +@dataclass +class RequestStats: + """Stats associated with a request (`Request`).""" + + ############################################################ + # Metadata + ############################################################ + request_id: str + sampling_params: Optional[SamplingParams] = None + num_prompt_tokens: Optional[int] = None + + ############################################################ + # Metrics and Stats + ############################################################ + # Timestamp when the request was last updated. + last_updated_ts_s: Optional[float] = None + + # Last update stats type. + last_update_type: Optional[RequestStatsUpdate.Type] = None + + # Timestamp when the request arrived at the llm engine. + arrival_ts_s: Optional[float] = None + + # Number of tokens cached. When part of the request prefix is cached, + # this will be set. + num_cached_tokens: int = 0 + + # Number of tokens computed. + num_computed_tokens: int = 0 + + # The timestamp when the request become waiting in the queue. + queued_ts_s: Optional[float] = None + + # When the input processor is completed. + input_processor_end_ts_s: Optional[float] = None + + # A sorted list of timestamps when the request was scheduled to prefill. + # This could be when: + # 1. the request is newly scheduled, so it's a new prefill. + # 2. the request was preempted and resumed. It is equivalent to running + # a prefill of the original prefill tokens + generated output tokens + # before preemption. + prefill_start_ts_s_lst: List[float] = dataclass_field(default_factory=list) + + # A list of timestamps when a token is decoded by the engine core. + decoding_ts_s_lst: List[float] = dataclass_field(default_factory=list) + + # A sorted list of timestamps for each output token. + output_token_ts_s_lst: List[float] = dataclass_field(default_factory=list) + + # First token's timestamp. + first_token_ts_s: Optional[float] = None + + # TODO(rickyx): we need model runner to surface these. + model_forward_duration_s: float = 0.0 + # Includes model forward, block/sync across workers, cpu-gpu sync time + # and sampling time. + model_execute_duration_s: float = 0.0 + + # A sorted list of timestamps when the request was preempted at the + # scheduler. 
+ # TODO(rickyx): right now, we don't actually have a good high-level + # metric to measure the impact of preemption other than observation of + # large P99 TPOT. Ideally we could quantify the impact of preemption by + # measuring the number of tokens re-computed due to preemption. + preempted_ts_s_lst: List[float] = dataclass_field(default_factory=list) + + # Timestamp when the request was finished at the engine core. + finished_ts_s: Optional[float] = None + + # Finish reason. + finish_reason: Optional[str] = None + + ############################################################ + # Derived properties. + ############################################################ + @property + def prefill_ts_s(self) -> Optional[float]: + """The timestamp when the request started prefilling. + Since a request could be preempted in decoding and later resumed + to prefill the decoded tokens, we use the first prefill start timestamp. + """ + return (self.prefill_start_ts_s_lst[0] + if self.prefill_start_ts_s_lst else None) + + @property + def e2e_latency_s(self) -> Optional[float]: + if self.finished_ts_s is None or self.arrival_ts_s is None: + return None + assert self.finished_ts_s >= self.arrival_ts_s + return self.finished_ts_s - self.arrival_ts_s + + @property + def queue_duration_s(self) -> Optional[float]: + """How long the request was waiting to run.""" + if self.queued_ts_s is None or self.prefill_ts_s is None: + # Either not queued or not running yet. + return None + assert self.queued_ts_s <= self.prefill_ts_s + return self.prefill_ts_s - self.queued_ts_s + + @property + def inference_latency_s(self) -> Optional[float]: + """How long the request was running inference + (prefill and decode).""" + if self.finished_ts_s is None or self.prefill_ts_s is None: + return None + assert self.finished_ts_s >= self.prefill_ts_s + return self.finished_ts_s - self.prefill_ts_s + + @property + def first_token_latency_s(self) -> Optional[float]: + if self.first_token_ts_s is None or self.arrival_ts_s is None: + return None + assert self.first_token_ts_s >= self.arrival_ts_s + return self.first_token_ts_s - self.arrival_ts_s + + @property + def prefill_latency_s(self) -> Optional[float]: + if self.first_token_ts_s is None or self.prefill_ts_s is None: + return None + assert self.first_token_ts_s >= self.prefill_ts_s + return self.first_token_ts_s - self.prefill_ts_s + + @property + def decode_latency_s(self) -> Optional[float]: + if self.e2e_latency_s is None or self.first_token_latency_s is None: + return None + assert self.e2e_latency_s >= self.first_token_latency_s + return self.e2e_latency_s - self.first_token_latency_s + + @property + def output_token_latency_s_lst(self) -> List[float]: + if len(self.output_token_ts_s_lst) == 0: + return [] + latency_s_lst = [] + for i in range(1, len(self.output_token_ts_s_lst)): + assert (self.output_token_ts_s_lst[i] >= + self.output_token_ts_s_lst[i - 1]) + latency_s = (self.output_token_ts_s_lst[i] - + self.output_token_ts_s_lst[i - 1]) + latency_s_lst.append(latency_s) + return latency_s_lst + + @property + def num_output_tokens(self) -> int: + return len(self.output_token_ts_s_lst) + + @property + def is_finished(self) -> bool: + return self.finished_ts_s is not None + + def update_from(self, update: "RequestStatsUpdate"): + RequestStatsUpdate.check_valid_update(update, self.last_update_type, + self.last_updated_ts_s) + ts = update.monotonic_ts_s + self.last_updated_ts_s = ts + self.last_update_type = update.type + if update.type == RequestStatsUpdate.Type.ARRIVED: + 
self.arrival_ts_s = ts + elif update.type == RequestStatsUpdate.Type.INPUT_PROCESSED: + self.input_processor_end_ts_s = ts + self.sampling_params = update.sampling_params + self.num_prompt_tokens = update.num_prompt_tokens + elif update.type == RequestStatsUpdate.Type.QUEUED: + self.queued_ts_s = ts + elif update.type == RequestStatsUpdate.Type.PREFILLING: + self.prefill_start_ts_s_lst.append(ts) + self.num_cached_tokens = update.num_cached_tokens or 0 + self.num_computed_tokens = update.num_computed_tokens or 0 + elif update.type == RequestStatsUpdate.Type.PREEMPTED: + self._reset_for_preemption(ts) + elif update.type == RequestStatsUpdate.Type.DECODING: + self.decoding_ts_s_lst.append(ts) + elif update.type == RequestStatsUpdate.Type.DETOKENIZED: + self._record_detokenized_output( + ts, + update.num_new_tokens or 0, + ) + elif update.type == RequestStatsUpdate.Type.FINISHED: + self.finished_ts_s = ts + self.finish_reason = update.finish_reason + else: + raise ValueError(f"Unknown update type: {update.type}") + + def _record_detokenized_output( + self, + ts_s: float, + num_new_tokens: int, + ): + # Update if first output token is generated. + if len(self.output_token_ts_s_lst) == 0: + self.first_token_ts_s = ts_s + assert ( + self.prefill_ts_s is not None + ), "Request must be running before generating output tokens." + + # Some X new tokens were generated at the ts. + self.output_token_ts_s_lst.extend([ts_s] * num_new_tokens) + + def _reset_for_preemption(self, ts_s: float): + self.preempted_ts_s_lst.append(ts_s) + # Reset the computed tokens since it might restart the prefill. + self.num_computed_tokens = 0 + # Cached token count might also change when resumed. + self.num_cached_tokens = 0 + # These stats don't change since they happen before request running. + # - arrival_ts_s + # - input_processor_end_ts_s + # - sampling_params + # - num_prompt_tokens + # - first_token_ts_s + # + # These stats are accumulated over preemptions: + # - output_token_ts_s_lst + # - prefill_start_ts_s_lst (after preemption, it will prefill the + # original prefill tokens and any output tokens generated before + # preemption.) + + +@dataclass +class KVCacheStats: + # KV Cache Usage in % + gpu_cache_usage_sys: float = 0.0 + gpu_prefix_cache_hit_rate: float = 0.0 + + +@dataclass +class SchedulerStats: + """Stats associated with the scheduler.""" + + # Number of requests currently running. + num_running_reqs: int = 0 + # Number of requests currently waiting. + num_waiting_reqs: int = 0 + + kv_cache_stats: KVCacheStats = dataclass_field( + default_factory=KVCacheStats) + + +@dataclass +class EngineCoreProcessStats: + """Stats associated with the engine core process.""" + + # Number of requests currently in the input queue. None if the engine core + # is not running in multiprocess mode. + input_queue_size: Optional[int] = None + # Number of outputs currently in the output queue. None if the engine core + # is not running in multiprocess mode. + output_queue_size: Optional[int] = None + + +class EngineCoreStatsSnapshot( + msgspec.Struct, # type: ignore + array_like=True, + omit_defaults=True, + gc=False): + """ + A snapshot of the EngineCore's current stats over a period of time. + """ + + # Snapshot of the scheduler stats. + scheduler_stats: SchedulerStats = msgspec_field( + default_factory=SchedulerStats) + + # Per request stats updates. + requests_stats_updates: List[RequestStatsUpdate] = msgspec_field( + default_factory=list) + + # Engine core's queue stats. 
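A hedged sketch of driving the request-stats state machine defined above, assuming this diff is applied and the module is importable as vllm.v1.stats.common; the token counts and finish reason are made up. It walks one legal path through _VALID_TRANSITIONS and then reads the derived latencies.

from vllm.sampling_params import SamplingParams
from vllm.v1.stats.common import RequestStats, RequestStatsUpdate

Update = RequestStatsUpdate
stats = RequestStats(request_id="req-0")

# Apply updates in an order permitted by the transition table above.
for update in (
        Update(request_id="req-0", type=Update.Type.ARRIVED),
        Update(request_id="req-0", type=Update.Type.INPUT_PROCESSED,
               num_prompt_tokens=16, sampling_params=SamplingParams()),
        Update(request_id="req-0", type=Update.Type.QUEUED),
        Update(request_id="req-0", type=Update.Type.PREFILLING,
               num_computed_tokens=16, num_cached_tokens=0),
        Update(request_id="req-0", type=Update.Type.DECODING),
        Update(request_id="req-0", type=Update.Type.DETOKENIZED,
               num_new_tokens=1),
        Update(request_id="req-0", type=Update.Type.FINISHED,
               finish_reason="stop"),
):
    stats.update_from(update)

print(stats.queue_duration_s, stats.first_token_latency_s, stats.e2e_latency_s)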
+ engine_core_process_stats: EngineCoreProcessStats = msgspec_field( + default_factory=EngineCoreProcessStats) + + # TODO(rickyx): Add other components' stats, + # e.g. model runner/worker and etc. diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 2350074c23a5..9d7e30079dfb 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -1,6 +1,6 @@ import gc import time -from typing import TYPE_CHECKING, Dict, List, Tuple, cast +from typing import TYPE_CHECKING, Dict, List, Optional, Tuple, cast import numpy as np import torch @@ -17,6 +17,7 @@ from vllm.model_executor.layers.rotary_embedding import MRotaryEmbedding from vllm.model_executor.model_loader import get_model from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalKwargs +from vllm.multimodal.utils import group_mm_inputs_by_modality from vllm.sampling_params import SamplingType from vllm.utils import (STR_DTYPE_TO_TORCH_DTYPE, DeviceMemoryProfiler, LayerBlockType, cdiv, is_pin_memory_available) @@ -127,7 +128,8 @@ def __init__( # self.cudagraph_batch_sizes sorts in ascending order. # The batch sizes in the config are in descending order. self.cudagraph_batch_sizes = list( - reversed(self.vllm_config.compilation_config.capture_sizes)) + reversed( + self.vllm_config.compilation_config.cudagraph_capture_sizes)) # Cache the device properties. self.device_properties = torch.cuda.get_device_properties(self.device) @@ -143,28 +145,24 @@ def __init__( # Only relevant for models using M-RoPE (e.g, Qwen2-VL) if self.model_config.uses_mrope: - # NOTE: `mrope_positions` is implemented as a permuted tensor to - # satisfy the following properties to allow `torch.compile` to work - # properly: - # - shape: (3, ) - # - stride: (1, 3) - # See detailed explanation in https://github.com/vllm-project/vllm/pull/12128#discussion_r1921022256 + # NOTE: `mrope_positions` is implemented with one additional dummy + # position on purpose to make it non-contiguous so that it can work + # with torch compile. + # See detailed explanation in https://github.com/vllm-project/vllm/pull/12128#discussion_r1926431923 # NOTE: When M-RoPE is enabled, position ids are 3D regardless of # the modality of inputs. For text-only inputs, each dimension has # identical position IDs, making M-RoPE functionally equivalent to # 1D-RoPE. # See page 5 of https://arxiv.org/abs/2409.12191 - self.mrope_positions = torch.zeros((self.max_num_tokens, 3), + self.mrope_positions = torch.zeros((3, self.max_num_tokens + 1), dtype=torch.int64, device=self.device) - self.mrope_positions_cpu = torch.zeros((self.max_num_tokens, 3), - dtype=torch.int64, - device="cpu", - pin_memory=self.pin_memory) - - self.mrope_positions = self.mrope_positions.permute((1, 0)) - self.mrope_positions_cpu = self.mrope_positions_cpu.permute((1, 0)) + self.mrope_positions_cpu = torch.zeros( + (3, self.max_num_tokens + 1), + dtype=torch.int64, + device="cpu", + pin_memory=self.pin_memory) self.inputs_embeds = torch.zeros( (self.max_num_tokens, self.hidden_size), @@ -173,7 +171,8 @@ def __init__( # OPTIMIZATION: Cache the tensors rather than creating them every step. self.arange_np = np.arange(max(self.max_num_reqs + 1, - self.max_model_len), + self.max_model_len, + self.max_num_tokens), dtype=np.int32) # NOTE(woosuk): These tensors are "stateless", i.e., they are literally # a faster version of creating a new tensor every time. 
Thus, we should @@ -198,11 +197,11 @@ def __init__( device="cpu", pin_memory=self.pin_memory) self.query_start_loc_np = self.query_start_loc_cpu.numpy() - self.seq_start_loc_cpu = torch.zeros(self.max_num_reqs + 1, - dtype=torch.int32, - device="cpu", - pin_memory=self.pin_memory) - self.seq_start_loc_np = self.seq_start_loc_cpu.numpy() + self.seq_lens_cpu = torch.zeros(self.max_num_reqs, + dtype=torch.int32, + device="cpu", + pin_memory=self.pin_memory) + self.seq_lens_np = self.seq_lens_cpu.numpy() def _update_states(self, scheduler_output: "SchedulerOutput") -> None: # Remove stopped requests from the cached states. @@ -360,8 +359,15 @@ def _prepare_inputs(self, scheduler_output: "SchedulerOutput"): # Get batched arange. # E.g., [2, 5, 3] -> [0, 1, 0, 1, 2, 3, 4, 0, 1, 2] - arange = np.concatenate( - [self.arange_np[:n] for n in num_scheduled_tokens]) + # Equivalent to but faster than: + # np.concatenate([np.arange(n) for n in num_scheduled_tokens]) + # Step 1. [2, 5, 3] -> [2, 7, 10] + cu_num_tokens = np.cumsum(num_scheduled_tokens) + # Step 2. [2, 7, 10] -> [0, 0, 2, 2, 2, 2, 2, 7, 7, 7] + cumsums_offsets = np.repeat(cu_num_tokens - num_scheduled_tokens, + num_scheduled_tokens) + # Step 3. [0, 1, 0, 1, 2, 3, 4, 0, 1, 2] + arange = self.arange_np[:total_num_scheduled_tokens] - cumsums_offsets # Get positions. positions_np = self.positions_np[:total_num_scheduled_tokens] @@ -408,14 +414,12 @@ def _prepare_inputs(self, scheduler_output: "SchedulerOutput"): # Prepare the attention metadata. self.query_start_loc_np[0] = 0 - np.cumsum(num_scheduled_tokens, - out=self.query_start_loc_np[1:num_reqs + 1]) + self.query_start_loc_np[1:num_reqs + 1] = cu_num_tokens - seq_lens = (self.input_batch.num_computed_tokens_cpu[:num_reqs] + - num_scheduled_tokens) - max_seq_len = seq_lens.max() - self.seq_start_loc_np[0] = 0 - np.cumsum(seq_lens, out=self.seq_start_loc_np[1:num_reqs + 1]) + self.seq_lens_np[:num_reqs] = ( + self.input_batch.num_computed_tokens_cpu[:num_reqs] + + num_scheduled_tokens) + max_seq_len = self.seq_lens_np[:num_reqs].max() # Copy the tensors to the GPU. 
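The three-step construction above replaces a Python-level loop of np.arange calls with two vectorized operations. A standalone NumPy sketch of the same trick, outside the model runner:

import numpy as np

num_scheduled_tokens = np.array([2, 5, 3])
total = int(num_scheduled_tokens.sum())

# Step 1. [2, 5, 3] -> [2, 7, 10]
cu_num_tokens = np.cumsum(num_scheduled_tokens)
# Step 2. repeat each segment's starting offset: [0, 0, 2, 2, 2, 2, 2, 7, 7, 7]
offsets = np.repeat(cu_num_tokens - num_scheduled_tokens, num_scheduled_tokens)
# Step 3. subtract offsets from a flat arange: [0, 1, 0, 1, 2, 3, 4, 0, 1, 2]
batched_arange = np.arange(total) - offsets

assert (batched_arange == np.concatenate(
    [np.arange(n) for n in num_scheduled_tokens])).all()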
self.input_ids[:total_num_scheduled_tokens].copy_( @@ -432,8 +436,8 @@ def _prepare_inputs(self, scheduler_output: "SchedulerOutput"): non_blocking=True) query_start_loc = self.query_start_loc_cpu[:num_reqs + 1].to( self.device, non_blocking=True) - seq_start_loc = self.seq_start_loc_cpu[:num_reqs + 1].to( - self.device, non_blocking=True) + seq_lens = self.seq_lens_cpu[:num_reqs].to(self.device, + non_blocking=True) slot_mapping = self.slot_mapping_cpu[:total_num_scheduled_tokens].to( self.device, non_blocking=True).long() @@ -505,33 +509,30 @@ def _prepare_inputs(self, scheduler_output: "SchedulerOutput"): [0, total_num_scheduled_tokens], dtype=torch.int32, device=self.device) - cu_prefix_kv_lens = torch.tensor([0, common_prefix_len], - dtype=torch.int32, - device=self.device) - cu_suffix_kv_lens = ( - self.seq_start_loc_np[:num_reqs + 1] - - self.arange_np[:num_reqs + 1] * common_prefix_len) - cu_suffix_kv_lens = torch.from_numpy(cu_suffix_kv_lens).to( - self.device) + prefix_kv_lens = torch.tensor([common_prefix_len], + dtype=torch.int32, + device=self.device) + suffix_kv_lens = (self.seq_lens_np[:num_reqs] - common_prefix_len) + suffix_kv_lens = torch.from_numpy(suffix_kv_lens).to(self.device) else: cu_prefix_query_lens = None - cu_prefix_kv_lens = None - cu_suffix_kv_lens = None + prefix_kv_lens = None + suffix_kv_lens = None attn_metadata = FlashAttentionMetadata( num_actual_tokens=total_num_scheduled_tokens, max_query_len=max_num_scheduled_tokens, query_start_loc=query_start_loc, max_seq_len=max_seq_len, - seq_start_loc=seq_start_loc, + seq_lens=seq_lens, block_table=( self.input_batch.block_table.get_device_tensor()[:num_reqs]), slot_mapping=slot_mapping, use_cascade=use_cascade, common_prefix_len=common_prefix_len, cu_prefix_query_lens=cu_prefix_query_lens, - cu_prefix_kv_lens=cu_prefix_kv_lens, - cu_suffix_kv_lens=cu_suffix_kv_lens, + prefix_kv_lens=prefix_kv_lens, + suffix_kv_lens=suffix_kv_lens, ) # NOTE(woosuk): Due to chunked prefills, there can be at most 1 partial # request in the batch. While we should not sample any token from this @@ -629,19 +630,34 @@ def _execute_encoder(self, scheduler_output: "SchedulerOutput"): for input_id in encoder_input_ids: mm_inputs.append(req_state.mm_inputs[input_id]) req_input_ids.append((req_id, input_id)) - batched_mm_inputs = MultiModalKwargs.batch(mm_inputs) - batched_mm_inputs = MultiModalKwargs.as_kwargs(batched_mm_inputs, - device=self.device) - - # Run the encoder. - # `encoder_outputs` is either of the following: - # 1. A tensor of shape [num_images, feature_size, hidden_size] - # in case when feature_size is fixed across all images. - # 2. A list (length: num_images) of tensors, each of shape - # [feature_size, hidden_size] in case when the feature size is - # dynamic depending on input images. - encoder_outputs = self.model.get_multimodal_embeddings( - **batched_mm_inputs) + + # Batch mm inputs as much as we can: if a request in the batch has + # multiple modalities or a different modality than the previous one, + # we process it separately to preserve item order. + # FIXME(ywang96): This is a hacky way to deal with multiple modalities + # in the same batch while still being able to benefit from batching + # multimodal inputs. The proper solution should be reordering the + # encoder outputs. 
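The grouping step referenced above batches multimodal inputs per modality while preserving item order, so mixed-modality batches still map back to the right requests. A hedged sketch of one way such grouping can be done, grouping consecutive items by a modality key; this is an illustration, not the actual group_mm_inputs_by_modality implementation.

from itertools import groupby
from typing import Dict, List


def group_by_modality(mm_inputs: List[Dict]) -> List[List[Dict]]:
    """Group consecutive inputs that share the same set of modality keys."""

    def modality_key(mm_input: Dict) -> frozenset:
        # Assume each input is a dict keyed by modality (e.g. "image", "audio").
        return frozenset(mm_input.keys())

    return [list(group) for _, group in groupby(mm_inputs, key=modality_key)]


# Consecutive image inputs are batched together; the audio input starts a
# new group, keeping the original item order intact.
groups = group_by_modality([
    {"image": "img_0"},
    {"image": "img_1"},
    {"audio": "aud_0"},
    {"image": "img_2"},
])
assert [len(g) for g in groups] == [2, 1, 1]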
+ grouped_mm_inputs_list = group_mm_inputs_by_modality(mm_inputs) + + encoder_outputs = [] + for grouped_mm_inputs in grouped_mm_inputs_list: + batched_mm_inputs = MultiModalKwargs.batch(grouped_mm_inputs) + batched_mm_inputs = MultiModalKwargs.as_kwargs(batched_mm_inputs, + device=self.device) + + # Run the encoder. + # `curr_group_outputs` is either of the following: + # 1. A tensor of shape (num_items, feature_size, hidden_size) + # in case feature_size is fixed across all multimodal items. + # 2. A list or tuple (length: num_items) of tensors, each of shape + # (feature_size, hidden_size) in case the feature size is dynamic + # depending on the input multimodal items. + curr_group_outputs = self.model.get_multimodal_embeddings( + **batched_mm_inputs) + + for output in curr_group_outputs: + encoder_outputs.append(output) # Cache the encoder outputs. for (req_id, input_id), output in zip(req_input_ids, encoder_outputs): @@ -766,10 +782,10 @@ def execute_model( sampling_metadata=sampling_metadata, ) - sampled_token_ids = sampler_output.sampled_token_ids # TODO(woosuk): The following loop can be slow since it iterates over # the requests one by one. Optimize. num_reqs = self.input_batch.num_reqs + request_seq_lens: List[Tuple[int, CachedRequestState, int]] = [] for i, req_id in enumerate(self.input_batch.req_ids[:num_reqs]): assert req_id is not None req_state = self.requests[req_id] @@ -778,10 +794,10 @@ def execute_model( assert seq_len <= req_state.num_tokens if seq_len == req_state.num_tokens: # Append the sampled token to the output token ids. - token_id = sampled_token_ids[i] - self.input_batch.token_ids_cpu[i, seq_len] = token_id self.input_batch.num_tokens[i] += 1 - req_state.output_token_ids.append(token_id) + # OPTIMIZATION: Priming the state updates for later updates. + req_state.output_token_ids.append(0) + request_seq_lens.append((i, req_state, seq_len)) else: # Ignore the sampled token from the partial request. # Rewind the generator state as if the token was not sampled. @@ -790,6 +806,21 @@ def execute_model( # This relies on cuda-specific torch-internal impl details generator.set_offset(generator.get_offset() - 4) + # num_reqs entries should be non-None + assert all( + req_id is not None for req_id in + self.input_batch.req_ids[:num_reqs]), "req_ids contains None" + req_ids = cast(List[str], self.input_batch.req_ids[:num_reqs]) + + # NOTE: GPU -> CPU Sync happens here. + # Move as many CPU operations as possible before this sync point. 
+ sampled_token_ids = sampler_output.sampled_token_ids.tolist() + # Update with the actual token ids + for i, req_state, seq_len in request_seq_lens: + token_id = sampled_token_ids[i] + self.input_batch.token_ids_cpu[i, seq_len] = token_id + req_state.output_token_ids[-1] = token_id + if sampler_output.logprob_token_ids is None: logprob_token_ids = None else: @@ -799,12 +830,6 @@ def execute_model( else: logprobs = sampler_output.logprobs.cpu() - # num_reqs entries should be non-None - assert all( - req_id is not None for req_id in - self.input_batch.req_ids[:num_reqs]), "req_ids contains None" - req_ids = cast(List[str], self.input_batch.req_ids[:num_reqs]) - model_runner_output = ModelRunnerOutput( req_ids=req_ids, req_id_to_index=self.input_batch.req_id_to_index, @@ -826,10 +851,12 @@ def load_model(self) -> None: @torch.inference_mode() def _dummy_run( self, - model: nn.Module, num_tokens: int, - kv_caches: List[torch.Tensor], + kv_caches: Optional[List[torch.Tensor]] = None, ) -> torch.Tensor: + model = self.model + if kv_caches is None: + kv_caches = self.kv_caches if self.is_multimodal_model: input_ids = None inputs_embeds = self.inputs_embeds[:num_tokens] @@ -955,8 +982,7 @@ def profile_run(self) -> None: self.encoder_cache["tmp"] = dict(enumerate(dummy_encoder_outputs)) # Trigger compilation for general shape. - hidden_states = self._dummy_run(self.model, self.max_num_tokens, - dummy_kv_caches) + hidden_states = self._dummy_run(self.max_num_tokens, dummy_kv_caches) logits = self.model.compute_logits(hidden_states, None) logits = logits[:self.max_num_tokens] # TODO(woosuk): Consider the memory usage of the sampler. @@ -982,8 +1008,8 @@ def capture_model(self) -> None: for num_tokens in reversed(self.cudagraph_batch_sizes): for _ in range(self.vllm_config.compilation_config. cudagraph_num_of_warmups): - self._dummy_run(self.model, num_tokens, self.kv_caches) - self._dummy_run(self.model, num_tokens, self.kv_caches) + self._dummy_run(num_tokens) + self._dummy_run(num_tokens) end_time = time.perf_counter() end_free_gpu_memory = torch.cuda.mem_get_info()[0] diff --git a/vllm/v1/worker/gpu_worker.py b/vllm/v1/worker/gpu_worker.py index bd40112aea5e..a8cf0aec3f17 100644 --- a/vllm/v1/worker/gpu_worker.py +++ b/vllm/v1/worker/gpu_worker.py @@ -9,12 +9,14 @@ import vllm.envs as envs from vllm.config import ParallelConfig, VllmConfig +from vllm.device_allocator.cumem import CuMemAllocator from vllm.distributed import (ensure_model_parallel_initialized, init_distributed_environment, set_custom_all_reduce) from vllm.logger import init_logger from vllm.model_executor import set_random_seed from vllm.platforms import current_platform +from vllm.utils import GiB_bytes from vllm.v1.core.scheduler import SchedulerOutput from vllm.v1.kv_cache_interface import KVCacheConfig, KVCacheSpec from vllm.v1.outputs import ModelRunnerOutput @@ -77,6 +79,23 @@ def __init__( else: self.profiler = None + def sleep(self, level: int = 1) -> None: + free_bytes_before_sleep = torch.cuda.mem_get_info()[0] + allocator = CuMemAllocator.get_instance() + allocator.sleep(offload_tags=("weights", ) if level == 1 else tuple()) + free_bytes_after_sleep, total = torch.cuda.mem_get_info() + freed_bytes = free_bytes_after_sleep - free_bytes_before_sleep + used_bytes = total - free_bytes_after_sleep + assert freed_bytes >= 0, "Memory usage increased after sleeping." 
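The reordering above defers the blocking .tolist() call, the only device-to-host sync, until all other CPU-side bookkeeping is done: placeholder entries are primed first and patched once the token ids arrive. A small sketch of that idea in plain PyTorch; the surrounding names are illustrative.

import torch

# Pretend these token ids were sampled on the GPU (falls back to CPU here).
device = "cuda" if torch.cuda.is_available() else "cpu"
sampled = torch.tensor([11, 22, 33], dtype=torch.int32, device=device)

output_token_ids = [[] for _ in range(3)]  # per-request CPU-side state

# 1) Do all CPU-only bookkeeping first, priming placeholder slots.
primed = []
for i, toks in enumerate(output_token_ids):
    toks.append(0)            # placeholder, patched below
    primed.append((i, toks))

# 2) Single device -> host sync point, as late as possible.
sampled_list = sampled.tolist()

# 3) Patch the placeholders with the real token ids.
for i, toks in primed:
    toks[-1] = sampled_list[i]

assert [t[-1] for t in output_token_ids] == [11, 22, 33]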
+ logger.info( + "Sleep mode freed %.2f GiB memory, " + "%.2f GiB memory is still in use.", freed_bytes / GiB_bytes, + used_bytes / GiB_bytes) + + def wake_up(self) -> None: + allocator = CuMemAllocator.get_instance() + allocator.wake_up() + def init_device(self): if self.device_config.device.type == "cuda": # torch.distributed.all_reduce does not free the input tensor until @@ -110,7 +129,17 @@ def init_device(self): self.model_runner = GPUModelRunner(self.vllm_config, self.device) def load_model(self) -> None: - self.model_runner.load_model() + if self.vllm_config.model_config.enable_sleep_mode: + allocator = CuMemAllocator.get_instance() + assert allocator.get_current_usage() == 0, ( + "Sleep mode can only be " + "used for one instance per process.") + context = allocator.use_memory_pool(tag="weights") + else: + from contextlib import nullcontext + context = nullcontext() + with context: + self.model_runner.load_model() @torch.inference_mode() def determine_available_memory(self) -> int: @@ -167,9 +196,28 @@ def get_kv_cache_spec(self) -> KVCacheSpec: def initialize_cache(self, kv_cache_config: KVCacheConfig) -> None: """Allocate GPU KV cache with the specified kv_cache_config.""" - self.model_runner.initialize_kv_cache(kv_cache_config) + if self.vllm_config.model_config.enable_sleep_mode: + allocator = CuMemAllocator.get_instance() + context = allocator.use_memory_pool(tag="kv_cache") + else: + from contextlib import nullcontext + context = nullcontext() + with context: + self.model_runner.initialize_kv_cache(kv_cache_config) def compile_or_warm_up_model(self) -> None: + # warm up sizes that are not in cudagraph capture sizes, + # but users still want to compile for better performance, + # e.g. for the max-num-batched token size in chunked prefill. + warmup_sizes = self.vllm_config.compilation_config.compile_sizes.copy() + if not self.model_config.enforce_eager: + warmup_sizes = [ + x for x in warmup_sizes if x not in + self.vllm_config.compilation_config.cudagraph_capture_sizes + ] + for size in sorted(warmup_sizes, reverse=True): + logger.info("Compile and warming up model for size %d", size) + self.model_runner._dummy_run(size) if not self.model_config.enforce_eager: self.model_runner.capture_model() # Reset the seed to ensure that the random state is not affected by diff --git a/vllm/worker/cpu_model_runner.py b/vllm/worker/cpu_model_runner.py index abbf6450ab7f..4b429b67b36f 100644 --- a/vllm/worker/cpu_model_runner.py +++ b/vllm/worker/cpu_model_runner.py @@ -144,9 +144,7 @@ def __init__(self, runner: "CPUModelRunner", finished_requests_ids: Optional[List[str]] = None) -> None: super().__init__() - self.seq_group_metadata_list: List[SequenceGroupMetadata] = [] self.runner = runner - self.chunked_prefill = (runner.scheduler_config.chunked_prefill_enabled or runner.cache_config.enable_prefix_caching) self.model_input_cls = self.runner._model_input_cls @@ -156,10 +154,17 @@ def __init__(self, self.device = self.runner.device self.multi_modal_input_mapper = self.runner.multi_modal_input_mapper self.enable_lora = self.runner.lora_config is not None + if self.runner.attn_backend is not None: + # spec decode (e.g. 
Medusa) does not have atten backend + attn_backend = self.runner.attn_backend + self.att_metadata_builder = attn_backend.get_builder_cls()(self) + + def prepare(self, + finished_requests_ids: Optional[List[str]] = None) -> None: + self.seq_group_metadata_list: List[SequenceGroupMetadata] = [] self.input_data = ModelInputForCPUBuilder.ModelInputData( self.runner.model_config.uses_mrope) - self.att_metadata_builder = self.runner.attn_backend.get_builder_cls()( - self) + self.att_metadata_builder.prepare() def add_seq_group(self, seq_group_metadata: SequenceGroupMetadata): self.seq_group_metadata_list.append(seq_group_metadata) @@ -431,6 +436,7 @@ class CPUModelRunnerBase(ModelRunnerBase[TModelInputForCPU]): """ _model_input_cls: Type[TModelInputForCPU] _builder_cls: Type[ModelInputForCPUBuilder] + builder: ModelInputForCPUBuilder def __init__( self, @@ -477,6 +483,10 @@ def __init__( # Set after load_model. self.lora_manager: Optional[LRUCacheWorkerLoRAManager] = None + if hasattr(self, "_builder_cls"): + # multi-step model runner does not have `_builder_cls` + self.builder = self._builder_cls(weakref.proxy(self)) + def load_model(self) -> None: self.model = get_model(vllm_config=self.vllm_config) @@ -522,10 +532,10 @@ def _prepare_model_input_tensors( metadata for possible additional steps, e.g., sampling. """ - builder = self._builder_cls(weakref.proxy(self), finished_requests_ids) - builder.set_seq_group_list(seq_group_metadata_list) + self.builder.prepare(finished_requests_ids) + self.builder.set_seq_group_list(seq_group_metadata_list) - return builder.build() # type: ignore + return self.builder.build() # type: ignore # sampler property will be used by spec_decode_worker @property diff --git a/vllm/worker/hpu_worker.py b/vllm/worker/hpu_worker.py index 9401241073c7..3c570212625c 100644 --- a/vllm/worker/hpu_worker.py +++ b/vllm/worker/hpu_worker.py @@ -130,7 +130,6 @@ def execute_model( self, execute_model_req: Optional[ExecuteModelRequest] = None, ) -> Optional[List[SamplerOutput]]: - assert execute_model_req is not None # VLLM_HPU_LOG_STEP_GRAPH_COMPILATION - will log graph compilations per engine step, only when there was any - highly recommended to use alongside PT_HPU_METRICS_GC_DETAILS! # noqa:E501 # VLLM_HPU_LOG_STEP_GRAPH_COMPILATION_ALL - will log graph compilations per engine step, always, even if there were none # noqa:E501 # VLLM_HPU_LOG_STEP_CPU_FALLBACKS - will log cpu fallbacks per engine step, only when there was any # noqa:E501 @@ -144,7 +143,8 @@ def execute_model( 'VLLM_HPU_LOG_STEP_CPU_FALLBACKS_ALL', '0') != '0' log_cpu_fallbacks = os.environ.get('VLLM_HPU_LOG_STEP_CPU_FALLBACKS', '0') != '0' or log_cpu_fallbacks_all - if log_graph_compilation or log_cpu_fallbacks: + if (log_graph_compilation or log_cpu_fallbacks) and \ + execute_model_req is not None: from habana_frameworks.torch.hpu.metrics import metric_localcontext seq_group_metadata_list = execute_model_req.seq_group_metadata_list is_prompt = any([ diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py index b71580172a0d..9c6ec7818385 100644 --- a/vllm/worker/model_runner.py +++ b/vllm/worker/model_runner.py @@ -455,17 +455,12 @@ def __init__(self, self.enable_prompt_adapter = (self.runner.prompt_adapter_config is not None) self.multi_modal_input_mapper = self.runner.multi_modal_input_mapper - self.finished_requests_ids = finished_requests_ids - self.decode_only = True - - # Intermediate data (data in CPU before going to GPU) for - # the current sequence group. 
- self.inter_data_list: List[ - ModelInputForGPUBuilder.InterDataForSeqGroup] = [] # Attention metadata inputs. - self.attn_metadata_builder = self.attn_backend.make_metadata_builder( - weakref.proxy(self)) + if self.attn_backend is not None: + # spec decode (e.g. Medusa) does not have atten backend + self.attn_metadata_builder = self.attn_backend.get_builder_cls()( + weakref.proxy(self)) # Engine/Model configurations. self.chunked_prefill_enabled = ( @@ -477,6 +472,21 @@ def __init__(self, self.block_aligned_sliding_window = \ self.sliding_window_blocks * self.block_size + def prepare(self, + finished_requests_ids: Optional[List[str]] = None) -> None: + self.finished_requests_ids = finished_requests_ids + + # if the current batch is decode-only. + # will be set to False if there is any non-decode request. + self.decode_only = True + + # Intermediate data (data in CPU before going to GPU) for + # the current sequence group. + self.inter_data_list: List[ + ModelInputForGPUBuilder.InterDataForSeqGroup] = [] + + self.attn_metadata_builder.prepare() + def _compute_lens(self, inter_data: InterDataForSeqGroup, seq_idx: int, seq_group_metadata: SequenceGroupMetadata): """Compute context length, sequence length and tokens @@ -991,6 +1001,7 @@ class GPUModelRunnerBase(ModelRunnerBase[TModelInputForGPU]): """ _model_input_cls: Type[TModelInputForGPU] _builder_cls: Type[ModelInputForGPUBuilder] + builder: ModelInputForGPUBuilder def __init__( self, @@ -1091,6 +1102,10 @@ def __init__( SamplingMetadataCache() \ if self.parallel_config.pipeline_parallel_size == 1 else None + if hasattr(self, "_builder_cls"): + # multi-step model runner does not have `_builder_cls` + self.builder = self._builder_cls(weakref.proxy(self)) + def load_model(self) -> None: logger.info("Starting to load model %s...", self.model_config.model) with DeviceMemoryProfiler() as m: @@ -1196,13 +1211,13 @@ def _prepare_model_input_tensors( If cuda graph is required, this API automatically pads inputs. """ - builder = self._builder_cls(weakref.proxy(self), finished_requests_ids) + self.builder.prepare(finished_requests_ids) for seq_group_metadata in seq_group_metadata_list: - builder.add_seq_group(seq_group_metadata) + self.builder.add_seq_group(seq_group_metadata) - builder.reset_cached_inter_data() + self.builder.reset_cached_inter_data() - return builder.build() # type: ignore + return self.builder.build() # type: ignore @contextmanager def set_in_profile_run(self): @@ -1214,13 +1229,19 @@ def set_in_profile_run(self): @torch.inference_mode() def profile_run(self) -> None: + max_num_batched_tokens = \ + self.scheduler_config.max_num_batched_tokens + max_num_seqs = self.scheduler_config.max_num_seqs + self._dummy_run(max_num_batched_tokens, max_num_seqs) + + def _dummy_run(self, + max_num_batched_tokens: int, + max_num_seqs: int = 1) -> None: with self.set_in_profile_run(): # Enable top-k sampling to reflect the accurate memory usage. 
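The model-runner change above constructs the input builder once and reuses it across steps, with prepare() resetting the per-step state instead of re-allocating a new builder every call. A minimal sketch of that reuse pattern; the builder fields are illustrative.

from typing import List, Optional


class InputBuilder:
    """Reusable builder: allocate once, reset cheap per-step state in prepare()."""

    def __init__(self, runner: object) -> None:
        self.runner = runner  # long-lived configuration lives here

    def prepare(self,
                finished_requests_ids: Optional[List[str]] = None) -> None:
        # Per-step state is re-initialized instead of re-creating the builder.
        self.finished_requests_ids = finished_requests_ids
        self.seq_groups: List[object] = []

    def add_seq_group(self, seq_group: object) -> None:
        self.seq_groups.append(seq_group)

    def build(self) -> List[object]:
        return list(self.seq_groups)


builder = InputBuilder(runner=None)   # constructed once, e.g. in __init__
for step_inputs in ([1, 2], [3]):     # reused across scheduling steps
    builder.prepare(finished_requests_ids=None)
    for sg in step_inputs:
        builder.add_seq_group(sg)
    assert builder.build() == step_inputs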
sampling_params = \ SamplingParams(top_p=0.99, top_k=self.vocab_size - 1) - max_num_batched_tokens = \ - self.scheduler_config.max_num_batched_tokens - max_num_seqs = self.scheduler_config.max_num_seqs + # This represents the maximum number of different requests # that will have unique loras, an therefore the max amount of memory # consumption create dummy lora request copies from the lora request @@ -1453,13 +1474,14 @@ def capture_model(self, kv_caches: List[List[torch.Tensor]]) -> None: for virtual_engine in range( self.parallel_config.pipeline_parallel_size): # Only rank 0 should print progress bar during capture - capture_sizes = ( - tqdm( - self.vllm_config.compilation_config.capture_sizes, - desc="Capturing CUDA graph shapes", - ) if get_tensor_model_parallel_rank() == 0 else - self.vllm_config.compilation_config.capture_sizes) - for batch_size in capture_sizes: + cudagraph_capture_sizes = (tqdm( + self.vllm_config.compilation_config. + cudagraph_capture_sizes, + desc="Capturing CUDA graph shapes", + ) if get_tensor_model_parallel_rank() == 0 else + self.vllm_config.compilation_config. + cudagraph_capture_sizes) + for batch_size in cudagraph_capture_sizes: attn_metadata = ( self.attn_state.graph_capture_get_metadata_for_batch( batch_size, diff --git a/vllm/worker/model_runner_base.py b/vllm/worker/model_runner_base.py index acfd6d0b03f6..aef4bdcdd4bf 100644 --- a/vllm/worker/model_runner_base.py +++ b/vllm/worker/model_runner_base.py @@ -200,6 +200,11 @@ class ModelRunnerInputBuilderBase(ABC, Generic[T]): """A builder to create ModelRunnerInputBase objects. """ + @abstractmethod + def prepare(self, + finished_requests_ids: Optional[List[str]] = None) -> None: + raise NotImplementedError + @abstractmethod def add_seq_group(self, seq_group_metadata): """TBA""" diff --git a/vllm/worker/worker.py b/vllm/worker/worker.py index 918d9e31dcf3..c115dc1d2a4e 100644 --- a/vllm/worker/worker.py +++ b/vllm/worker/worker.py @@ -9,6 +9,7 @@ import vllm.envs as envs from vllm.config import VllmConfig +from vllm.device_allocator.cumem import CuMemAllocator from vllm.distributed import (ensure_kv_transfer_initialized, ensure_model_parallel_initialized, init_distributed_environment, @@ -148,6 +149,23 @@ def stop_profile(self): else: self.profiler.stop() + def sleep(self, level: int = 1) -> None: + free_bytes_before_sleep = torch.cuda.mem_get_info()[0] + allocator = CuMemAllocator.get_instance() + allocator.sleep(offload_tags=("weights", ) if level == 1 else tuple()) + free_bytes_after_sleep, total = torch.cuda.mem_get_info() + freed_bytes = free_bytes_after_sleep - free_bytes_before_sleep + used_bytes = total - free_bytes_after_sleep + assert freed_bytes >= 0, "Memory usage increased after sleeping." 
+ logger.info( + "Sleep mode freed %.2f GiB memory, " + "%.2f GiB memory is still in use.", freed_bytes / GiB_bytes, + used_bytes / GiB_bytes) + + def wake_up(self) -> None: + allocator = CuMemAllocator.get_instance() + allocator.wake_up() + def init_device(self) -> None: if self.device_config.device.type == "cuda": # torch.distributed.all_reduce does not free the input tensor until @@ -179,7 +197,17 @@ def init_device(self) -> None: set_random_seed(self.model_config.seed) def load_model(self): - self.model_runner.load_model() + if self.vllm_config.model_config.enable_sleep_mode: + allocator = CuMemAllocator.get_instance() + assert allocator.get_current_usage() == 0, ( + "Sleep mode can only be " + "used for one instance per process.") + context = allocator.use_memory_pool(tag="weights") + else: + from contextlib import nullcontext + context = nullcontext() + with context: + self.model_runner.load_model() def save_sharded_state( self, @@ -298,7 +326,14 @@ def initialize_cache(self, num_gpu_blocks: int, self.cache_config.num_gpu_blocks = num_gpu_blocks self.cache_config.num_cpu_blocks = num_cpu_blocks - self._init_cache_engine() + if self.vllm_config.model_config.enable_sleep_mode: + allocator = CuMemAllocator.get_instance() + context = allocator.use_memory_pool(tag="kv_cache") + else: + from contextlib import nullcontext + context = nullcontext() + with context: + self._init_cache_engine() self._warm_up_model() def _init_cache_engine(self): @@ -316,6 +351,18 @@ def _init_cache_engine(self): self.gpu_cache) def _warm_up_model(self) -> None: + # warm up sizes that are not in cudagraph capture sizes, + # but users still want to compile for better performance, + # e.g. for the max-num-batched token size in chunked prefill. + warmup_sizes = self.vllm_config.compilation_config.compile_sizes.copy() + if not self.model_config.enforce_eager: + warmup_sizes = [ + x for x in warmup_sizes if x not in + self.vllm_config.compilation_config.cudagraph_capture_sizes + ] + for size in sorted(warmup_sizes, reverse=True): + logger.info("Compile and warming up model for size %d", size) + self.model_runner._dummy_run(size) if not self.model_config.enforce_eager: self.model_runner.capture_model(self.gpu_cache) # Reset the seed to ensure that the random state is not affected by diff --git a/vllm/worker/worker_base.py b/vllm/worker/worker_base.py index c6e6693c54f5..6eeb4aa17051 100644 --- a/vllm/worker/worker_base.py +++ b/vllm/worker/worker_base.py @@ -8,7 +8,8 @@ import torch import torch.nn as nn -from vllm.config import ObservabilityConfig, VllmConfig +from vllm.config import (ObservabilityConfig, VllmConfig, + set_current_vllm_config) from vllm.distributed import broadcast_tensor_dict, get_pp_group, get_tp_group from vllm.logger import init_logger from vllm.lora.request import LoRARequest @@ -498,8 +499,11 @@ def __init__( group. """ self.rpc_rank = rpc_rank - self.vllm_config = vllm_config self.worker: Optional[WorkerBase] = None + # do not store this `vllm_config`, `init_worker` will set the final + # one. TODO: investigate if we can remove this field in + # `WorkerWrapperBase`, `init_cached_hf_modules` should be + # unnecessary now. if vllm_config.model_config is not None: # it can be None in tests trust_remote_code = vllm_config.model_config.trust_remote_code @@ -533,6 +537,9 @@ def init_worker(self, all_kwargs: List[Dict[str, Any]]) -> None: Arguments are passed to the worker class constructor. 
""" kwargs = all_kwargs[self.rpc_rank] + self.vllm_config = kwargs.get("vllm_config", None) + assert self.vllm_config is not None, ( + "vllm_config is required to initialize the worker") enable_trace_function_call_for_thread(self.vllm_config) from vllm.plugins import load_general_plugins @@ -546,8 +553,10 @@ def init_worker(self, all_kwargs: List[Dict[str, Any]]) -> None: bytes) worker_class = cloudpickle.loads( self.vllm_config.parallel_config.worker_cls) - self.worker = worker_class(**kwargs) - assert self.worker is not None + with set_current_vllm_config(self.vllm_config): + # To make vLLM config available during worker initialization + self.worker = worker_class(**kwargs) + assert self.worker is not None def execute_method(self, method: Union[str, bytes], *args, **kwargs): try: diff --git a/vllm/worker/xpu_model_runner.py b/vllm/worker/xpu_model_runner.py index ffe8c3219dbe..b7b7b7227b22 100644 --- a/vllm/worker/xpu_model_runner.py +++ b/vllm/worker/xpu_model_runner.py @@ -113,7 +113,6 @@ def __init__(self, runner: "XPUModelRunner", finished_requests_ids: Optional[List[str]] = None) -> None: super().__init__() - self.seq_group_metadata_list: List[SequenceGroupMetadata] = [] self.runner = runner self.model_input_cls = self.runner._model_input_cls self.attn_backend = self.runner.attn_backend @@ -121,6 +120,10 @@ def __init__(self, self.block_size = self.runner.block_size self.device = self.runner.device + def prepare(self, + finished_requests_ids: Optional[List[str]] = None) -> None: + self.seq_group_metadata_list: List[SequenceGroupMetadata] = [] + def add_seq_group(self, seq_group_metadata: SequenceGroupMetadata): self.seq_group_metadata_list.append(seq_group_metadata) @@ -410,6 +413,8 @@ def __init__( SamplingMetadataCache() \ if self.parallel_config.pipeline_parallel_size == 1 else None + self.builder = self._builder_cls(weakref.proxy(self)) + def load_model(self) -> None: with DeviceMemoryProfiler() as m: self.model = get_model(vllm_config=self.vllm_config) @@ -519,7 +524,8 @@ def _prepare_model_input_tensors( metadata for possible additional steps, e.g., sampling. """ - builder = self._builder_cls(weakref.proxy(self), finished_requests_ids) + builder = self.builder + builder.prepare(finished_requests_ids) for seq_group_metadata in seq_group_metadata_list: builder.add_seq_group(seq_group_metadata)