diff --git a/.buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh b/.buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh index 32bd34c431c8..4d01a314adc4 100644 --- a/.buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh +++ b/.buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh @@ -426,7 +426,7 @@ main() { pip install -U transformers - pip install -r requirements-dev.txt + pip install -r requirements/dev.txt which genai-perf # check storage diff --git a/.buildkite/run-amd-test.sh b/.buildkite/run-amd-test.sh index 731b71b6124e..706737e70d04 100755 --- a/.buildkite/run-amd-test.sh +++ b/.buildkite/run-amd-test.sh @@ -57,7 +57,7 @@ while true; do done echo "--- Pulling container" -image_name="rocm/vllm-ci:${BUILDKITE_COMMIT}" +image_name="rocm/vllm-ci-private:${BUILDKITE_COMMIT}" container_name="rocm_${BUILDKITE_COMMIT}_$(tr -dc A-Za-z0-9 < /dev/urandom | head -c 10; echo)" docker pull "${image_name}" @@ -93,7 +93,12 @@ if [[ $commands == *" kernels "* ]]; then --ignore=kernels/test_rand.py \ --ignore=kernels/test_sampler.py \ --ignore=kernels/test_cascade_flash_attn.py \ - --ignore=kernels/test_mamba_mixer2.py" + --ignore=kernels/test_mamba_mixer2.py \ + --ignore=kernels/test_aqlm.py \ + --ignore=kernels/test_machete_mm.py \ + --ignore=kernels/test_mha_attn.py \ + --ignore=kernels/test_block_fp8.py \ + --ignore=kernels/test_permute_cols.py" fi #ignore certain Entrypoints tests diff --git a/.buildkite/run-cpu-test.sh b/.buildkite/run-cpu-test.sh index 2ead1f51ed81..f6dad818ddc0 100644 --- a/.buildkite/run-cpu-test.sh +++ b/.buildkite/run-cpu-test.sh @@ -35,7 +35,7 @@ function cpu_tests() { # Run basic model test docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c " set -e - pip install -r vllm/requirements-test.txt + pip install -r vllm/requirements/test.txt pytest -v -s tests/models/decoder_only/language -m cpu_model pytest -v -s tests/models/embedding/language -m cpu_model pytest -v -s tests/models/encoder_decoder/language -m cpu_model diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index 38449c806464..5e11acd953cc 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -35,7 +35,7 @@ steps: fast_check: true no_gpu: True commands: - - pip install -r requirements-docs.txt + - pip install -r ../../requirements/docs.txt - SPHINXOPTS=\"-W\" make html # Check API reference (if it fails, you may have missing mock imports) - grep \"sig sig-object py\" build/html/api/inference_params.html @@ -78,6 +78,7 @@ steps: - tests/basic_correctness/test_preemption - tests/basic_correctness/test_cumem.py commands: + - export VLLM_WORKER_MULTIPROC_METHOD=spawn - pytest -v -s basic_correctness/test_cumem.py - pytest -v -s basic_correctness/test_basic_correctness.py - pytest -v -s basic_correctness/test_cpu_offload.py @@ -115,6 +116,7 @@ steps: - tests/entrypoints/test_chat_utils - tests/entrypoints/offline_mode commands: + - export VLLM_WORKER_MULTIPROC_METHOD=spawn - pytest -v -s entrypoints/llm --ignore=entrypoints/llm/test_lazy_outlines.py --ignore=entrypoints/llm/test_generate.py --ignore=entrypoints/llm/test_generate_multiple_loras.py --ignore=entrypoints/llm/test_guided_generate.py --ignore=entrypoints/llm/test_collective_rpc.py - pytest -v -s entrypoints/llm/test_lazy_outlines.py # it needs a clean process - pytest -v -s entrypoints/llm/test_generate.py # it needs a clean process @@ -146,8 +148,10 @@ steps: - pytest -v -s spec_decode/e2e/test_integration_dist_tp4.py # TODO: create a dedicated 
test section for multi-GPU example tests # when we have multiple distributed example tests - - python3 ../examples/offline_inference/rlhf.py - - RAY_DEDUP_LOGS=0 python3 ../examples/offline_inference/rlhf_colocate.py + - pushd ../examples/offline_inference + - python3 rlhf.py + - RAY_DEDUP_LOGS=0 python3 rlhf_colocate.py + - popd - label: Metrics, Tracing Test # 10min num_gpus: 2 @@ -204,6 +208,7 @@ steps: - VLLM_USE_V1=1 pytest -v -s v1/engine - VLLM_USE_V1=1 pytest -v -s v1/sample - VLLM_USE_V1=1 pytest -v -s v1/worker + - VLLM_USE_V1=1 pytest -v -s v1/structured_output - VLLM_USE_V1=1 pytest -v -s v1/test_stats.py - VLLM_USE_V1=1 pytest -v -s v1/test_utils.py # TODO: accuracy does not match, whether setting diff --git a/.buildkite/test-template.j2 b/.buildkite/test-template.j2 index 573675d67f86..3ecbd64a973b 100644 --- a/.buildkite/test-template.j2 +++ b/.buildkite/test-template.j2 @@ -1,5 +1,5 @@ {% set docker_image = "public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT" %} -{% set docker_image_amd = "rocm/vllm-ci:$BUILDKITE_COMMIT" %} +{% set docker_image_amd = "rocm/vllm-ci-private:$BUILDKITE_COMMIT" %} {% set default_working_dir = "vllm/tests" %} {% set hf_home = "/root/.cache/huggingface" %} @@ -7,7 +7,7 @@ steps: - label: ":docker: build image" depends_on: ~ commands: - - "docker build --build-arg max_jobs=16 --tag {{ docker_image_amd }} -f Dockerfile.rocm --target test --progress plain ." + - "docker build --build-arg max_jobs=16 --tag {{ docker_image_amd }} -f Dockerfile.rocm --build-arg ARG_PYTORCH_ROCM_ARCH='gfx90a;gfx942' --target test --progress plain ." - "docker push {{ docker_image_amd }}" key: "amd-build" env: @@ -27,7 +27,15 @@ steps: depends_on: - "amd-build" agents: +{% if step.amd_gpus and step.amd_gpus==8%} queue: amd_gpu +{% elif step.amd_gpus and step.amd_gpus==4%} + queue: amd_gpu +{% elif step.amd_gpus and step.amd_gpus==2%} + queue: amd_gpu +{% else%} + queue: amd_gpu +{% endif%} commands: - bash .buildkite/run-amd-test.sh "cd {{ (step.working_dir or default_working_dir) | safe }} ; {{ step.command or (step.commands | join(" && ")) | safe }}" env: diff --git a/.github/mergify.yml b/.github/mergify.yml index e41107ae0a01..54f56210b286 100644 --- a/.github/mergify.yml +++ b/.github/mergify.yml @@ -36,6 +36,21 @@ pull_request_rules: add: - frontend +- name: label-multi-modality + description: Automatically apply multi-modality label + conditions: + - or: + - files~=^vllm/multimodal/ + - files~=^tests/multimodal/ + - files~=^tests/models/multimodal/ + - files~=^tests/models/*/audio_language/ + - files~=^tests/models/*/vision_language/ + - files=tests/models/test_vision.py + actions: + label: + add: + - multi-modality + - name: label-structured-output description: Automatically apply structured-output label conditions: diff --git a/.github/workflows/scripts/build.sh b/.github/workflows/scripts/build.sh index f0a4e4baf1ae..2bb7b726194d 100644 --- a/.github/workflows/scripts/build.sh +++ b/.github/workflows/scripts/build.sh @@ -5,7 +5,7 @@ python_executable=python3 # Update paths # Install requirements -$python_executable -m pip install -r requirements-rocm.txt +$python_executable -m pip install -r requirements/rocm.txt # Limit the number of parallel jobs to avoid OOM export MAX_JOBS=1 diff --git a/.gitignore b/.gitignore index 89dab8f13bab..e40752f4dea0 100644 --- a/.gitignore +++ b/.gitignore @@ -197,7 +197,7 @@ _build/ hip_compat.h # Benchmark dataset -benchmarks/*.json +benchmarks/**/*.json # Linting actionlint diff --git a/.pre-commit-config.yaml 
b/.pre-commit-config.yaml index 23a38d49638f..074ac9d122bf 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -44,8 +44,8 @@ repos: rev: 0.6.2 hooks: - id: pip-compile - args: [requirements-test.in, -o, requirements-test.txt] - files: ^requirements-test\.(in|txt)$ + args: [requirements/test.in, -o, requirements/test.txt] + files: ^requirements/test\.(in|txt)$ - repo: local hooks: - id: mypy-local diff --git a/.readthedocs.yaml b/.readthedocs.yaml index 284196bc2d27..2781ec223b66 100644 --- a/.readthedocs.yaml +++ b/.readthedocs.yaml @@ -18,4 +18,4 @@ formats: [] # Optionally declare the Python requirements required to build your docs python: install: - - requirements: docs/requirements-docs.txt + - requirements: requirements/docs.txt diff --git a/CMakeLists.txt b/CMakeLists.txt index 11ecc5491997..557540e203f6 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -31,7 +31,7 @@ set(ignoreMe "${VLLM_PYTHON_PATH}") set(PYTHON_SUPPORTED_VERSIONS "3.9" "3.10" "3.11" "3.12") # Supported NVIDIA architectures. -set(CUDA_SUPPORTED_ARCHS "7.0;7.2;7.5;8.0;8.6;8.7;8.9;9.0") +set(CUDA_SUPPORTED_ARCHS "7.0;7.2;7.5;8.0;8.6;8.7;8.9;9.0;10.0;10.1;12.0") # Supported AMD GPU architectures. set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101;gfx1200;gfx1201") @@ -312,7 +312,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") # Only build Marlin kernels if we are building for at least some compatible archs. # Keep building Marlin for 9.0 as there are some group sizes and shapes that # are not supported by Machete yet. - cuda_archs_loose_intersection(MARLIN_ARCHS "8.0;8.6;8.7;8.9;9.0" "${CUDA_ARCHS}") + cuda_archs_loose_intersection(MARLIN_ARCHS "8.0;8.6;8.7;8.9;9.0;10.0;10.1;12.0" "${CUDA_ARCHS}") if (MARLIN_ARCHS) set(MARLIN_SRCS "csrc/quantization/fp8/fp8_marlin.cu" @@ -334,7 +334,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") # Only build AllSpark kernels if we are building for at least some compatible archs. cuda_archs_loose_intersection(ALLSPARK_ARCHS "8.0;8.6;8.7;8.9" "${CUDA_ARCHS}") - if (ALLSPARK_ARCHS) + if (${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0 AND ALLSPARK_ARCHS) set(ALLSPARK_SRCS "csrc/quantization/gptq_allspark/allspark_repack.cu" "csrc/quantization/gptq_allspark/allspark_qgemm_w8a16.cu") @@ -345,46 +345,74 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") message(STATUS "Building AllSpark kernels for archs: ${ALLSPARK_ARCHS}") else() message(STATUS "Not building AllSpark kernels as no compatible archs found" - " in CUDA target architectures") + " in CUDA target architectures, or CUDA not >= 12.0") endif() + + set(SCALED_MM_3X_ARCHS) # The cutlass_scaled_mm kernels for Hopper (c3x, i.e. CUTLASS 3.x) require - # CUDA 12.0 or later (and only work on Hopper, 9.0a for now). 
- cuda_archs_loose_intersection(SCALED_MM_3X_ARCHS "9.0a" "${CUDA_ARCHS}") - if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0 AND SCALED_MM_3X_ARCHS) + # CUDA 12.0 or later + cuda_archs_loose_intersection(SCALED_MM_ARCHS "9.0a;" "${CUDA_ARCHS}") + if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0 AND SCALED_MM_ARCHS) set(SRCS - "csrc/quantization/cutlass_w8a8/scaled_mm_c3x.cu" + "csrc/quantization/cutlass_w8a8/scaled_mm_c3x_sm90.cu" "csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm90_fp8.cu" "csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm90_int8.cu" "csrc/quantization/cutlass_w8a8/c3x/scaled_mm_azp_sm90_int8.cu" "csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm90_fp8.cu") set_gencode_flags_for_srcs( SRCS "${SRCS}" - CUDA_ARCHS "${SCALED_MM_3X_ARCHS}") + CUDA_ARCHS "${SCALED_MM_ARCHS}") list(APPEND VLLM_EXT_SRC "${SRCS}") - list(APPEND VLLM_GPU_FLAGS "-DENABLE_SCALED_MM_C3X=1") - message(STATUS "Building scaled_mm_c3x for archs: ${SCALED_MM_3X_ARCHS}") + list(APPEND VLLM_GPU_FLAGS "-DENABLE_SCALED_MM_SM90=1") + # Let scaled_mm_c2x know it doesn't need to build these arches + list(APPEND SCALED_MM_3X_ARCHS "${SCALED_MM_ARCHS}") + message(STATUS "Building scaled_mm_c3x_sm90 for archs: ${SCALED_MM_ARCHS}") else() - if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0 AND SCALED_MM_3X_ARCHS) - message(STATUS "Not building scaled_mm_c3x as CUDA Compiler version is " + if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0 AND SCALED_MM_ARCHS) + message(STATUS "Not building scaled_mm_c3x_sm90 as CUDA Compiler version is " "not >= 12.0, we recommend upgrading to CUDA 12.0 or " "later if you intend on running FP8 quantized models on " "Hopper.") else() - message(STATUS "Not building scaled_mm_c3x as no compatible archs found " + message(STATUS "Not building scaled_mm_c3x_sm90 as no compatible archs found " "in CUDA target architectures") endif() + endif() - # clear SCALED_MM_3X_ARCHS so the scaled_mm_c2x kernels know we didn't - # build any 3x kernels - set(SCALED_MM_3X_ARCHS) + # The cutlass_scaled_mm kernels for Blackwell (c3x, i.e. CUTLASS 3.x) require + # CUDA 12.8 or later + cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a;10.1a;12.0a" "${CUDA_ARCHS}") + if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.8 AND SCALED_MM_ARCHS) + set(SRCS + "csrc/quantization/cutlass_w8a8/scaled_mm_c3x_sm100.cu" + "csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm100_fp8.cu" + ) + set_gencode_flags_for_srcs( + SRCS "${SRCS}" + CUDA_ARCHS "${SCALED_MM_ARCHS}") + list(APPEND VLLM_EXT_SRC "${SRCS}") + list(APPEND VLLM_GPU_FLAGS "-DENABLE_SCALED_MM_SM100=1") + # Let scaled_mm_c2x know it doesn't need to build these arches + list(APPEND SCALED_MM_3X_ARCHS "${SCALED_MM_ARCHS}") + message(STATUS "Building scaled_mm_c3x_sm100 for archs: ${SCALED_MM_ARCHS}") + else() + if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.8 AND SCALED_MM_ARCHS) + message(STATUS "Not building scaled_mm_c3x_sm100 as CUDA Compiler version is " + "not >= 12.8, we recommend upgrading to CUDA 12.8 or " + "later if you intend on running FP8 quantized models on " + "Blackwell.") + else() + message(STATUS "Not building scaled_mm_c3x_100 as no compatible archs found " + "in CUDA target architectures") + endif() endif() # # For the cutlass_scaled_mm kernels we want to build the c2x (CUTLASS 2.x) # kernels for the remaining archs that are not already built for 3x. 
cuda_archs_loose_intersection(SCALED_MM_2X_ARCHS - "7.5;8.0;8.6;8.7;8.9;9.0" "${CUDA_ARCHS}") + "7.5;8.0;8.6;8.7;8.9;9.0;10.0;10.1;12.0" "${CUDA_ARCHS}") # subtract out the archs that are already built for 3x list(REMOVE_ITEM SCALED_MM_2X_ARCHS ${SCALED_MM_3X_ARCHS}) if (SCALED_MM_2X_ARCHS) @@ -409,17 +437,17 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") # 2:4 Sparse Kernels # The 2:4 sparse kernels cutlass_scaled_sparse_mm and cutlass_compressor - # require CUDA 12.2 or later (and only work on Hopper, 9.0a for now). - if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.2 AND SCALED_MM_3X_ARCHS) + # require CUDA 12.2 or later (and only work on Hopper and Blackwell). + if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.2 AND SCALED_MM_ARCHS) set(SRCS "csrc/sparse/cutlass/sparse_scaled_mm_c3x.cu") set_gencode_flags_for_srcs( SRCS "${SRCS}" - CUDA_ARCHS "${SCALED_MM_3X_ARCHS}") + CUDA_ARCHS "${SCALED_MM_ARCHS}") list(APPEND VLLM_EXT_SRC "${SRCS}") list(APPEND VLLM_GPU_FLAGS "-DENABLE_SPARSE_SCALED_MM_C3X=1") - message(STATUS "Building sparse_scaled_mm_c3x for archs: ${SCALED_MM_3X_ARCHS}") + message(STATUS "Building sparse_scaled_mm_c3x for archs: ${SCALED_MM_ARCHS}") else() - if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.2 AND SCALED_MM_3X_ARCHS) + if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.2 AND SCALED_MM_ARCHS) message(STATUS "Not building sparse_scaled_mm_c3x kernels as CUDA Compiler version is " "not >= 12.2, we recommend upgrading to CUDA 12.2 or later " "if you intend on running FP8 sparse quantized models on Hopper.") @@ -434,8 +462,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.8 AND FP4_ARCHS) set(SRCS "csrc/quantization/fp4/nvfp4_quant_kernels.cu" - "csrc/quantization/fp4/nvfp4_scaled_mm_kernels.cu" - ) + "csrc/quantization/fp4/nvfp4_scaled_mm_kernels.cu") set_gencode_flags_for_srcs( SRCS "${SRCS}" CUDA_ARCHS "${FP4_ARCHS}") @@ -534,6 +561,7 @@ define_gpu_extension_target( COMPILE_FLAGS ${VLLM_GPU_FLAGS} ARCHITECTURES ${VLLM_GPU_ARCHES} INCLUDE_DIRECTORIES ${CUTLASS_INCLUDE_DIR} + INCLUDE_DIRECTORIES ${CUTLASS_TOOLS_UTIL_INCLUDE_DIR} USE_SABI 3 WITH_SOABI) @@ -557,7 +585,7 @@ set_gencode_flags_for_srcs( CUDA_ARCHS "${CUDA_ARCHS}") if(VLLM_GPU_LANG STREQUAL "CUDA") - cuda_archs_loose_intersection(MARLIN_MOE_ARCHS "8.0;8.6;8.7;8.9;9.0" "${CUDA_ARCHS}") + cuda_archs_loose_intersection(MARLIN_MOE_ARCHS "8.0;8.6;8.7;8.9;9.0;10.0;10.1;12.0" "${CUDA_ARCHS}") if (MARLIN_MOE_ARCHS) set(MARLIN_MOE_SRC "csrc/moe/marlin_kernels/marlin_moe_kernel.h" diff --git a/Dockerfile b/Dockerfile index 63314b906f15..ff4a0839f6e0 100644 --- a/Dockerfile +++ b/Dockerfile @@ -14,22 +14,21 @@ ARG PYTHON_VERSION=3.12 ARG TARGETPLATFORM ENV DEBIAN_FRONTEND=noninteractive -# Install Python and other dependencies -RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \ - && echo 'tzdata tzdata/Zones/America select Los_Angeles' | debconf-set-selections \ - && apt-get update -y \ - && apt-get install -y ccache software-properties-common git curl sudo \ - && add-apt-repository ppa:deadsnakes/ppa \ - && apt-get update -y \ - && apt-get install -y python${PYTHON_VERSION} python${PYTHON_VERSION}-dev python${PYTHON_VERSION}-venv \ - && update-alternatives --install /usr/bin/python3 python3 /usr/bin/python${PYTHON_VERSION} 1 \ - && update-alternatives --set python3 /usr/bin/python${PYTHON_VERSION} \ - && ln -sf /usr/bin/python${PYTHON_VERSION}-config /usr/bin/python3-config \ - && curl -sS https://bootstrap.pypa.io/get-pip.py | 
python${PYTHON_VERSION} \ - && python3 --version && python3 -m pip --version -# Install uv for faster pip installs -RUN --mount=type=cache,target=/root/.cache/uv \ - python3 -m pip install uv +# Install minimal dependencies and uv +RUN apt-get update -y \ + && apt-get install -y ccache git curl wget sudo \ + && curl -LsSf https://astral.sh/uv/install.sh | sh + +# Add uv to PATH +ENV PATH="/root/.local/bin:$PATH" +# Create venv with specified Python and activate by placing at the front of path +ENV VIRTUAL_ENV="/opt/venv" +RUN uv venv --python ${PYTHON_VERSION} --seed ${VIRTUAL_ENV} +ENV PATH="$VIRTUAL_ENV/bin:$PATH" + +# This timeout (in seconds) is necessary when installing some dependencies via uv since it's likely to time out +# Reference: https://github.com/astral-sh/uv/pull/1694 +ENV UV_HTTP_TIMEOUT=500 # Upgrade to GCC 10 to avoid https://gcc.gnu.org/bugzilla/show_bug.cgi?id=92519 # as it was causing spam when compiling the CUTLASS kernels @@ -47,21 +46,19 @@ RUN ldconfig /usr/local/cuda-$(echo $CUDA_VERSION | cut -d. -f1,2)/compat/ WORKDIR /workspace -# install build and runtime dependencies - # arm64 (GH200) build follows the practice of "use existing pytorch" build, # we need to install torch and torchvision from the nightly builds first, # pytorch will not appear as a vLLM dependency in all of the following steps # after this step RUN --mount=type=cache,target=/root/.cache/uv \ if [ "$TARGETPLATFORM" = "linux/arm64" ]; then \ - uv pip install --system --index-url https://download.pytorch.org/whl/nightly/cu126 "torch==2.7.0.dev20250121+cu126" "torchvision==0.22.0.dev20250121"; \ + uv pip install --index-url https://download.pytorch.org/whl/nightly/cu126 "torch==2.7.0.dev20250121+cu126" "torchvision==0.22.0.dev20250121"; \ fi -COPY requirements-common.txt requirements-common.txt -COPY requirements-cuda.txt requirements-cuda.txt +COPY requirements/common.txt requirements/common.txt +COPY requirements/cuda.txt requirements/cuda.txt RUN --mount=type=cache,target=/root/.cache/uv \ - uv pip install --system -r requirements-cuda.txt + uv pip install -r requirements/cuda.txt # cuda arch list used by torch # can be useful for both `dev` and `test` @@ -79,15 +76,19 @@ FROM base AS build ARG TARGETPLATFORM # install build dependencies -COPY requirements-build.txt requirements-build.txt +COPY requirements/build.txt requirements/build.txt + +# This timeout (in seconds) is necessary when installing some dependencies via uv since it's likely to time out +# Reference: https://github.com/astral-sh/uv/pull/1694 +ENV UV_HTTP_TIMEOUT=500 RUN --mount=type=cache,target=/root/.cache/uv \ - uv pip install --system -r requirements-build.txt + uv pip install -r requirements/build.txt COPY . . 
ARG GIT_REPO_CHECK=0 RUN --mount=type=bind,source=.git,target=.git \ - if [ "$GIT_REPO_CHECK" != 0 ]; then bash tools/check_repo.sh ; fi + if [ "$GIT_REPO_CHECK" != "0" ]; then bash tools/check_repo.sh ; fi # max jobs used by Ninja to build extensions ARG max_jobs=2 @@ -124,6 +125,9 @@ RUN --mount=type=cache,target=/root/.cache/ccache \ --mount=type=cache,target=/root/.cache/uv \ --mount=type=bind,source=.git,target=.git \ if [ "$USE_SCCACHE" != "1" ]; then \ + # Clean any existing CMake artifacts + rm -rf .deps && \ + mkdir -p .deps && \ python3 setup.py bdist_wheel --dist-dir=dist --py-limited-api=cp38; \ fi @@ -143,11 +147,15 @@ RUN if [ "$RUN_WHEEL_CHECK" = "true" ]; then \ #################### DEV IMAGE #################### FROM base as dev -COPY requirements-lint.txt requirements-lint.txt -COPY requirements-test.txt requirements-test.txt -COPY requirements-dev.txt requirements-dev.txt +# This timeout (in seconds) is necessary when installing some dependencies via uv since it's likely to time out +# Reference: https://github.com/astral-sh/uv/pull/1694 +ENV UV_HTTP_TIMEOUT=500 + +COPY requirements/lint.txt requirements/lint.txt +COPY requirements/test.txt requirements/test.txt +COPY requirements/dev.txt requirements/dev.txt RUN --mount=type=cache,target=/root/.cache/uv \ - uv pip install --system -r requirements-dev.txt + uv pip install -r requirements/dev.txt #################### DEV IMAGE #################### #################### vLLM installation IMAGE #################### @@ -163,23 +171,22 @@ ARG TARGETPLATFORM RUN PYTHON_VERSION_STR=$(echo ${PYTHON_VERSION} | sed 's/\.//g') && \ echo "export PYTHON_VERSION_STR=${PYTHON_VERSION_STR}" >> /etc/environment -# Install Python and other dependencies -RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \ - && echo 'tzdata tzdata/Zones/America select Los_Angeles' | debconf-set-selections \ - && apt-get update -y \ - && apt-get install -y ccache software-properties-common git curl wget sudo vim python3-pip \ - && apt-get install -y ffmpeg libsm6 libxext6 libgl1 \ - && add-apt-repository ppa:deadsnakes/ppa \ - && apt-get update -y \ - && apt-get install -y python${PYTHON_VERSION} python${PYTHON_VERSION}-dev python${PYTHON_VERSION}-venv libibverbs-dev \ - && update-alternatives --install /usr/bin/python3 python3 /usr/bin/python${PYTHON_VERSION} 1 \ - && update-alternatives --set python3 /usr/bin/python${PYTHON_VERSION} \ - && ln -sf /usr/bin/python${PYTHON_VERSION}-config /usr/bin/python3-config \ - && curl -sS https://bootstrap.pypa.io/get-pip.py | python${PYTHON_VERSION} \ - && python3 --version && python3 -m pip --version -# Install uv for faster pip installs -RUN --mount=type=cache,target=/root/.cache/uv \ - python3 -m pip install uv +# Install minimal dependencies and uv +RUN apt-get update -y \ + && apt-get install -y ccache git curl wget sudo vim \ + && apt-get install -y ffmpeg libsm6 libxext6 libgl1 libibverbs-dev \ + && curl -LsSf https://astral.sh/uv/install.sh | sh + +# Add uv to PATH +ENV PATH="/root/.local/bin:$PATH" +# Create venv with specified Python and activate by placing at the front of path +ENV VIRTUAL_ENV="/opt/venv" +RUN uv venv --python ${PYTHON_VERSION} --seed ${VIRTUAL_ENV} +ENV PATH="$VIRTUAL_ENV/bin:$PATH" + +# This timeout (in seconds) is necessary when installing some dependencies via uv since it's likely to time out +# Reference: https://github.com/astral-sh/uv/pull/1694 +ENV UV_HTTP_TIMEOUT=500 # Workaround for https://github.com/openai/triton/issues/2507 and # 
https://github.com/pytorch/pytorch/issues/107960 -- hopefully @@ -193,13 +200,13 @@ RUN ldconfig /usr/local/cuda-$(echo $CUDA_VERSION | cut -d. -f1,2)/compat/ # after this step RUN --mount=type=cache,target=/root/.cache/uv \ if [ "$TARGETPLATFORM" = "linux/arm64" ]; then \ - uv pip install --system --index-url https://download.pytorch.org/whl/nightly/cu124 "torch==2.6.0.dev20241210+cu124" "torchvision==0.22.0.dev20241215"; \ + uv pip install --index-url https://download.pytorch.org/whl/nightly/cu124 "torch==2.6.0.dev20241210+cu124" "torchvision==0.22.0.dev20241215"; \ fi # Install vllm wheel first, so that torch etc will be installed. RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/dist \ --mount=type=cache,target=/root/.cache/uv \ - uv pip install --system dist/*.whl --verbose + uv pip install dist/*.whl --verbose # If we need to build FlashInfer wheel before its release: # $ export FLASHINFER_ENABLE_AOT=1 @@ -214,9 +221,8 @@ RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/dist # $ # upload the wheel to a public location, e.g. https://wheels.vllm.ai/flashinfer/524304395bd1d8cd7d07db083859523fcaa246a4/flashinfer_python-0.2.1.post1+cu124torch2.5-cp38-abi3-linux_x86_64.whl RUN --mount=type=cache,target=/root/.cache/uv \ -. /etc/environment && \ if [ "$TARGETPLATFORM" != "linux/arm64" ]; then \ - uv pip install --system https://github.com/flashinfer-ai/flashinfer/releases/download/v0.2.1.post1/flashinfer_python-0.2.1.post1+cu124torch2.5-cp38-abi3-linux_x86_64.whl ; \ + uv pip install https://github.com/flashinfer-ai/flashinfer/releases/download/v0.2.1.post1/flashinfer_python-0.2.1.post1+cu124torch2.5-cp38-abi3-linux_x86_64.whl ; \ fi COPY examples examples @@ -224,9 +230,9 @@ COPY examples examples # some issues w.r.t. JIT compilation. Therefore we need to # install build dependencies for JIT compilation. # TODO: Remove this once FlashInfer AOT wheel is fixed -COPY requirements-build.txt requirements-build.txt +COPY requirements/build.txt requirements/build.txt RUN --mount=type=cache,target=/root/.cache/uv \ - uv pip install --system -r requirements-build.txt + uv pip install -r requirements/build.txt #################### vLLM installation IMAGE #################### @@ -237,17 +243,21 @@ FROM vllm-base AS test ADD . 
/vllm-workspace/ +# This timeout (in seconds) is necessary when installing some dependencies via uv since it's likely to time out +# Reference: https://github.com/astral-sh/uv/pull/1694 +ENV UV_HTTP_TIMEOUT=500 + # install development dependencies (for testing) RUN --mount=type=cache,target=/root/.cache/uv \ - uv pip install --system -r requirements-dev.txt + uv pip install -r requirements/dev.txt # install development dependencies (for testing) RUN --mount=type=cache,target=/root/.cache/uv \ - uv pip install --system -e tests/vllm_test_utils + uv pip install -e tests/vllm_test_utils # enable fast downloads from hf (for testing) RUN --mount=type=cache,target=/root/.cache/uv \ - uv pip install --system hf_transfer + uv pip install hf_transfer ENV HF_HUB_ENABLE_HF_TRANSFER 1 # Copy in the v1 package for testing (it isn't distributed yet) @@ -265,12 +275,16 @@ RUN mv vllm test_docs/ # base openai image with additional requirements, for any subsequent openai-style images FROM vllm-base AS vllm-openai-base +# This timeout (in seconds) is necessary when installing some dependencies via uv since it's likely to time out +# Reference: https://github.com/astral-sh/uv/pull/1694 +ENV UV_HTTP_TIMEOUT=500 + # install additional dependencies for openai api server RUN --mount=type=cache,target=/root/.cache/uv \ if [ "$TARGETPLATFORM" = "linux/arm64" ]; then \ - uv pip install --system accelerate hf_transfer 'modelscope!=1.15.0' 'bitsandbytes>=0.42.0' 'timm==0.9.10' boto3 runai-model-streamer runai-model-streamer[s3]; \ + uv pip install accelerate hf_transfer 'modelscope!=1.15.0' 'bitsandbytes>=0.42.0' 'timm==0.9.10' boto3 runai-model-streamer runai-model-streamer[s3]; \ else \ - uv pip install --system accelerate hf_transfer 'modelscope!=1.15.0' 'bitsandbytes>=0.45.0' 'timm==0.9.10' boto3 runai-model-streamer runai-model-streamer[s3]; \ + uv pip install accelerate hf_transfer 'modelscope!=1.15.0' 'bitsandbytes>=0.45.0' 'timm==0.9.10' boto3 runai-model-streamer runai-model-streamer[s3]; \ fi ENV VLLM_USAGE_SOURCE production-docker-image diff --git a/Dockerfile.arm b/Dockerfile.arm index 093ee2209222..bad093684239 100644 --- a/Dockerfile.arm +++ b/Dockerfile.arm @@ -26,18 +26,18 @@ WORKDIR /workspace ARG PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu" ENV PIP_EXTRA_INDEX_URL=${PIP_EXTRA_INDEX_URL} RUN --mount=type=cache,target=/root/.cache/pip \ - --mount=type=bind,src=requirements-build.txt,target=requirements-build.txt \ + --mount=type=bind,src=requirements/build.txt,target=requirements/build.txt \ pip install --upgrade pip && \ - pip install -r requirements-build.txt + pip install -r requirements/build.txt FROM cpu-test-arm AS build WORKDIR /workspace/vllm RUN --mount=type=cache,target=/root/.cache/pip \ - --mount=type=bind,src=requirements-common.txt,target=requirements-common.txt \ - --mount=type=bind,src=requirements-cpu.txt,target=requirements-cpu.txt \ - pip install -v -r requirements-cpu.txt + --mount=type=bind,src=requirements/common.txt,target=requirements/common.txt \ + --mount=type=bind,src=requirements/cpu.txt,target=requirements/cpu.txt \ + pip install -v -r requirements/cpu.txt COPY . . 
ARG GIT_REPO_CHECK=0 diff --git a/Dockerfile.cpu b/Dockerfile.cpu index ebe226cf6d14..08a4e188f4c1 100644 --- a/Dockerfile.cpu +++ b/Dockerfile.cpu @@ -29,18 +29,18 @@ WORKDIR /workspace ARG PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu" ENV PIP_EXTRA_INDEX_URL=${PIP_EXTRA_INDEX_URL} RUN --mount=type=cache,target=/root/.cache/pip \ - --mount=type=bind,src=requirements-build.txt,target=requirements-build.txt \ + --mount=type=bind,src=requirements/build.txt,target=requirements/build.txt \ pip install --upgrade pip && \ - pip install -r requirements-build.txt + pip install -r requirements/build.txt FROM cpu-test-1 AS build WORKDIR /workspace/vllm RUN --mount=type=cache,target=/root/.cache/pip \ - --mount=type=bind,src=requirements-common.txt,target=requirements-common.txt \ - --mount=type=bind,src=requirements-cpu.txt,target=requirements-cpu.txt \ - pip install -v -r requirements-cpu.txt + --mount=type=bind,src=requirements/common.txt,target=requirements/common.txt \ + --mount=type=bind,src=requirements/cpu.txt,target=requirements/cpu.txt \ + pip install -v -r requirements/cpu.txt COPY . . ARG GIT_REPO_CHECK=0 diff --git a/Dockerfile.hpu b/Dockerfile.hpu index 66cf68c32f2c..48211c88f872 100644 --- a/Dockerfile.hpu +++ b/Dockerfile.hpu @@ -4,7 +4,7 @@ COPY ./ /workspace/vllm WORKDIR /workspace/vllm -RUN pip install -v -r requirements-hpu.txt +RUN pip install -v -r requirements/hpu.txt ENV no_proxy=localhost,127.0.0.1 ENV PT_HPU_ENABLE_LAZY_COLLECTIVES=true diff --git a/Dockerfile.neuron b/Dockerfile.neuron index 27658d836d98..067645906366 100644 --- a/Dockerfile.neuron +++ b/Dockerfile.neuron @@ -36,7 +36,7 @@ RUN --mount=type=bind,source=.git,target=.git \ RUN python3 -m pip install -U \ 'cmake>=3.26' ninja packaging 'setuptools-scm>=8' wheel jinja2 \ - -r requirements-neuron.txt + -r requirements/neuron.txt ENV VLLM_TARGET_DEVICE neuron RUN --mount=type=bind,source=.git,target=.git \ diff --git a/Dockerfile.openvino b/Dockerfile.openvino index 32bcbfa9cc16..445c70ab89d4 100644 --- a/Dockerfile.openvino +++ b/Dockerfile.openvino @@ -16,7 +16,7 @@ RUN --mount=type=bind,source=.git,target=.git \ RUN python3 -m pip install -U pip # install build requirements -RUN PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu" python3 -m pip install -r /workspace/requirements-build.txt +RUN PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu" python3 -m pip install -r /workspace/requirements/build.txt # build vLLM with OpenVINO backend RUN PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu" VLLM_TARGET_DEVICE="openvino" python3 -m pip install /workspace diff --git a/Dockerfile.ppc64le b/Dockerfile.ppc64le index c4c1f3e35797..c5ca20d76e3e 100644 --- a/Dockerfile.ppc64le +++ b/Dockerfile.ppc64le @@ -6,7 +6,7 @@ ENV PATH="/usr/local/cargo/bin:$PATH:/opt/conda/bin/" RUN apt-get update -y && apt-get install -y git wget kmod curl vim libnuma-dev libsndfile-dev libprotobuf-dev build-essential ffmpeg libsm6 libxext6 libgl1 libssl-dev -# Some packages in requirements-cpu are installed here +# Some packages in requirements/cpu are installed here # IBM provides optimized packages for ppc64le processors in the open-ce project for mamba # Currently these may not be available for venv or pip directly RUN micromamba install -y -n base -c https://ftp.osuosl.org/pub/open-ce/1.11.0-p10/ -c defaults python=3.10 rust && micromamba clean --all --yes @@ -21,7 +21,7 @@ RUN --mount=type=bind,source=.git,target=.git \ RUN --mount=type=cache,target=/root/.cache/pip \ RUSTFLAGS='-L /opt/conda/lib' pip 
install -v --prefer-binary --extra-index-url https://repo.fury.io/mgiessing \ 'cmake>=3.26' ninja packaging 'setuptools-scm>=8' wheel jinja2 \ - -r requirements-cpu.txt \ + -r requirements/cpu.txt \ xformers uvloop==0.20.0 RUN --mount=type=bind,source=.git,target=.git \ diff --git a/Dockerfile.rocm b/Dockerfile.rocm index 009e929ebace..d2c2d3b14678 100644 --- a/Dockerfile.rocm +++ b/Dockerfile.rocm @@ -38,14 +38,14 @@ FROM fetch_vllm AS build_vllm ARG USE_CYTHON # Build vLLM RUN cd vllm \ - && python3 -m pip install -r requirements-rocm.txt \ + && python3 -m pip install -r requirements/rocm.txt \ && python3 setup.py clean --all \ && if [ ${USE_CYTHON} -eq "1" ]; then python3 setup_cython.py build_ext --inplace; fi \ && python3 setup.py bdist_wheel --dist-dir=dist FROM scratch AS export_vllm ARG COMMON_WORKDIR COPY --from=build_vllm ${COMMON_WORKDIR}/vllm/dist/*.whl / -COPY --from=build_vllm ${COMMON_WORKDIR}/vllm/requirements*.txt / +COPY --from=build_vllm ${COMMON_WORKDIR}/vllm/requirements /requirements COPY --from=build_vllm ${COMMON_WORKDIR}/vllm/benchmarks /benchmarks COPY --from=build_vllm ${COMMON_WORKDIR}/vllm/tests /tests COPY --from=build_vllm ${COMMON_WORKDIR}/vllm/examples /examples @@ -60,7 +60,7 @@ RUN python3 -m pip install --upgrade pip && rm -rf /var/lib/apt/lists/* # Install vLLM RUN --mount=type=bind,from=export_vllm,src=/,target=/install \ cd /install \ - && pip install -U -r requirements-rocm.txt \ + && pip install -U -r requirements/rocm.txt \ && pip uninstall -y vllm \ && pip install *.whl @@ -99,7 +99,7 @@ RUN if [ ${BUILD_RPD} -eq "1" ]; then \ # Install vLLM RUN --mount=type=bind,from=export_vllm,src=/,target=/install \ cd /install \ - && pip install -U -r requirements-rocm.txt \ + && pip install -U -r requirements/rocm.txt \ && pip uninstall -y vllm \ && pip install *.whl diff --git a/Dockerfile.s390x b/Dockerfile.s390x new file mode 100644 index 000000000000..5a84dc12d8f7 --- /dev/null +++ b/Dockerfile.s390x @@ -0,0 +1,152 @@ +# Base UBI image for s390x architecture +ARG BASE_UBI_IMAGE_TAG=9.5-1736404155 +ARG PYTHON_VERSION=3.12 +FROM registry.access.redhat.com/ubi9/ubi-minimal:${BASE_UBI_IMAGE_TAG} AS base + +# Install basic dependencies +ARG PYTHON_VERSION +ENV PYTHON_VERSION=${PYTHON_VERSION} + +WORKDIR /workspace + +ENV LANG=C.UTF-8 \ + LC_ALL=C.UTF-8 + +# Install development utilities +RUN microdnf install -y \ + which procps findutils tar vim git gcc gcc-gfortran g++ make patch zlib-devel \ + libjpeg-turbo-devel libtiff-devel libpng-devel libwebp-devel freetype-devel harfbuzz-devel \ + openssl-devel openblas openblas-devel autoconf automake libtool cmake && \ + microdnf clean all + +# Python Installation +FROM base AS python-install +ARG PYTHON_VERSION + +ENV VIRTUAL_ENV=/opt/vllm +ENV PATH="$VIRTUAL_ENV/bin:$PATH" +ENV PYTHON_VERSION=${PYTHON_VERSION} +RUN microdnf install -y \ + python${PYTHON_VERSION}-devel python${PYTHON_VERSION}-pip python${PYTHON_VERSION}-wheel && \ + python${PYTHON_VERSION} -m venv $VIRTUAL_ENV && pip install --no-cache -U pip wheel uv && microdnf clean all + +FROM python-install AS pyarrow + +# Build Apache Arrow +WORKDIR /tmp +RUN --mount=type=cache,target=/root/.cache/uv \ + git clone https://github.com/apache/arrow.git && \ + cd arrow/cpp && \ + mkdir release && cd release && \ + cmake -DCMAKE_BUILD_TYPE=Release \ + -DCMAKE_INSTALL_PREFIX=/usr/local \ + -DARROW_PYTHON=ON \ + -DARROW_PARQUET=ON \ + -DARROW_ORC=ON \ + -DARROW_FILESYSTEM=ON \ + -DARROW_WITH_LZ4=ON \ + -DARROW_WITH_ZSTD=ON \ + -DARROW_WITH_SNAPPY=ON \ + 
-DARROW_JSON=ON \ + -DARROW_CSV=ON \ + -DARROW_DATASET=ON \ + -DPROTOBUF_PROTOC_EXECUTABLE=/usr/bin/protoc \ + -DARROW_DEPENDENCY_SOURCE=BUNDLED \ + .. && \ + make -j$(nproc) && \ + make install && \ + cd ../../python && \ + export PYARROW_PARALLEL=4 && \ + export ARROW_BUILD_TYPE=release && \ + uv pip install -r requirements/build.txt && \ + python setup.py build_ext --build-type=$ARROW_BUILD_TYPE --bundle-arrow-cpp bdist_wheel + +FROM python-install AS numa-build +# Install numactl (needed for numa.h dependency) +WORKDIR /tmp +RUN curl -LO https://github.com/numactl/numactl/archive/refs/tags/v2.0.16.tar.gz && \ + tar -xvzf v2.0.16.tar.gz && \ + cd numactl-2.0.16 && \ + ./autogen.sh && \ + ./configure && \ + make + +# Set include path +ENV C_INCLUDE_PATH="/usr/local/include:$C_INCLUDE_PATH" + +FROM python-install AS rust +ENV CARGO_HOME=/root/.cargo +ENV RUSTUP_HOME=/root/.rustup +ENV PATH="$CARGO_HOME/bin:$RUSTUP_HOME/bin:$PATH" + +RUN curl https://sh.rustup.rs -sSf | sh -s -- -y && \ + . "$CARGO_HOME/env" && \ + rustup default stable && \ + rustup show + +FROM python-install AS torch-vision +# Install torchvision +ARG TORCH_VERSION=2.7.0.dev20250304 +ARG TORCH_VISION_VERSION=v0.20.1 +WORKDIR /tmp +RUN --mount=type=cache,target=/root/.cache/uv \ + git clone https://github.com/pytorch/vision.git && \ + cd vision && \ + git checkout $TORCH_VISION_VERSION && \ + uv pip install -v torch==${TORCH_VERSION} --extra-index-url https://download.pytorch.org/whl/nightly/cpu && \ + python setup.py bdist_wheel + +# Final build stage +FROM python-install AS vllm-cpu +ARG PYTHON_VERSION + +# Set correct library path for torch and numactl +ENV LD_LIBRARY_PATH="/opt/vllm/lib64/python${PYTHON_VERSION}/site-packages/torch/lib:/usr/local/lib:$LD_LIBRARY_PATH" +ENV C_INCLUDE_PATH="/usr/local/include:$C_INCLUDE_PATH" +ENV UV_LINK_MODE=copy +ENV CARGO_HOME=/root/.cargo +ENV RUSTUP_HOME=/root/.rustup +ENV PATH="$CARGO_HOME/bin:$RUSTUP_HOME/bin:$PATH" + +COPY . 
/workspace/vllm +WORKDIR /workspace/vllm + +RUN --mount=type=bind,from=numa-build,src=/tmp/numactl-2.0.16,target=/numactl \ + make -C /numactl install + +# Install dependencies, including PyTorch and Apache Arrow +RUN --mount=type=cache,target=/root/.cache/uv \ + --mount=type=bind,from=rust,source=/root/.cargo,target=/root/.cargo,rw \ + --mount=type=bind,from=rust,source=/root/.rustup,target=/root/.rustup,rw \ + --mount=type=bind,from=pyarrow,source=/tmp/arrow/python/dist,target=/tmp/arrow-wheels \ + --mount=type=bind,from=torch-vision,source=/tmp/vision/dist,target=/tmp/vision-wheels/ \ + sed -i '/^torch/d' requirements/build.txt && \ + ARROW_WHL_FILE=$(ls /tmp/arrow-wheels/pyarrow-*.whl | head -n 1) && \ + VISION_WHL_FILE=$(ls /tmp/vision-wheels/*.whl | head -n 1) && \ + uv pip install -v \ + $ARROW_WHL_FILE \ + $VISION_WHL_FILE \ + --extra-index-url https://download.pytorch.org/whl/nightly/cpu \ + --index-strategy unsafe-best-match \ + -r requirements/build.txt \ + -r requirements/cpu.txt + +# Build and install vllm +RUN --mount=type=cache,target=/root/.cache/uv \ + VLLM_TARGET_DEVICE=cpu python setup.py bdist_wheel && \ + uv pip install "$(echo dist/*.whl)[tensorizer]" + +# setup non-root user for vllm +RUN umask 002 && \ + useradd --uid 2000 --gid 0 vllm && \ + mkdir -p /home/vllm && \ + chmod g+rwx /home/vllm + +COPY LICENSE /licenses/vllm.md +COPY examples/*.jinja /app/data/template/ + +USER 2000 +WORKDIR /home/vllm + +# Set the default entrypoint +ENTRYPOINT ["python", "-m", "vllm.entrypoints.openai.api_server"] \ No newline at end of file diff --git a/Dockerfile.tpu b/Dockerfile.tpu index e268b3947666..960dc8e9ed9b 100644 --- a/Dockerfile.tpu +++ b/Dockerfile.tpu @@ -19,7 +19,7 @@ ENV VLLM_TARGET_DEVICE="tpu" RUN --mount=type=cache,target=/root/.cache/pip \ --mount=type=bind,source=.git,target=.git \ python3 -m pip install \ - -r requirements-tpu.txt + -r requirements/tpu.txt RUN python3 setup.py develop # install development dependencies (for testing) diff --git a/Dockerfile.xpu b/Dockerfile.xpu index a374f20d7d94..530809bcd4df 100644 --- a/Dockerfile.xpu +++ b/Dockerfile.xpu @@ -25,12 +25,12 @@ RUN apt-get update -y && \ wget WORKDIR /workspace/vllm -COPY requirements-xpu.txt /workspace/vllm/requirements-xpu.txt -COPY requirements-common.txt /workspace/vllm/requirements-common.txt +COPY requirements/xpu.txt /workspace/vllm/requirements/xpu.txt +COPY requirements/common.txt /workspace/vllm/requirements/common.txt RUN --mount=type=cache,target=/root/.cache/pip \ pip install --no-cache-dir \ - -r requirements-xpu.txt + -r requirements/xpu.txt RUN git clone https://github.com/intel/pti-gpu && \ cd pti-gpu/sdk && \ diff --git a/MANIFEST.in b/MANIFEST.in index 82be639ef4d7..82fd22b845f0 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -1,9 +1,9 @@ include LICENSE -include requirements-common.txt -include requirements-cuda.txt -include requirements-rocm.txt -include requirements-neuron.txt -include requirements-cpu.txt +include requirements/common.txt +include requirements/cuda.txt +include requirements/rocm.txt +include requirements/neuron.txt +include requirements/cpu.txt include CMakeLists.txt recursive-include cmake * diff --git a/README.md b/README.md index f22a1f9c5c80..49d6d525161d 100644 --- a/README.md +++ b/README.md @@ -15,12 +15,17 @@ Easy, fast, and cheap LLM serving for everyone --- -We are excited to invite you to our Menlo Park meetup with Meta, evening of Thursday, February 27! 
Meta engineers will discuss the improvements on top of vLLM, and vLLM contributors will share updates from the v0.7.x series of releases. [Register Now](https://lu.ma/h7g3kuj9) +We're excited to invite you to the first **vLLM China Meetup** on **March 16** in **Beijing**! + +Join us to connect with the **vLLM team** and explore how vLLM is leveraged in **post-training, fine-tuning, and deployment**, including [verl](https://github.com/volcengine/verl), [LLaMA-Factory](https://github.com/hiyouga/LLaMA-Factory), and [vllm-ascend](https://github.com/vllm-project/vllm-ascend). + +👉 **[Register Now](https://mp.weixin.qq.com/s/n77GibL2corAtQHtVEAzfg)** to be part of the discussion! --- *Latest News* 🔥 +- [2025/02] We hosted [the ninth vLLM meetup](https://lu.ma/h7g3kuj9) with Meta! Please find the meetup slides from vLLM team [here](https://docs.google.com/presentation/d/1jzC_PZVXrVNSFVCW-V4cFXb6pn7zZ2CyP_Flwo05aqg/edit?usp=sharing) and AMD [here](https://drive.google.com/file/d/1Zk5qEJIkTmlQ2eQcXQZlljAx3m9s7nwn/view?usp=sharing). The slides from Meta will not be posted. - [2025/01] We are excited to announce the alpha release of vLLM V1: A major architectural upgrade with 1.7x speedup! Clean code, optimized execution loop, zero-overhead prefix caching, enhanced multimodal support, and more. Please check out our blog post [here](https://blog.vllm.ai/2025/01/27/v1-alpha-release.html). - [2025/01] We hosted [the eighth vLLM meetup](https://lu.ma/zep56hui) with Google Cloud! Please find the meetup slides from vLLM team [here](https://docs.google.com/presentation/d/1epVkt4Zu8Jz_S5OhEHPc798emsYh2BwYfRuDDVEF7u4/edit?usp=sharing), and Google Cloud team [here](https://drive.google.com/file/d/1h24pHewANyRL11xy5dXUbvRC9F9Kkjix/view?usp=sharing). - [2024/12] vLLM joins [pytorch ecosystem](https://pytorch.org/blog/vllm-joins-pytorch)! Easy, Fast, and Cheap LLM Serving for Everyone! diff --git a/benchmarks/backend_request_func.py b/benchmarks/backend_request_func.py index 158705769b5e..d53428d219e7 100644 --- a/benchmarks/backend_request_func.py +++ b/benchmarks/backend_request_func.py @@ -27,7 +27,6 @@ class RequestFuncInput: output_len: int model: str model_name: Optional[str] = None - best_of: int = 1 logprobs: Optional[int] = None extra_body: Optional[dict] = None multi_modal_content: Optional[dict] = None @@ -58,7 +57,6 @@ async def async_request_tgi( async with aiohttp.ClientSession(trust_env=True, timeout=AIOHTTP_TIMEOUT) as session: params = { - "best_of": request_func_input.best_of, "max_new_tokens": request_func_input.output_len, "do_sample": True, "temperature": 0.01, # TGI does not accept 0.0 temperature. 
@@ -130,7 +128,6 @@ async def async_request_trt_llm( async with aiohttp.ClientSession(trust_env=True, timeout=AIOHTTP_TIMEOUT) as session: - assert request_func_input.best_of == 1 payload = { "accumulate_tokens": True, "text_input": request_func_input.prompt, @@ -195,7 +192,6 @@ async def async_request_deepspeed_mii( ) -> RequestFuncOutput: async with aiohttp.ClientSession(trust_env=True, timeout=AIOHTTP_TIMEOUT) as session: - assert request_func_input.best_of == 1 payload = { "prompt": request_func_input.prompt, @@ -249,7 +245,6 @@ async def async_request_openai_completions( if request_func_input.model_name else request_func_input.model, "prompt": request_func_input.prompt, "temperature": 0.0, - "best_of": request_func_input.best_of, "max_tokens": request_func_input.output_len, "logprobs": request_func_input.logprobs, "stream": True, diff --git a/benchmarks/benchmark_dataset.py b/benchmarks/benchmark_dataset.py new file mode 100644 index 000000000000..30fffdda491d --- /dev/null +++ b/benchmarks/benchmark_dataset.py @@ -0,0 +1,667 @@ +# SPDX-License-Identifier: Apache-2.0 +""" +This module defines a framework for sampling benchmark requests from various +datasets. Each dataset subclass of BenchmarkDataset must implement sample +generation. Supported dataset types include: + - ShareGPT + - Random (synthetic) + - Sonnet + - BurstGPT + - HuggingFace + - VisionArena + +TODO: Implement CustomDataset to parse a JSON file and convert its contents into +SampleRequest instances, similar to the approach used in ShareGPT. +""" + +import base64 +import io +import json +import random +from abc import ABC, abstractmethod +from collections.abc import Mapping +from dataclasses import dataclass +from functools import cache +from typing import Any, Optional, Union + +import numpy as np +import pandas as pd +from datasets import load_dataset +from PIL import Image +from transformers import PreTrainedTokenizerBase + +from vllm.lora.request import LoRARequest +from vllm.lora.utils import get_adapter_absolute_path +from vllm.multimodal import MultiModalDataDict +from vllm.transformers_utils.tokenizer import AnyTokenizer, get_lora_tokenizer + +# ----------------------------------------------------------------------------- +# Data Classes +# ----------------------------------------------------------------------------- + + +@dataclass +class SampleRequest: + """ + Represents a single inference request for benchmarking. + """ + + prompt: str + prompt_len: int + expected_output_len: int + multi_modal_data: Optional[Union[MultiModalDataDict, dict]] = None + lora_request: Optional[LoRARequest] = None + + +# ----------------------------------------------------------------------------- +# Benchmark Dataset Base Class +# ----------------------------------------------------------------------------- + + +class BenchmarkDataset(ABC): + DEFAULT_SEED = 0 + + # num_requests has default 1000 in both the benchmark_serving.py and + # benchmark_throughput.py + + def __init__( + self, + dataset_path: Optional[str] = None, + random_seed: int = DEFAULT_SEED, + ) -> None: + """ + Initialize the BenchmarkDataset with an optional dataset path and random + seed. Args: + dataset_path (Optional[str]): Path to the dataset. If None, it + indicates that a default or random dataset might be used. + random_seed (int): Seed value for reproducible shuffling or + sampling. Defaults to DEFAULT_SEED. + """ + self.dataset_path = dataset_path + # Set the random seed, ensuring that a None value is replaced with the + # default seed. 
+ self.random_seed = (random_seed + if random_seed is not None else self.DEFAULT_SEED) + self.data = None + + def load_data(self) -> None: + """ + Load data from the dataset path into self.data. + + This method must be overridden by subclasses since the method to load + data will vary depending on the dataset format and source. + + Raises: + NotImplementedError: If a subclass does not implement this method. + """ + # TODO (jenniferzhao): add support for downloading data + raise NotImplementedError( + "load_data must be implemented in subclasses.") + + def get_random_lora_request( + self, + tokenizer: PreTrainedTokenizerBase, + max_loras: Optional[int] = None, + lora_path: Optional[str] = None, + ) -> tuple[Optional[LoRARequest], AnyTokenizer]: + """ + Optionally select a random LoRA request and return its associated + tokenizer. + + This method is used when LoRA parameters are provided. It randomly + selects a LoRA based on max_loras and retrieves a cached tokenizer for + that LoRA if available. Otherwise, it returns the base tokenizer. + + Args: + tokenizer (PreTrainedTokenizerBase): The base tokenizer to use if no + LoRA is selected. max_loras (Optional[int]): The maximum number of + LoRAs available. If None, LoRA is not used. lora_path + (Optional[str]): Path to the LoRA parameters on disk. If None, LoRA + is not used. + + Returns: + tuple[Optional[LoRARequest], AnyTokenizer]: A tuple where the first + element is a LoRARequest (or None if not applicable) and the second + element is the tokenizer associated with the LoRA request (or the + base tokenizer). + """ + if max_loras is None or lora_path is None: + return None, tokenizer + + # Generate a random LoRA ID in the range [1, max_loras]. + lora_id = random.randint(1, max_loras) + lora_request = LoRARequest( + lora_name=str(lora_id), + lora_int_id=lora_id, + lora_path=lora_path_on_disk(lora_path), + ) + if lora_id not in lora_tokenizer_cache: + lora_tokenizer_cache[lora_id] = get_lora_tokenizer(lora_request) + # Return lora_request and the cached tokenizer if available; otherwise, + # return the base tokenizer + return lora_request, lora_tokenizer_cache[lora_id] or tokenizer + + @abstractmethod + def sample(self, tokenizer: PreTrainedTokenizerBase, + num_requests: int) -> list[SampleRequest]: + """ + Abstract method to generate sample requests from the dataset. + + Subclasses must override this method to implement dataset-specific logic + for generating a list of SampleRequest objects. + + Args: + tokenizer (PreTrainedTokenizerBase): The tokenizer to be used + for processing the dataset's text. + num_requests (int): The number of sample requests to generate. + + Returns: + list[SampleRequest]: A list of sample requests generated from the + dataset. + """ + raise NotImplementedError("sample must be implemented in subclasses.") + + +# ----------------------------------------------------------------------------- +# Utility Functions and Global Caches +# ----------------------------------------------------------------------------- + + +def is_valid_sequence( + prompt_len: int, + output_len: int, + min_len: int = 4, + max_prompt_len: int = 1024, + max_total_len: int = 2048, + skip_min_output_len_check: bool = False, +) -> bool: + """ + Validate a sequence based on prompt and output lengths. + + Default pruning criteria are copied from the original `sample_hf_requests` + and `sample_sharegpt_requests` functions in benchmark_serving.py, as well as + from `sample_requests` in benchmark_throughput.py. 
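+
+    As a concrete illustration of the default limits (an example only, not an
+    exhaustive specification): a (prompt_len=512, output_len=64) pair passes;
+    a (prompt_len=2048, output_len=64) pair is rejected because the prompt
+    exceeds max_prompt_len (and the combined length exceeds max_total_len);
+    and a (prompt_len=512, output_len=2) pair is rejected for a too-short
+    output unless skip_min_output_len_check=True.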
+ """ + # Check for invalid conditions + prompt_too_short = prompt_len < min_len + output_too_short = (not skip_min_output_len_check) and (output_len + < min_len) + prompt_too_long = prompt_len > max_prompt_len + combined_too_long = (prompt_len + output_len) > max_total_len + + # Return True if none of the invalid conditions are met + return not (prompt_too_short or output_too_short or prompt_too_long + or combined_too_long) + + +@cache +def lora_path_on_disk(lora_path: str) -> str: + return get_adapter_absolute_path(lora_path) + + +# Global cache for LoRA tokenizers. +lora_tokenizer_cache: dict[int, AnyTokenizer] = {} + + +def process_image(image: Any) -> Mapping[str, Any]: + """ + Process a single image input and return a multimedia content dictionary. + + For a PIL.Image.Image input: + - Converts the image to RGB. + - Saves the image as a JPEG in-memory. + - Encodes the JPEG data as a base64 string. + - Returns a dictionary with the image as a base64 data URL. + + For a string input: + - Treats the string as a URL or file path. + - Prepends "file://" if the string doesn't start with "http://" or + "file://". + - Returns a dictionary with the image URL. + + Raises: + ValueError: If the input is neither a PIL.Image.Image nor a string. + """ + if isinstance(image, Image.Image): + image = image.convert("RGB") + with io.BytesIO() as image_data: + image.save(image_data, format="JPEG") + image_base64 = base64.b64encode( + image_data.getvalue()).decode("utf-8") + return { + "type": "image_url", + "image_url": { + "url": f"data:image/jpeg;base64,{image_base64}" + }, + } + + if isinstance(image, str): + image_url = (image if image.startswith( + ("http://", "file://")) else f"file://{image}") + return {"type": "image_url", "image_url": {"url": image_url}} + + raise ValueError( + f"Invalid image input {image}. Must be a PIL.Image.Image or str.") + + +# ----------------------------------------------------------------------------- +# Random Dataset Implementation (Synthetic Data) +# ----------------------------------------------------------------------------- + + +class RandomDataset(BenchmarkDataset): + # Default values copied from benchmark_serving.py for the random dataset. 
+ DEFAULT_PREFIX_LEN = 0 + DEFAULT_RANGE_RATIO = 1.0 + DEFAULT_INPUT_LEN = 1024 + DEFAULT_OUTPUT_LEN = 128 + + def __init__( + self, + **kwargs, + ) -> None: + super().__init__(**kwargs) + + def sample(self, + tokenizer: PreTrainedTokenizerBase, + num_requests: int, + prefix_len: int = DEFAULT_PREFIX_LEN, + range_ratio: float = DEFAULT_RANGE_RATIO, + input_len: int = DEFAULT_INPUT_LEN, + output_len: int = DEFAULT_OUTPUT_LEN, + **kwargs) -> list[SampleRequest]: + + vocab_size = tokenizer.vocab_size + + prefix_token_ids = (np.random.randint( + 0, vocab_size, size=prefix_len).tolist() if prefix_len > 0 else []) + + input_low = int(input_len * range_ratio) + output_low = int(output_len * range_ratio) + + input_lens = np.random.randint(input_low, + input_len + 1, + size=num_requests) + output_lens = np.random.randint(output_low, + output_len + 1, + size=num_requests) + offsets = np.random.randint(0, vocab_size, size=num_requests) + + requests = [] + for i in range(num_requests): + inner_seq = ((offsets[i] + i + np.arange(input_lens[i])) % + vocab_size).tolist() + token_sequence = prefix_token_ids + inner_seq + prompt = tokenizer.decode(token_sequence) + total_input_len = prefix_len + int(input_lens[i]) + requests.append( + SampleRequest( + prompt=prompt, + prompt_len=total_input_len, + expected_output_len=int(output_lens[i]), + )) + return requests + + +# ----------------------------------------------------------------------------- +# ShareGPT Dataset Implementation +# ----------------------------------------------------------------------------- + + +class ShareGPTDataset(BenchmarkDataset): + """ + Implements the ShareGPT dataset. Loads data from a JSON file and generates + sample requests based on conversation turns. + """ + + def __init__(self, **kwargs) -> None: + super().__init__(**kwargs) + self.load_data() + + def load_data(self) -> None: + if self.dataset_path is None: + raise ValueError("dataset_path must be provided for loading data.") + + with open(self.dataset_path, encoding="utf-8") as f: + self.data = json.load(f) + # Filter entries with at least two conversation turns. 
+ self.data = [ + entry for entry in self.data + if "conversations" in entry and len(entry["conversations"]) >= 2 + ] + random.seed(self.random_seed) + random.shuffle(self.data) + + def sample(self, + tokenizer: PreTrainedTokenizerBase, + num_requests: int, + lora_path: Optional[str] = None, + max_loras: Optional[int] = None, + output_len: Optional[int] = None, + **kwargs) -> list: + samples: list = [] + for entry in self.data: + if len(samples) >= num_requests: + break + prompt, completion = entry["conversations"][0]["value"],\ + entry["conversations"][1]["value"] + + lora_request, tokenizer = self.get_random_lora_request( + tokenizer=tokenizer, max_loras=max_loras, lora_path=lora_path) + prompt_ids = tokenizer(prompt).input_ids + completion_ids = tokenizer(completion).input_ids + prompt_len = len(prompt_ids) + new_output_len = (len(completion_ids) + if output_len is None else output_len) + if not is_valid_sequence(prompt_len, + new_output_len, + skip_min_output_len_check=output_len + is not None): + continue + samples.append( + SampleRequest( + prompt=prompt, + prompt_len=prompt_len, + expected_output_len=new_output_len, + lora_request=lora_request, + )) + return samples + + +# ----------------------------------------------------------------------------- +# Sonnet Dataset Implementation +# ----------------------------------------------------------------------------- + + +class SonnetDataset(BenchmarkDataset): + """ + Simplified implementation of the Sonnet dataset. Loads poem lines from a + text file and generates sample requests. Default values here copied from + `benchmark_serving.py` for the sonnet dataset. + """ + + DEFAULT_PREFIX_LEN = 200 + DEFAULT_INPUT_LEN = 550 + DEFAULT_OUTPUT_LEN = 150 + + def __init__( + self, + **kwargs, + ) -> None: + super().__init__(**kwargs) + self.load_data() + + def load_data(self) -> None: + if not self.dataset_path: + raise ValueError("dataset_path must be provided.") + with open(self.dataset_path, encoding="utf-8") as f: + self.data = f.readlines() + + def sample(self, + tokenizer, + num_requests: int, + prefix_len: int = DEFAULT_PREFIX_LEN, + input_len: int = DEFAULT_INPUT_LEN, + output_len: int = DEFAULT_OUTPUT_LEN, + return_prompt_formatted: bool = False, + **kwargs) -> list: + # Calculate average token length for a poem line. + tokenized_lines = [tokenizer(line).input_ids for line in self.data] + avg_len = sum(len(tokens) + for tokens in \ + tokenized_lines) / len(tokenized_lines) + + # Build the base prompt. + base_prompt = "Pick as many lines as you can from these poem lines:\n" + base_msg = [{"role": "user", "content": base_prompt}] + base_fmt = tokenizer.apply_chat_template(base_msg, + add_generation_prompt=True, + tokenize=False) + base_offset = len(tokenizer(base_fmt).input_ids) + if input_len <= base_offset: + raise ValueError( + f"'input_len' must be higher than the base prompt length " + f"({base_offset}).") + + # Determine how many poem lines to use. 
+ num_input_lines = round((input_len - base_offset) / avg_len) + num_prefix_lines = round((prefix_len - base_offset) / avg_len) + prefix_lines = self.data[:num_prefix_lines] + + samples = [] + for _ in range(num_requests): + extra_lines = random.choices(self.data, + k=num_input_lines - num_prefix_lines) + prompt = f"{base_prompt}{''.join(prefix_lines + extra_lines)}" + msg = [{"role": "user", "content": prompt}] + prompt_formatted = tokenizer.apply_chat_template( + msg, add_generation_prompt=True, tokenize=False) + prompt_len = len(tokenizer(prompt_formatted).input_ids) + samples.append( + SampleRequest( + prompt=prompt_formatted + if return_prompt_formatted else prompt, + prompt_len=prompt_len, + expected_output_len=output_len, + )) + return samples + + +# ----------------------------------------------------------------------------- +# BurstGPT Dataset Implementation +# ----------------------------------------------------------------------------- + + +class BurstGPTDataset(BenchmarkDataset): + """ + Implements the BurstGPT dataset. Loads data from a CSV file and generates + sample requests based on synthetic prompt generation. Only rows with Model + "GPT-4" and positive response tokens are used. + """ + + def __init__(self, **kwargs) -> None: + super().__init__(**kwargs) + self.load_data() + + def load_data(self, ): + if self.dataset_path is None: + raise ValueError("dataset_path must be provided for loading data.") + + df = pd.read_csv(self.dataset_path) + # Filter to keep only GPT-4 rows. + gpt4_df = df[df["Model"] == "GPT-4"] + # Remove failed requests (where Response tokens is 0 or less). + gpt4_df = gpt4_df[gpt4_df["Response tokens"] > 0] + # Sample the desired number of rows. + self.data = gpt4_df + + def _sample_loaded_data(self, num_requests: int) -> list: + if num_requests <= len(self.data): + data = self.data.sample(n=num_requests, + random_state=self.random_seed) + else: + data = self.data.sample( + n=num_requests, + random_state=self.random_seed, + replace=True, + ) + # Convert the dataframe to a list of lists. + return data.values.tolist() + + def sample(self, + tokenizer: PreTrainedTokenizerBase, + num_requests: int, + max_loras: Optional[int] = None, + lora_path: Optional[str] = None, + **kwargs) -> list[SampleRequest]: + samples = [] + data = self._sample_loaded_data(num_requests=num_requests) + for i in range(num_requests): + input_len = int(data[i][2]) + output_len = int(data[i][3]) + lora_req, tokenizer = self.get_random_lora_request( + tokenizer=tokenizer, max_loras=max_loras, lora_path=lora_path) + vocab_size = tokenizer.vocab_size + # Generate a synthetic prompt: a list of token IDs computed as (i + + # j) modulo vocab_size. + token_ids = [(i + j) % vocab_size for j in range(input_len)] + prompt = tokenizer.decode(token_ids) + samples.append( + SampleRequest( + prompt=prompt, + prompt_len=input_len, + expected_output_len=output_len, + lora_request=lora_req, + )) + return samples + + +# ----------------------------------------------------------------------------- +# HuggingFace Dataset Implementation +# ----------------------------------------------------------------------------- + + +class HuggingFaceDataset(BenchmarkDataset): + """ + Dataset class for processing a HuggingFace dataset with conversation data + and optional images. 
+ """ + DEFAULT_NUM_REQUESTS = 1000 + + def __init__( + self, + dataset_split: str, + dataset_subset: Optional[str] = None, + **kwargs, + ) -> None: + super().__init__(**kwargs) + self.dataset_split = dataset_split + self.dataset_subset = dataset_subset + + self.load_data() + + def load_data(self) -> None: + if not self.dataset_path: + raise ValueError("dataset_path must be provided for loading data.") + + self.data = load_dataset( + self.dataset_path, + name=self.dataset_subset, + split=self.dataset_split, + streaming=True, + ) + + if "conversations" not in self.data.features: + raise ValueError("HF Dataset must have a 'conversations' column.") + + # Shuffle and filter examples with at least 2 conversations. + self.data = self.data.shuffle(seed=self.random_seed).filter( + lambda x: len(x["conversations"]) >= 2) + + def sample(self, + tokenizer: PreTrainedTokenizerBase, + num_requests: int, + lora_path: Optional[str] = None, + max_loras: Optional[int] = None, + output_len: Optional[int] = None, + **kwargs) -> list: + sampled_requests = [] + dynamic_output = output_len is None + + for item in self.data: + if len(sampled_requests) >= num_requests: + break + + conv = item["conversations"] + prompt, completion = conv[0]["value"], conv[1]["value"] + + lora_request, tokenizer = self.get_random_lora_request( + tokenizer, lora_path=lora_path, max_loras=max_loras) + + prompt_ids = tokenizer(prompt).input_ids + completion_ids = tokenizer(completion).input_ids + prompt_len = len(prompt_ids) + completion_len = len(completion_ids) + output_len = completion_len if dynamic_output else output_len + assert isinstance(output_len, int) and output_len > 0 + if dynamic_output and not is_valid_sequence( + prompt_len, completion_len): + continue + + mm_content = process_image( + item["image"]) if "image" in item else None + sampled_requests.append( + SampleRequest( + prompt=prompt, + prompt_len=prompt_len, + expected_output_len=output_len, + multi_modal_data=mm_content, + lora_request=lora_request, + )) + return sampled_requests + + +# ----------------------------------------------------------------------------- +# Vision Arena Dataset Implementation +# ----------------------------------------------------------------------------- + + +class VisionArenaDataset(BenchmarkDataset): + """ + Vision Arena Dataset. 
+ """ + + DEFAULT_OUTPUT_LEN = 128 + DEFAULT_NUM_REQUESTS = 1000 + VISION_ARENA_DATASET_PATH = "lmarena-ai/vision-arena-bench-v0.1" + + def __init__( + self, + dataset_split: str, + dataset_subset: Optional[str] = None, + **kwargs, + ) -> None: + super().__init__(**kwargs) + self.dataset_split = dataset_split + self.dataset_subset = dataset_subset + + if self.dataset_path != self.VISION_ARENA_DATASET_PATH: + raise ValueError(f"Only support Vision Arena dataset.\ + This data path {self.dataset_path} is not valid.") + if self.dataset_subset is None and self.dataset_split != "train": + raise ValueError("Dataset split must be 'train'.") + + self.load_data() + + def load_data(self) -> None: + dataset = load_dataset( + self.dataset_path, + name=self.dataset_subset, + split=self.dataset_split, + streaming=True, + ) + self.data = dataset.shuffle(seed=self.random_seed) + + def sample(self, + tokenizer: PreTrainedTokenizerBase, + num_requests: int, + output_len: int = DEFAULT_OUTPUT_LEN, + **kwargs) -> list: + # TODO (jenniferzhao): Add support for offline benchmark sampling + output_len = (output_len + if output_len is not None else self.DEFAULT_OUTPUT_LEN) + sampled_requests = [] + for item in self.data: + if len(sampled_requests) >= num_requests: + break + prompt = item["turns"][0][0]["content"] + prompt_len = len(tokenizer(prompt).input_ids) + mm_content = process_image(item["images"][0]) + sampled_requests.append( + SampleRequest( + prompt=prompt, + prompt_len=prompt_len, + expected_output_len=output_len, + multi_modal_data=mm_content, + )) + return sampled_requests diff --git a/benchmarks/benchmark_guided.py b/benchmarks/benchmark_guided.py deleted file mode 100644 index 2e0f6c6b5d20..000000000000 --- a/benchmarks/benchmark_guided.py +++ /dev/null @@ -1,507 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -"""Benchmark guided decoding throughput.""" -import argparse -import dataclasses -import json -import os -import random -import time - -import datasets -import pandas as pd -import uvloop -from transformers import AutoTokenizer, PreTrainedTokenizerBase - -from vllm.engine.arg_utils import AsyncEngineArgs, EngineArgs -from vllm.entrypoints.openai.api_server import ( - build_async_engine_client_from_engine_args) -from vllm.sampling_params import GuidedDecodingParams -from vllm.utils import FlexibleArgumentParser, merge_async_iterators - - -@dataclasses.dataclass -class SampleRequest: - """A class representing a single inference request for benchmarking. - - Attributes: - prompt: The input text prompt for the model. - multi_modal_data: Optional dictionary containing multi-modal data (e.g. - images). - prompt_len: The length of the prompt in tokens. - expected_output_len: The expected length of the output in tokens. - """ - prompt: str - prompt_len: int - expected_output_len: int - schema: dict - structure_type: str = 'json' - completion: str = None - - -def run_vllm(requests: list[SampleRequest], - engine_args: EngineArgs, - n: int, - guided_decoding_rate: float = 1.0, - warmup: bool = False) -> float: - from vllm import LLM, SamplingParams - llm = LLM(**vars(engine_args)) - assert all( - llm.llm_engine.model_config.max_model_len >= ( - request.prompt_len + request.expected_output_len) - for request in requests), ( - "Please ensure that max_model_len is greater than the sum of" - " prompt_len and expected_output_len for all requests.") - - # Add the requests to the engine. 
- prompts: list[str] = [] - sampling_params: list[SamplingParams] = [] - # create a list containing random selected true or false - guided_decoding_req_idx = random.sample( - range(len(requests)), int(len(requests) * guided_decoding_rate)) - - if warmup: - print(">>>>> Running warmup prompt, for the first 5") - # We setup the first 5 requests to warmup FSM - # if using xgrammar dataset, we will skip warmup - warmup_requests = requests[:5] - for i, request in enumerate(warmup_requests): - prompts.append(request.prompt) - sampling_params.append( - SamplingParams( - n=n, - temperature=1.0, - top_p=1.0, - ignore_eos=True, - max_tokens=request.expected_output_len, - guided_decoding=GuidedDecodingParams(json=request.schema) - if guided_decoding_rate > 0 else None, - )) - llm.generate(prompts, sampling_params, use_tqdm=False) - - print(">>>>> Benchmark started...") - prompts = [] - sampling_params = [] - for i, request in enumerate(requests): - prompts.append(request.prompt) - sampling_params.append( - SamplingParams( - n=n, - temperature=1.0, - top_p=1.0, - ignore_eos=True, - max_tokens=request.expected_output_len, - guided_decoding=GuidedDecodingParams( - **{request.structure_type: request.schema}) - if i in guided_decoding_req_idx else None, - )) - - start = time.perf_counter() - outputs = llm.generate(prompts, sampling_params, use_tqdm=False) - ret = [] - for output, request in zip(outputs, requests): - generated_text = output.outputs[0].text - ret.append({ - "generated": generated_text, - "expected": request.completion - }) - end = time.perf_counter() - return end - start, ret - - -async def run_vllm_async( - requests: list[SampleRequest], - engine_args: AsyncEngineArgs, - n: int, - guided_decoding_rate: float = 1.0, - warmup: bool = False, - disable_frontend_multiprocessing: bool = False) -> float: - from vllm import SamplingParams - - async with build_async_engine_client_from_engine_args( - engine_args, disable_frontend_multiprocessing) as llm: - - assert all( - llm.model_config.max_model_len >= (request.prompt_len + - request.expected_output_len) - for request in requests), ( - "Please ensure that max_model_len is greater than the sum of" - " prompt_len and expected_output_len for all requests.") - - # Add the requests to the engine. 
- prompts: list[str] = [] - sampling_params: list[SamplingParams] = [] - guided_decoding_req_idx = random.sample( - range(len(requests)), int(len(requests) * guided_decoding_rate)) - - if warmup: - print(">>>>>> Running warmup prompt, for the first 5") - # We setup the first 5 requests to warmup FSM - # if using xgrammar dataset, we will skip warmup - warmup_requests = requests[:5] - for i, request in enumerate(warmup_requests): - prompts.append(request.prompt) - sampling_params.append( - SamplingParams( - n=n, - temperature=1.0, - top_p=1.0, - ignore_eos=True, - max_tokens=request.expected_output_len, - guided_decoding=GuidedDecodingParams( - json=request.schema) - if guided_decoding_rate > 0 else None, - )) - generators = [] - for i, (prompt, sp) in enumerate(zip(prompts, sampling_params)): - generator = llm.generate(prompt, sp, request_id=f"test{i}") - generators.append(generator) - all_gens = merge_async_iterators(*generators) - async for i, res in all_gens: - pass - - print(">>>>> Benchmark started...") - prompts = [] - sampling_params = [] - for i, request in enumerate(requests): - prompts.append(request.prompt) - sampling_params.append( - SamplingParams( - n=n, - temperature=1.0, - top_p=1.0, - ignore_eos=True, - max_tokens=request.expected_output_len, - guided_decoding=GuidedDecodingParams(json=request.schema) - if i in guided_decoding_req_idx else None, - )) - - generators = [] - start_time = [] - latencies = [] - start = time.perf_counter() - for i, (prompt, sp) in enumerate(zip(prompts, sampling_params)): - generator = llm.generate(prompt, sp, request_id=f"test{i}") - generators.append(generator) - start_time.append(time.perf_counter()) - latencies.append([]) - all_gens = merge_async_iterators(*generators) - generated_texts = [''] * len(requests) - async for i, res in all_gens: - generated_texts[i] = res.outputs[0].text - lat = time.perf_counter() - start_time[i] - latencies[i].append(lat) - ret = [{ - 'generated': gt, - 'expected': req.completion - } for gt, req in zip(generated_texts, requests)] - end = time.perf_counter() - first_latency = pd.Series([lat[0] * 1000 for lat in latencies]) - next_latency = pd.Series([(lat[-1] - lat[0]) / len(lat[1:]) * 1000 - for lat in latencies]) - return end - start, ret, (first_latency, next_latency) - - -def sample_requests(tokenizer: PreTrainedTokenizerBase, - args: argparse.Namespace) -> list[SampleRequest]: - if args.dataset == 'json': - if args.json_schema_path is None: - dir_path = os.path.dirname(os.path.realpath(__file__)) - args.json_schema_path = os.path.join(dir_path, - "structured_schemas", - "structured_schema_1.json") - with open(args.json_schema_path) as f: - schema = json.load(f) - prompt = f"Generate an example of a user profile given the following schema: {json.dumps(schema)}" # noqa: E501 - input_len = len(tokenizer(prompt).input_ids) - print(f"Input length of the prompt: {input_len} tokens") - requests = [ - SampleRequest(prompt=prompt, - prompt_len=input_len, - expected_output_len=args.output_len, - schema=schema, - structure_type=args.structure_type) - for _ in range(args.num_prompts) - ] - - elif args.dataset == "grammar": - schema = """ - ?start: select_statement - - ?select_statement: "SELECT " column_list " FROM " table_name - - ?column_list: column_name ("," column_name)* - - ?table_name: identifier - - ?column_name: identifier - - ?identifier: /[a-zA-Z_][a-zA-Z0-9_]*/ - """ - prompt = "Generate an SQL query to show the 'username' \ - and 'email' from the 'users' table." 
- - input_len = len(tokenizer(prompt).input_ids) - print(f"Input length of the prompt: {input_len} tokens") - requests = [ - SampleRequest(prompt=prompt, - prompt_len=input_len, - expected_output_len=args.output_len, - schema=schema, - structure_type=args.structure_type) - for _ in range(args.num_prompts) - ] - - elif args.dataset == "regex": - regex = r"\w+@\w+\.com\n" - args.regex = regex - prompt = "Generate an email address for Alan Turing, \ - who works in Enigma. End in .com and new line. \ - Example result: alan.turing@enigma.com\n" - - input_len = len(tokenizer(prompt).input_ids) - print(f"Input length of the prompt: {input_len} tokens") - requests = [ - SampleRequest(prompt=prompt, - prompt_len=input_len, - expected_output_len=args.output_len, - schema=regex, - structure_type=args.structure_type) - for _ in range(args.num_prompts) - ] - - elif args.dataset == "choice": - choice = ["Positive", "Negative"] - args.choice = choice - prompt = "Classify this sentiment: vLLM is wonderful!" - input_len = len(tokenizer(prompt).input_ids) - print(f"Input length of the prompt: {input_len} tokens") - requests = [ - SampleRequest(prompt=prompt, - prompt_len=input_len, - expected_output_len=args.output_len, - schema=choice, - structure_type=args.structure_type) - for _ in range(args.num_prompts) - ] - - elif args.dataset == "xgrammar_bench": - args.warmup = False - requests: list[SampleRequest] = [] - dataset = datasets.load_dataset("NousResearch/json-mode-eval", - split="train") - print(f"dataset has {len(dataset)} entries") - len_dataset = len(dataset) - for data_point_idx in range(args.num_prompts): - idx = data_point_idx - while idx >= len_dataset: - idx -= len_dataset - schema = dataset["schema"][idx] - prompt = tokenizer.apply_chat_template(dataset["prompt"][idx], - tokenize=False) - input_len = len(tokenizer(prompt).input_ids) - completion = dataset["completion"][idx] - - requests.append( - SampleRequest(prompt=prompt, - prompt_len=input_len, - expected_output_len=args.output_len, - schema=schema, - completion=completion)) - - return requests - - -def evaluate(ret, args): - - def _eval_correctness_json(expected, actual): - # extract json string from string using regex - import re - actual = actual.replace('\n', '').replace(' ', '').strip() - try: - actual = re.search(r'\{.*\}', actual).group() - actual = json.loads(actual) - except Exception: - return False - - return True - - def _eval_correctness_choice(expected, actual): - return actual in args.choice - - def _eval_correctness_regex(expected, actual): - import re - return re.match(args.regex, actual) is not None - - def _eval_correctness(expected, actual): - if args.structure_type == 'json': - return _eval_correctness_json(expected, actual) - elif args.structure_type == 'regex': - return _eval_correctness_regex(expected, actual) - elif args.structure_type == 'choice': - return _eval_correctness_choice(expected, actual) - else: - return None - - scores = [] - for res in ret: - score = _eval_correctness(res['expected'], res['generated']) - res['correctness'] = score - scores.append(score) - - not_none_scores = [score for score in scores if score is not None] - - return (sum(not_none_scores) / len(not_none_scores) * - 100) if len(not_none_scores) > 0 else None - - -def main(args: argparse.Namespace): - print(args) - random.seed(args.seed) - - # async engine is working for 'regex', 'choice' and 'grammar' - if args.dataset == 'grammar': - args.structure_type = 'grammar' - args.async_engine = False - elif args.dataset == 'regex': - 
args.structure_type = 'regex' - args.async_engine = False - elif args.dataset == 'choice': - args.structure_type = 'choice' - args.async_engine = False - else: - args.structure_type = 'json' - - if args.no_guided_decoding: - args.guided_decoding_ratio = 0 - if args.save_results: - result_file_name = f'{args.guided_decoding_ratio}guided' - result_file_name += f"_{args.model.split('/')[-1]}" - result_file_name += f"_{args.dataset}" - result_file_name += f"_{args.num_prompts}" - result_file_name += f"_out{args.output_len}" - result_file_name += f"_async{args.async_engine}" - result_file_name += f"_warmup{args.warmup}" - result_file_name += f"_chunkedprefill{args.enable_chunked_prefill}" - result_file_name += ".txt" - else: - result_file_name = None - - # Synthesize a prompt with the given input length. - tokenizer = AutoTokenizer.from_pretrained( - args.tokenizer, trust_remote_code=args.trust_remote_code) - requests = sample_requests(tokenizer, args) - - if args.async_engine: - engine_args = AsyncEngineArgs.from_cli_args(args) - elapsed_time, ret, (first_latency, next_latency) = uvloop.run( - run_vllm_async(requests, engine_args, args.n, - args.guided_decoding_ratio, args.warmup, - args.disable_frontend_multiprocessing)) - else: - engine_args = EngineArgs.from_cli_args(args) - elapsed_time, ret = run_vllm(requests, engine_args, args.n, - args.guided_decoding_ratio, args.warmup) - first_latency, next_latency = None, None - - score = evaluate(ret, args) - total_num_tokens = sum(request.prompt_len + request.expected_output_len - for request in requests) - total_output_tokens = sum(request.expected_output_len - for request in requests) - if first_latency is not None: - latency_breakdown = "\nFirst token latency(msecs):\n" - latency_breakdown += f"{first_latency.describe()}" - latency_breakdown += "\nNext token latency(msecs):\n" - latency_breakdown += f"{next_latency.describe()}" - print( - f"Throughput: {len(requests) / elapsed_time:.2f} requests/s, " - f"{total_num_tokens / elapsed_time:.2f} total tokens/s, " - f"{total_output_tokens / elapsed_time:.2f} output tokens/s", - f"Correct rate is {score} %", - f"{latency_breakdown if first_latency is not None else ''}") - - # Output JSON results if specified - if args.output_json or result_file_name: - results = { - "elapsed_time": elapsed_time, - "num_requests": len(requests), - "total_num_tokens": total_num_tokens, - "total_output_tokens": total_output_tokens, - "requests_per_second": len(requests) / elapsed_time, - "tokens_per_second": f"{total_num_tokens / elapsed_time:.2f}", - "output_tokens_per_second": - f"{total_output_tokens / elapsed_time:.2f}", - "correct_rate(%)": score - } - results = {"outputs": ret, **results} - if first_latency is not None: - results["first_token_latency(msecs)"] = first_latency.describe( - ).to_dict() - results["next_token_latency(msecs)"] = next_latency.describe( - ).to_dict() - if args.output_json: - with open(args.output_json, "w") as f: - json.dump(results, f, indent=4) - elif result_file_name: - with open(result_file_name, "w") as f: - json.dump(results, f, indent=4) - - -if __name__ == "__main__": - parser = FlexibleArgumentParser(description="Benchmark guided decoding.") - parser = AsyncEngineArgs.add_cli_args(parser) - - parser.add_argument("--output-len", - type=int, - default=512, - help="Output length for each request. 
Overrides the " - "output length from the dataset.") - parser.add_argument( - "--dataset", - default='json', - choices=['json', 'grammar', 'regex', 'choice', 'xgrammar_bench']) - parser.add_argument("--json_schema_path", - type=str, - default=None, - help="Path to json schema.") - parser.add_argument("--n", - type=int, - default=1, - help="Number of generated sequences per prompt.") - parser.add_argument("--num-prompts", - type=int, - default=10, - help="Number of prompts to process.") - parser.add_argument( - '--output-json', - type=str, - default=None, - help='Path to save the throughput results in JSON format.') - parser.add_argument("--async-engine", - action='store_true', - default=False, - help="Use vLLM async engine rather than LLM class.") - parser.add_argument("--no-guided-decoding", - action='store_true', - default=False, - help="Whether to disable JSON decoding or not.") - parser.add_argument("--guided-decoding-ratio", - type=float, - default=1.0, - help="Ratio of Guided Decoding requests") - parser.add_argument("--disable-frontend-multiprocessing", - action='store_true', - default=False, - help="Disable decoupled async engine frontend.") - parser.add_argument("--warmup", - action="store_true", - default=False, - help="Run warmup prompts before benchmark.") - parser.add_argument("--save-results", - action="store_true", - default=False, - help="save output results.") - args = parser.parse_args() - if args.tokenizer is None: - args.tokenizer = args.model - main(args) diff --git a/benchmarks/benchmark_latency.py b/benchmarks/benchmark_latency.py index d7f39f50f6ca..dfd9bb1e6a4d 100644 --- a/benchmarks/benchmark_latency.py +++ b/benchmarks/benchmark_latency.py @@ -52,6 +52,7 @@ def main(args: argparse.Namespace): top_p=1.0, ignore_eos=True, max_tokens=args.output_len, + detokenize=not args.disable_detokenize, ) print(sampling_params) dummy_prompt_token_ids = np.random.randint(10000, @@ -173,6 +174,12 @@ def run_to_completion(profile_dir: Optional[str] = None): default=None, help="Path to save the latency results in JSON format.", ) + parser.add_argument( + "--disable-detokenize", + action="store_true", + help=("Do not detokenize responses (i.e. do not include " + "detokenization time in the latency measurement)"), + ) parser = EngineArgs.add_cli_args(parser) args = parser.parse_args() diff --git a/benchmarks/benchmark_prefix_caching.py b/benchmarks/benchmark_prefix_caching.py index fba32520442f..4fff7a8fc8ed 100644 --- a/benchmarks/benchmark_prefix_caching.py +++ b/benchmarks/benchmark_prefix_caching.py @@ -194,7 +194,9 @@ def main(args): llm = LLM(**dataclasses.asdict(engine_args)) - sampling_params = SamplingParams(temperature=0, max_tokens=args.output_len) + sampling_params = SamplingParams(temperature=0, + max_tokens=args.output_len, + detokenize=not args.disable_detokenize) print("Testing filtered requests") prompts = repeat_and_sort_requests(filtered_requests, @@ -243,6 +245,12 @@ def main(args): "subtract this length when filtering prompts. Only used " "when dataset-path is not provided.", ) + parser.add_argument( + '--disable-detokenize', + action='store_true', + help=("Do not detokenize responses (i.e. 
do not include " + "detokenization time in the latency measurement)"), + ) parser = EngineArgs.add_cli_args(parser) args = parser.parse_args() diff --git a/benchmarks/benchmark_prioritization.py b/benchmarks/benchmark_prioritization.py index 43b2c1b03323..76fe00ede249 100644 --- a/benchmarks/benchmark_prioritization.py +++ b/benchmarks/benchmark_prioritization.py @@ -23,7 +23,7 @@ def sample_requests( num_requests: int, tokenizer: PreTrainedTokenizerBase, fixed_output_len: Optional[int], -) -> list[tuple[str, int, int]]: +) -> list[tuple[str, int, int, int]]: if fixed_output_len is not None and fixed_output_len < 4: raise ValueError("output_len too small") @@ -71,6 +71,7 @@ def run_vllm( requests: list[tuple[str, int, int]], n: int, engine_args: EngineArgs, + disable_detokenize: bool = False, ) -> float: from vllm import LLM, SamplingParams llm = LLM(**dataclasses.asdict(engine_args)) @@ -95,6 +96,7 @@ def run_vllm( top_p=1.0, ignore_eos=True, max_tokens=output_len, + detokenize=not disable_detokenize, )) start = time.perf_counter() @@ -121,7 +123,8 @@ def main(args: argparse.Namespace): if args.backend == "vllm": elapsed_time = run_vllm(requests, args.n, - EngineArgs.from_cli_args(args)) + EngineArgs.from_cli_args(args), + args.disable_detokenize) else: raise ValueError(f"Unknown backend: {args.backend}") total_num_tokens = sum(prompt_len + output_len @@ -174,6 +177,12 @@ def main(args: argparse.Namespace): type=str, default=None, help='Path to save the throughput results in JSON format.') + parser.add_argument( + '--disable-detokenize', + action='store_true', + help=("Do not detokenize responses (i.e. do not include " + "detokenization time in the latency measurement)"), + ) parser = EngineArgs.add_cli_args(parser) args = parser.parse_args() diff --git a/benchmarks/benchmark_serving.py b/benchmarks/benchmark_serving.py index 16ec0a4817a2..1dd01ca96867 100644 --- a/benchmarks/benchmark_serving.py +++ b/benchmarks/benchmark_serving.py @@ -25,25 +25,20 @@ """ import argparse import asyncio -import base64 import gc -import io import json import os import random import time import warnings -from collections.abc import AsyncGenerator, Collection +from collections.abc import AsyncGenerator, Iterable from dataclasses import dataclass from datetime import datetime from typing import Any, Optional import numpy as np -import pandas as pd from backend_request_func import (ASYNC_REQUEST_FUNCS, RequestFuncInput, RequestFuncOutput) -from datasets import load_dataset -from PIL.Image import Image from tqdm.asyncio import tqdm from transformers import PreTrainedTokenizerBase @@ -57,6 +52,9 @@ except ImportError: from argparse import ArgumentParser as FlexibleArgumentParser +from benchmark_dataset import (BurstGPTDataset, HuggingFaceDataset, + RandomDataset, SampleRequest, ShareGPTDataset, + SonnetDataset, VisionArenaDataset) from benchmark_utils import convert_to_pytorch_benchmark_format, write_to_json MILLISECONDS_TO_SECONDS_CONVERSION = 1000 @@ -92,325 +90,18 @@ class BenchmarkMetrics: percentiles_e2el_ms: list[tuple[float, float]] -def sample_sharegpt_requests( - dataset_path: str, - num_requests: int, - tokenizer: PreTrainedTokenizerBase, - fixed_output_len: Optional[int] = None, -) -> list[tuple[str, int, int, None]]: - # Load the dataset. - with open(dataset_path, encoding='utf-8') as f: - dataset = json.load(f) - # Filter out the conversations with less than 2 turns. - dataset = [data for data in dataset if len(data["conversations"]) >= 2] - # Only keep the first two turns of each conversation. 
- dataset = [(data["conversations"][0]["value"], - data["conversations"][1]["value"]) for data in dataset] - - # Shuffle the dataset. - random.shuffle(dataset) - - # Filter out sequences that are too long or too short - filtered_dataset: list[tuple[str, int, int]] = [] - for i in range(len(dataset)): - if len(filtered_dataset) == num_requests: - break - - # Tokenize the prompts and completions. - prompt = dataset[i][0] - prompt_token_ids = tokenizer(prompt).input_ids - completion = dataset[i][1] - completion_token_ids = tokenizer(completion).input_ids - prompt_len = len(prompt_token_ids) - output_len = len(completion_token_ids - ) if fixed_output_len is None else fixed_output_len - if prompt_len < 4 or (fixed_output_len is None and output_len < 4): - # Prune too short sequences. - continue - if prompt_len > 1024 or prompt_len + output_len > 2048: - # Prune too long sequences. - continue - filtered_dataset.append((prompt, prompt_len, output_len, None)) - - return filtered_dataset - - -def sample_burstgpt_requests( - dataset_path: str, - num_requests: int, - random_seed: int, - tokenizer: PreTrainedTokenizerBase, -) -> list[tuple[str, int, int, None]]: - df = pd.read_csv(dataset_path) - gpt4_df = df[df["Model"] == "GPT-4"] - # Remove the failed requests (i.e., response length is 0) - gpt4_df = gpt4_df[gpt4_df["Response tokens"] > 0] - # Randomly sample num_requests from the dataset - if num_requests <= len(gpt4_df): - gpt4_df = gpt4_df.sample(n=num_requests, random_state=random_seed) - else: - gpt4_df = gpt4_df.sample(n=num_requests, - random_state=random_seed, - replace=True) - # Convert the dataframe to a list of tuples - dataset = gpt4_df.values.tolist() - input_requests = [] - for i in range(num_requests): - input_len = int(dataset[i][2]) - output_len = int(dataset[i][3]) - prompt = tokenizer.decode([(i + j) % tokenizer.vocab_size - for j in range(input_len)]) - input_requests.append((prompt, input_len, output_len, None)) - return input_requests - - -def sample_sonnet_requests( - dataset_path: str, - num_requests: int, - input_len: int, - output_len: int, - prefix_len: int, - tokenizer: PreTrainedTokenizerBase, -) -> list[tuple[str, str, int, int, None]]: - assert ( - input_len > prefix_len - ), "'args.sonnet-input-len' must be greater than 'args.prefix-input-len'." - - # Load the dataset. - with open(dataset_path, encoding='utf-8') as f: - poem_lines = f.readlines() - - # Tokenize the poem lines. - poem_token_ids = tokenizer(poem_lines).input_ids - average_poem_len = sum( - len(token_ids) for token_ids in poem_token_ids) / len(poem_token_ids) - - # Base prefix for all requests. - base_prompt = "Pick as many lines as you can from these poem lines:\n" - base_message = [{ - "role": "user", - "content": base_prompt, - }] - base_prompt_formatted = tokenizer.apply_chat_template( - base_message, add_generation_prompt=True, tokenize=False) - base_prompt_offset = len(tokenizer(base_prompt_formatted).input_ids) - - assert ( - input_len > base_prompt_offset - ), f"Please set 'args.sonnet-input-len' higher than {base_prompt_offset}." - num_input_lines = round( - (input_len - base_prompt_offset) / average_poem_len) - - # First approximately `prefix_len` number of tokens in the - # prompt are fixed poem lines. - assert ( - prefix_len > base_prompt_offset - ), f"Please set 'args.sonnet-prefix-len' higher than {base_prompt_offset}." 
- - num_prefix_lines = round( - (prefix_len - base_prompt_offset) / average_poem_len) - prefix_lines = poem_lines[:num_prefix_lines] - - # Sample the rest of lines per request. - sampled_requests: list[tuple[str, int, int]] = [] - for _ in range(num_requests): - num_lines_needed = num_input_lines - num_prefix_lines - sampled_lines = "".join(prefix_lines + - random.choices(poem_lines, k=num_lines_needed)) - - prompt = f"{base_prompt}{sampled_lines}" - message = [ - { - "role": "user", - "content": prompt, - }, - ] - prompt_formatted = tokenizer.apply_chat_template( - message, add_generation_prompt=True, tokenize=False) - prompt_len = len(tokenizer(prompt_formatted).input_ids) - sampled_requests.append( - (prompt, prompt_formatted, prompt_len, output_len, None)) - - return sampled_requests - - -def sample_vision_arena_requests( - dataset, - num_requests: int, - tokenizer: PreTrainedTokenizerBase, - fixed_output_len: Optional[int] = None, -) -> list[tuple[str, str, int, Optional[dict[str, Collection[str]]]]]: - sampled_requests: list[tuple[str, int, int, dict[str, - Collection[str]]]] = [] - for data in dataset: - if len(sampled_requests) == num_requests: - break - - prompt = data["turns"][0][0]['content'] - - prompt_token_ids = tokenizer(prompt).input_ids - if fixed_output_len is None: - # Default max output len is set to 128 - print("--hf-output-len is not provided. Using default value 128.") - fixed_output_len = 128 - - prompt_len = len(prompt_token_ids) - output_len = fixed_output_len - - assert isinstance( - data["images"][0], - Image), ("Input image format must be `PIL.Image.Image`, " - f"given {type(data['image'])}.") - image: Image = data["images"][0] - image = image.convert("RGB") - image_data = io.BytesIO() - image.save(image_data, format='JPEG') - image_base64 = base64.b64encode(image_data.getvalue()).decode("utf-8") - mm_content = { - "type": "image_url", - "image_url": { - "url": f"data:image/jpeg;base64,{image_base64}" - }, - } - - sampled_requests.append((prompt, prompt_len, output_len, mm_content)) - - return sampled_requests - - -def sample_hf_requests( - dataset_path: str, - dataset_subset: Optional[str], - dataset_split: str, - num_requests: int, - tokenizer: PreTrainedTokenizerBase, - random_seed: int, - fixed_output_len: Optional[int] = None, -) -> list[tuple[str, str, int, Optional[dict[str, Collection[str]]]]]: - - # Special case for vision_arena dataset - if dataset_path == 'lmarena-ai/vision-arena-bench-v0.1' \ - and dataset_subset is None: - assert dataset_split == "train" - dataset = load_dataset(dataset_path, - name=dataset_subset, - split=dataset_split, - streaming=True) - dataset = dataset.shuffle(seed=random_seed) - return sample_vision_arena_requests(dataset, num_requests, tokenizer, - fixed_output_len) - - dataset = load_dataset(dataset_path, - name=dataset_subset, - split=dataset_split, - streaming=True) - assert "conversations" in dataset.features, ( - "HF Dataset must have 'conversations' column.") - filter_func = lambda x: len(x["conversations"]) >= 2 - filtered_dataset = dataset.shuffle(seed=random_seed).filter(filter_func) - sampled_requests: list[tuple[str, int, int, dict[str, - Collection[str]]]] = [] - for data in filtered_dataset: - if len(sampled_requests) == num_requests: - break - - # Tokenize the prompts and completions. 
- prompt = data["conversations"][0]["value"] - prompt_token_ids = tokenizer(prompt).input_ids - completion = data["conversations"][1]["value"] - completion_token_ids = tokenizer(completion).input_ids - prompt_len = len(prompt_token_ids) - output_len = len(completion_token_ids - ) if fixed_output_len is None else fixed_output_len - if fixed_output_len is None and (prompt_len < 4 or output_len < 4): - # Prune too short sequences. - continue - if fixed_output_len is None and \ - (prompt_len > 1024 or prompt_len + output_len > 2048): - # Prune too long sequences. - continue - - if "image" in data and isinstance(data["image"], Image): - image: Image = data["image"] - image = image.convert("RGB") - image_data = io.BytesIO() - image.save(image_data, format='JPEG') - image_base64 = base64.b64encode( - image_data.getvalue()).decode("utf-8") - mm_content = { - "type": "image_url", - "image_url": { - "url": f"data:image/jpeg;base64,{image_base64}" - }, - } - elif "image" in data and isinstance(data["image"], str): - if (data["image"].startswith("http://") or \ - data["image"].startswith("file://")): - image_url = data["image"] - else: - image_url = f"file://{data['image']}" - - mm_content = { - "type": "image_url", - "image_url": { - "url": image_url - }, - } - else: - mm_content = None - - sampled_requests.append((prompt, prompt_len, output_len, mm_content)) - - return sampled_requests - - -def sample_random_requests( - prefix_len: int, - input_len: int, - output_len: int, - num_prompts: int, - range_ratio: float, - tokenizer: PreTrainedTokenizerBase, -) -> list[tuple[str, int, int]]: - prefix_token_ids = np.random.randint(0, - tokenizer.vocab_size, - size=prefix_len).tolist() - - input_lens = np.random.randint( - int(input_len * range_ratio), - input_len + 1, - size=num_prompts, - ) - output_lens = np.random.randint( - int(output_len * range_ratio), - output_len + 1, - size=num_prompts, - ) - offsets = np.random.randint(0, tokenizer.vocab_size, size=num_prompts) - input_requests = [] - for i in range(num_prompts): - prompt = tokenizer.decode(prefix_token_ids + - [(offsets[i] + i + j) % tokenizer.vocab_size - for j in range(input_lens[i])]) - - input_requests.append((prompt, int(prefix_len + input_lens[i]), - int(output_lens[i]), None)) - - return input_requests - - async def get_request( - input_requests: list[tuple[str, int, int]], + input_requests: list[SampleRequest], request_rate: float, burstiness: float = 1.0, -) -> AsyncGenerator[tuple[str, int, int], None]: +) -> AsyncGenerator[SampleRequest, None]: """ Asynchronously generates requests at a specified rate with OPTIONAL burstiness. Args: input_requests: - A list of input requests, each represented as a tuple. + A list of input requests, each represented as a SampleRequest. request_rate: The rate at which requests are generated (requests/s). burstiness (optional): @@ -422,7 +113,7 @@ async def get_request( in more bursty requests, while a higher burstiness value (burstiness > 1) results in a more uniform arrival of requests. """ - input_requests = iter(input_requests) + input_requests: Iterable[SampleRequest] = iter(input_requests) # Calculate scale parameter theta to maintain the desired request_rate. 
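# Sketch of one way to realize the arrival process documented in the docstring
# above (an assumption; the exact sampling code is not shown here): draw each
# inter-arrival gap from a gamma distribution whose shape is the burstiness
# factor and whose scale keeps the mean gap at 1 / request_rate.
import numpy as np

request_rate, burstiness = 10.0, 0.5        # illustrative values
theta = 1.0 / (request_rate * burstiness)   # scale parameter
gaps = np.random.gamma(shape=burstiness, scale=theta, size=1000)
print(gaps.mean())                          # ~= 1 / request_rate = 0.1 s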
assert burstiness > 0, ( @@ -444,7 +135,7 @@ async def get_request( def calculate_metrics( - input_requests: list[tuple[str, int, int]], + input_requests: list[SampleRequest], outputs: list[RequestFuncOutput], dur_s: float, tokenizer: PreTrainedTokenizerBase, @@ -475,7 +166,7 @@ def calculate_metrics( tokenizer(outputs[i].generated_text, add_special_tokens=False).input_ids) actual_output_lens.append(output_len) - total_input += input_requests[i][1] + total_input += input_requests[i].prompt_len tpot = 0 if output_len > 1: latency_minus_ttft = outputs[i].latency - outputs[i].ttft @@ -558,19 +249,18 @@ async def benchmark( model_id: str, model_name: str, tokenizer: PreTrainedTokenizerBase, - input_requests: list[tuple[str, int, int]], + input_requests: list[SampleRequest], logprobs: Optional[int], - best_of: int, request_rate: float, burstiness: float, disable_tqdm: bool, profile: bool, selected_percentile_metrics: list[str], - selected_percentiles: list[str], + selected_percentiles: list[float], ignore_eos: bool, goodput_config_dict: dict[str, float], max_concurrency: Optional[int], - lora_modules: Optional[list[str]], + lora_modules: Optional[Iterable[str]], ): if backend in ASYNC_REQUEST_FUNCS: request_func = ASYNC_REQUEST_FUNCS[backend] @@ -578,12 +268,16 @@ async def benchmark( raise ValueError(f"Unknown backend: {backend}") print("Starting initial single prompt test run...") - test_prompt, test_prompt_len, test_output_len, test_mm_content = ( - input_requests[0]) + test_prompt, test_prompt_len, test_output_len, test_mm_content = \ + input_requests[0].prompt, input_requests[0].prompt_len, \ + input_requests[0].expected_output_len, \ + input_requests[0].multi_modal_data + if backend != "openai-chat" and test_mm_content is not None: # multi-modal benchmark is only available on OpenAI Chat backend. raise ValueError( "Multi-modal content is only supported on 'openai-chat' backend.") + assert test_mm_content is None or isinstance(test_mm_content, dict) test_input = RequestFuncInput( model=model_id, model_name=model_name, @@ -592,7 +286,6 @@ async def benchmark( prompt_len=test_prompt_len, output_len=test_output_len, logprobs=logprobs, - best_of=best_of, multi_modal_content=test_mm_content, ignore_eos=ignore_eos, ) @@ -608,7 +301,8 @@ async def benchmark( if lora_modules: # For each input request, choose a LoRA module at random. 
lora_modules = iter( - [random.choice(lora_modules) for _ in range(len(input_requests))]) + [random.choice(lora_modules) \ + for _ in range(len(input_requests))]) if profile: print("Starting profiler...") @@ -619,7 +313,6 @@ async def benchmark( prompt_len=test_prompt_len, output_len=test_output_len, logprobs=logprobs, - best_of=best_of, multi_modal_content=test_mm_content, ignore_eos=ignore_eos) profile_output = await request_func(request_func_input=profile_input) @@ -655,7 +348,9 @@ async def limited_request_func(request_func_input, pbar): benchmark_start_time = time.perf_counter() tasks: list[asyncio.Task] = [] async for request in get_request(input_requests, request_rate, burstiness): - prompt, prompt_len, output_len, mm_content = request + prompt, prompt_len, output_len, mm_content = request.prompt, \ + request.prompt_len, request.expected_output_len, \ + request.multi_modal_data req_model_id, req_model_name = model_id, model_name if lora_modules: req_lora_module = next(lora_modules) @@ -668,7 +363,6 @@ async def limited_request_func(request_func_input, pbar): prompt_len=prompt_len, output_len=output_len, logprobs=logprobs, - best_of=best_of, multi_modal_content=mm_content, ignore_eos=ignore_eos) tasks.append( @@ -686,7 +380,6 @@ async def limited_request_func(request_func_input, pbar): prompt_len=test_prompt_len, output_len=test_output_len, logprobs=logprobs, - best_of=best_of, ) profile_output = await request_func(request_func_input=profile_input) if profile_output.success: @@ -872,76 +565,72 @@ def main(args: argparse.Namespace): "Please specify '--dataset-name' and the corresponding " "'--dataset-path' if required.") - elif args.dataset_name == "sharegpt": - input_requests = sample_sharegpt_requests( - dataset_path=args.dataset_path, - num_requests=args.num_prompts, - tokenizer=tokenizer, - fixed_output_len=args.sharegpt_output_len, - ) - - elif args.dataset_name == "burstgpt": - input_requests = sample_burstgpt_requests( - dataset_path=args.dataset_path, - num_requests=args.num_prompts, - random_seed=args.seed, - tokenizer=tokenizer, - ) - - elif args.dataset_name == "sonnet": - # Do not format the prompt, pass to message directly + if args.dataset_name == "sonnet": + dataset = SonnetDataset(dataset_path=args.dataset_path) + # For the "sonnet" dataset, formatting depends on the backend. if args.backend == "openai-chat": - input_requests = sample_sonnet_requests( - dataset_path=args.dataset_path, - num_requests=args.num_prompts, - input_len=args.sonnet_input_len, - output_len=args.sonnet_output_len, - prefix_len=args.sonnet_prefix_len, - tokenizer=tokenizer, - ) - input_requests = [(prompt, prompt_len, output_len, None) - for prompt, prompt_formatted, prompt_len, - output_len, _ in input_requests] + input_requests = dataset.sample(num_requests=args.num_prompts, + input_len=args.sonnet_input_len, + output_len=args.sonnet_output_len, + prefix_len=args.sonnet_prefix_len, + tokenizer=tokenizer, + return_prompt_formatted=False) else: - assert ( - tokenizer.chat_template or tokenizer.default_chat_template - ), "Tokenizer/model must have chat template for sonnet dataset." 
- input_requests = sample_sonnet_requests( - dataset_path=args.dataset_path, - num_requests=args.num_prompts, - input_len=args.sonnet_input_len, - output_len=args.sonnet_output_len, - prefix_len=args.sonnet_prefix_len, - tokenizer=tokenizer, - ) - input_requests = [(prompt_formatted, prompt_len, output_len, None) - for prompt, prompt_formatted, prompt_len, - output_len, _ in input_requests] + assert tokenizer.chat_template or tokenizer.default_chat_template, ( + "Tokenizer/model must have chat template for sonnet dataset.") + input_requests = dataset.sample(num_requests=args.num_prompts, + input_len=args.sonnet_input_len, + output_len=args.sonnet_output_len, + prefix_len=args.sonnet_prefix_len, + tokenizer=tokenizer, + return_prompt_formatted=True) elif args.dataset_name == "hf": - input_requests = sample_hf_requests( + # Choose between VisionArenaDataset + # and HuggingFaceDataset based on provided parameters. + dataset_class = (VisionArenaDataset if args.dataset_path + == VisionArenaDataset.VISION_ARENA_DATASET_PATH + and args.hf_subset is None else HuggingFaceDataset) + input_requests = dataset_class( dataset_path=args.dataset_path, dataset_subset=args.hf_subset, dataset_split=args.hf_split, + ).sample( num_requests=args.num_prompts, tokenizer=tokenizer, random_seed=args.seed, - fixed_output_len=args.hf_output_len, - ) - - elif args.dataset_name == "random": - input_requests = sample_random_requests( - prefix_len=args.random_prefix_len, - input_len=args.random_input_len, - output_len=args.random_output_len, - num_prompts=args.num_prompts, - range_ratio=args.random_range_ratio, - tokenizer=tokenizer, + output_len=args.hf_output_len, ) else: - raise ValueError(f"Unknown dataset: {args.dataset_name}") + # For datasets that follow a similar structure, use a mapping. + dataset_mapping = { + "sharegpt": + lambda: ShareGPTDataset(random_seed=args.seed, + dataset_path=args.dataset_path).sample( + tokenizer=tokenizer, + num_requests=args.num_prompts, + output_len=args.sharegpt_output_len, + ), + "burstgpt": + lambda: BurstGPTDataset(random_seed=args.seed, + dataset_path=args.dataset_path). + sample(tokenizer=tokenizer, num_requests=args.num_prompts), + "random": + lambda: RandomDataset(dataset_path=args.dataset_path).sample( + tokenizer=tokenizer, + num_requests=args.num_prompts, + prefix_len=args.random_prefix_len, + input_len=args.random_input_len, + output_len=args.random_output_len, + range_ratio=args.random_range_ratio, + ) + } + try: + input_requests = dataset_mapping[args.dataset_name]() + except KeyError as err: + raise ValueError(f"Unknown dataset: {args.dataset_name}") from err goodput_config_dict = check_goodput_args(args) # Avoid GC processing "static" data - reduce pause times. 
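# Standalone usage sketch of the benchmark_dataset classes consumed by the
# refactored main() above. The ShareGPT file path and the tokenizer name are
# placeholders, not values taken from this patch.
from transformers import AutoTokenizer
from benchmark_dataset import RandomDataset, ShareGPTDataset

tokenizer = AutoTokenizer.from_pretrained("gpt2")
sharegpt_requests = ShareGPTDataset(
    random_seed=0,
    dataset_path="sharegpt.json",            # hypothetical local file
).sample(tokenizer=tokenizer, num_requests=100, output_len=128)
random_requests = RandomDataset(dataset_path=None).sample(
    tokenizer=tokenizer,
    num_requests=100,
    prefix_len=0,
    input_len=1024,
    output_len=128,
    range_ratio=1.0,
)
# Both calls return list[SampleRequest] with prompt, prompt_len and
# expected_output_len populated, ready to be fed to get_request()/benchmark().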
@@ -958,7 +647,6 @@ def main(args: argparse.Namespace): tokenizer=tokenizer, input_requests=input_requests, logprobs=args.logprobs, - best_of=args.best_of, request_rate=args.request_rate, burstiness=args.burstiness, disable_tqdm=args.disable_tqdm, @@ -983,7 +671,6 @@ def main(args: argparse.Namespace): result_json["backend"] = backend result_json["model_id"] = model_id result_json["tokenizer_id"] = tokenizer_id - result_json["best_of"] = args.best_of result_json["num_prompts"] = args.num_prompts # Metadata @@ -1081,13 +768,6 @@ def main(args: argparse.Namespace): help= "Name or path of the tokenizer, if not using the default tokenizer.", # noqa: E501 ) - parser.add_argument( - "--best-of", - type=int, - default=1, - help="Generates `best_of` sequences per prompt and " - "returns the best one.", - ) parser.add_argument("--use-beam-search", action="store_true") parser.add_argument( "--num-prompts", @@ -1312,4 +992,5 @@ def main(args: argparse.Namespace): "script chooses a LoRA module at random.") args = parser.parse_args() + main(args) diff --git a/benchmarks/benchmark_serving_guided.py b/benchmarks/benchmark_serving_structured_output.py similarity index 90% rename from benchmarks/benchmark_serving_guided.py rename to benchmarks/benchmark_serving_structured_output.py index 6c132d05f1b6..dccef9d96d05 100644 --- a/benchmarks/benchmark_serving_guided.py +++ b/benchmarks/benchmark_serving_structured_output.py @@ -1,5 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 -r"""Benchmark online serving throughput with guided decoding. +r"""Benchmark online serving throughput with structured outputs. On the server side, run one of the following commands: (vLLM OpenAI API server) @@ -9,12 +9,12 @@ ./launch_tgi_server.sh On the client side, run: - python benchmarks/benchmark_serving_guided.py \ + python benchmarks/benchmark_serving_structured_output.py \ --backend \ --model \ --dataset json \ - --guided-decoding-ratio 1.0 \ - --guided-decoding-backend xgrammar \ + --structured-output-ratio 1.0 \ + --structured-output-backend xgrammar \ --request-rate 10 \ --num-prompts 1000 @@ -24,11 +24,13 @@ """ import argparse import asyncio +import copy import dataclasses import json import os import random import time +import uuid import warnings from collections.abc import AsyncGenerator from dataclasses import dataclass @@ -52,6 +54,9 @@ except ImportError: from argparse import ArgumentParser as FlexibleArgumentParser +from vllm.v1.structured_output.utils import ( + has_xgrammar_unsupported_json_features) + MILLISECONDS_TO_SECONDS_CONVERSION = 1000 @@ -106,24 +111,43 @@ class SampleRequest: def sample_requests(tokenizer: PreTrainedTokenizerBase, args: argparse.Namespace) -> list[SampleRequest]: - if args.dataset == 'json': + if args.dataset == 'json' or args.dataset == 'json-unique': if args.json_schema_path is None: dir_path = os.path.dirname(os.path.realpath(__file__)) args.json_schema_path = os.path.join(dir_path, "structured_schemas", "structured_schema_1.json") + json_schemas = [] with open(args.json_schema_path) as f: schema = json.load(f) - prompt = f"Generate an example of a user profile given the following schema: {json.dumps(schema)}" # noqa: E501 - input_len = len(tokenizer(prompt).input_ids) - print(f"Input length of the prompt: {input_len} tokens") + + if args.dataset == 'json-unique': + json_schemas = [ + copy.deepcopy(schema) for _ in range(args.num_prompts) + ] + for i in range(len(json_schemas)): + json_schemas[i]["properties"][ + f"__optional_field_{uuid.uuid4()}"] = { + "type": + "string", + 
"description": + "An unique optional field to avoid cached schemas" + } + + def gen_prompt(index: int): + schema = json_schemas[index % len(json_schemas)] + return f"Generate an example of a user profile given the following schema: {json.dumps(schema)}" # noqa: E501 + + def get_schema(index: int): + return json_schemas[index % len(json_schemas)] + requests = [ - SampleRequest(prompt=prompt, - prompt_len=input_len, + SampleRequest(prompt=gen_prompt(i), + prompt_len=len(tokenizer(gen_prompt(i)).input_ids), expected_output_len=args.output_len, - schema=schema, + schema=get_schema(i), structure_type=args.structure_type) - for _ in range(args.num_prompts) + for i in range(args.num_prompts) ] elif args.dataset == "grammar": @@ -191,7 +215,17 @@ def sample_requests(tokenizer: PreTrainedTokenizerBase, requests: list[SampleRequest] = [] dataset = datasets.load_dataset("NousResearch/json-mode-eval", split="train") - print(f"dataset has {len(dataset)} entries") + full_dataset_len = len(dataset) + + def _filter_func(item): + import json + schema = json.loads(item["schema"]) + return not has_xgrammar_unsupported_json_features(schema) + + dataset = dataset.filter(_filter_func) + num_filtered_out = full_dataset_len - len(dataset) + print(f"dataset has {len(dataset)} entries after filtering " + f"out {num_filtered_out} entries with unsupported features") len_dataset = len(dataset) for data_point_idx in range(args.num_prompts): idx = data_point_idx @@ -220,21 +254,21 @@ async def get_request( burstiness: float = 1.0, ) -> AsyncGenerator[tuple[int, SampleRequest], None]: """ - Asynchronously generates requests at a specified rate + Asynchronously generates requests at a specified rate with OPTIONAL burstiness. - + Args: - input_requests: + input_requests: A list of input requests, each represented as a tuple. - request_rate: + request_rate: The rate at which requests are generated (requests/s). - burstiness (optional): - The burstiness factor of the request generation. + burstiness (optional): + The burstiness factor of the request generation. Only takes effect when request_rate is not inf. Default value is 1, which follows a Poisson process. Otherwise, the request intervals follow a gamma distribution. - A lower burstiness value (0 < burstiness < 1) results - in more bursty requests, while a higher burstiness value + A lower burstiness value (0 < burstiness < 1) results + in more bursty requests, while a higher burstiness value (burstiness > 1) results in a more uniform arrival of requests. 
""" input_requests = iter(input_requests) @@ -378,8 +412,8 @@ async def benchmark( selected_percentiles: list[str], ignore_eos: bool, max_concurrency: Optional[int], - guided_decoding_ratio: float, - guided_decoding_backend: str, + structured_output_ratio: float, + structured_output_backend: str, goodput_config_dict: Optional[dict[str, float]] = None, ): if backend in ASYNC_REQUEST_FUNCS: @@ -391,16 +425,18 @@ def prepare_extra_body(request) -> dict: extra_body = {} # Add the schema to the extra_body extra_body[request.structure_type] = request.schema - # Add the specific guided_decoding_backend - extra_body["guided_decoding_backend"] = guided_decoding_backend + # Add the specific structured_output_backend + extra_body["guided_decoding_backend"] = structured_output_backend return extra_body print("Starting initial single prompt test run...") - guided_decoding_req_idx = random.sample( + structured_output_req_idx = random.sample( range(len(input_requests)), - int(len(input_requests) * guided_decoding_ratio)) + int(len(input_requests) * structured_output_ratio)) test_request = input_requests[0] + test_req_extra_body = (prepare_extra_body(test_request) + if 0 in structured_output_req_idx else None) test_input = RequestFuncInput( model=model_id, prompt=test_request.prompt, @@ -408,7 +444,7 @@ def prepare_extra_body(request) -> dict: prompt_len=test_request.prompt_len, output_len=test_request.expected_output_len, ignore_eos=ignore_eos, - extra_body=prepare_extra_body(test_request), + extra_body=test_req_extra_body, ) test_output = await request_func(request_func_input=test_input) if not test_output.success: @@ -427,7 +463,7 @@ def prepare_extra_body(request) -> dict: prompt_len=test_request.prompt_len, output_len=test_request.expected_output_len, ignore_eos=ignore_eos, - extra_body=prepare_extra_body(test_request), + extra_body=test_req_extra_body, ) profile_output = await request_func(request_func_input=profile_input) if profile_output.success: @@ -465,7 +501,7 @@ async def limited_request_func(request_func_input, pbar): async for i, request in get_request(input_requests, request_rate, burstiness): extra_body = prepare_extra_body( - request) if i in guided_decoding_req_idx else None + request) if i in structured_output_req_idx else None request_func_input = RequestFuncInput( model=model_id, prompt=request.prompt, @@ -708,10 +744,10 @@ def main(args: argparse.Namespace): else: args.structure_type = 'guided_json' - if args.no_guided_decoding: - args.guided_decoding_ratio = 0 + if args.no_structured_output: + args.structured_output_ratio = 0 if args.save_results: - result_file_name = f'{args.guided_decoding_ratio}guided' + result_file_name = f'{args.structured_output_ratio}guided' result_file_name += f"_{backend}" result_file_name += f"_{args.request_rate}qps" result_file_name += f"_{args.model.split('/')[-1]}" @@ -744,8 +780,8 @@ def main(args: argparse.Namespace): ], ignore_eos=args.ignore_eos, max_concurrency=args.max_concurrency, - guided_decoding_ratio=args.guided_decoding_ratio, - guided_decoding_backend=args.guided_decoding_backend, + structured_output_ratio=args.structured_output_ratio, + structured_output_backend=args.structured_output_backend, goodput_config_dict=goodput_config_dict, )) @@ -806,10 +842,12 @@ def main(args: argparse.Namespace): default="/v1/completions", help="API endpoint.", ) - parser.add_argument( - "--dataset", - default='json', - choices=['json', 'grammar', 'regex', 'choice', 'xgrammar_bench']) + parser.add_argument("--dataset", + default='json', + choices=[ + 
'json', 'json-unique', 'grammar', 'regex', + 'choice', 'xgrammar_bench' + ]) parser.add_argument("--json_schema_path", type=str, default=None, @@ -943,19 +981,20 @@ def main(args: argparse.Namespace): "goodput, refer to DistServe paper: https://arxiv.org/pdf/2401.09670 " "and the blog: https://hao-ai-lab.github.io/blogs/distserve") - parser.add_argument("--no-guided-decoding", + parser.add_argument("--no-structured-output", action='store_true', default=False, help="Whether to disable JSON decoding or not.") - parser.add_argument("--guided-decoding-ratio", + parser.add_argument("--structured-output-ratio", type=float, default=1.0, - help="Ratio of Guided Decoding requests") - parser.add_argument("--guided-decoding-backend", - type=str, - choices=["outlines", "lm-format-enforcer", "xgrammar"], - default="xgrammar", - help="Backend to use for guided decoding") + help="Ratio of Structured Outputs requests") + parser.add_argument( + "--structured-output-backend", + type=str, + choices=["outlines", "lm-format-enforcer", "xgrammar", "json-unique"], + default="xgrammar", + help="Backend to use for structured outputs") args = parser.parse_args() main(args) diff --git a/benchmarks/benchmark_throughput.py b/benchmarks/benchmark_throughput.py index aabce64ff776..7e6556733b28 100644 --- a/benchmarks/benchmark_throughput.py +++ b/benchmarks/benchmark_throughput.py @@ -6,13 +6,14 @@ import os import random import time -from functools import cache -from typing import Any, Optional +import warnings +from typing import Any, Optional, Union import torch import uvloop +from benchmark_dataset import (BurstGPTDataset, RandomDataset, SampleRequest, + ShareGPTDataset, SonnetDataset) from benchmark_utils import convert_to_pytorch_benchmark_format, write_to_json -from PIL import Image from tqdm import tqdm from transformers import (AutoModelForCausalLM, AutoTokenizer, PreTrainedTokenizerBase) @@ -20,154 +21,17 @@ from vllm.engine.arg_utils import AsyncEngineArgs, EngineArgs from vllm.entrypoints.openai.api_server import ( build_async_engine_client_from_engine_args) -from vllm.inputs import TextPrompt +from vllm.inputs import TextPrompt, TokensPrompt from vllm.lora.request import LoRARequest -from vllm.lora.utils import get_adapter_absolute_path -from vllm.multimodal import MultiModalDataDict from vllm.sampling_params import BeamSearchParams -from vllm.transformers_utils.tokenizer import AnyTokenizer, get_lora_tokenizer from vllm.utils import FlexibleArgumentParser, merge_async_iterators -@dataclasses.dataclass -class SampleRequest: - """A class representing a single inference request for benchmarking. - - Attributes: - prompt: The input text prompt for the model. - prompt_len: The length of the prompt in tokens. - expected_output_len: The expected length of the output in tokens. - multi_modal_data: Optional dictionary containing multi-modal data (e.g. - images). - lora_request: Optional LoRARequest specifying the LoRA to use. - """ - prompt: str - prompt_len: int - expected_output_len: int - multi_modal_data: Optional[MultiModalDataDict] = None - lora_request: Optional[LoRARequest] = None - - -def _get_prompt_for_image_model(question: str, *, model: str) -> str: - """Prepend and append special tokens around the question to form a prompt. 
- - Args: - question: The input question text to wrap with special tokens - model: The name of the model being used, to determine which special - tokens to add - - Returns: - The formatted prompt string with appropriate special tokens for the - model - - Raises: - ValueError: If an unsupported model name is provided - """ - model = model.lower() - if "pixtral" in model: - return f"[INST]{question}\n[IMG][/INST]" - raise ValueError(f"Unsupported model {model}") - - -@cache -def lora_path_on_disk(lora_path: str) -> str: - return get_adapter_absolute_path(lora_path) - - -lora_tokenizer_cache: dict[int, AnyTokenizer] = {} - - -def get_random_lora_request( - args: argparse.Namespace -) -> tuple[LoRARequest, Optional[AnyTokenizer]]: - global lora_tokenizer_cache - lora_id = random.randint(1, args.max_loras) - lora_request = LoRARequest(lora_name=str(lora_id), - lora_int_id=lora_id, - lora_path=lora_path_on_disk(args.lora_path)) - if lora_id not in lora_tokenizer_cache: - lora_tokenizer_cache[lora_id] = get_lora_tokenizer(lora_request) - return lora_request, lora_tokenizer_cache[lora_id] - - -def sample_requests(tokenizer: PreTrainedTokenizerBase, - args: argparse.Namespace) -> list[SampleRequest]: - - dataset_path: str = args.dataset - num_requests: int = args.num_prompts - fixed_output_len: Optional[int] = args.output_len - model: str = args.model - if fixed_output_len is not None and fixed_output_len < 4: - raise ValueError("output_len too small") - - # Load the dataset. - with open(dataset_path) as f: - dataset = json.load(f) - # Filter out the conversations with less than 2 turns. - dataset = [data for data in dataset if len(data["conversations"]) >= 2] - # Shuffle the dataset. - random.shuffle(dataset) - - # Filter out sequences that are too long or too short - filtered_dataset: list[SampleRequest] = [] - for data in tqdm(dataset, - total=len(filtered_dataset), - desc="sampling requests"): - if len(filtered_dataset) == num_requests: - break - - # Only keep the first two turns of each conversation. - prompt = data["conversations"][0]["value"] - completion = data["conversations"][1]["value"] - - multi_modal_data: Optional[MultiModalDataDict] = None - if "image" in data: - multi_modal_data = multi_modal_data or {} - image_path = data["image"] - # TODO(vllm-project/vllm/issues/9778): Support multiple images. - assert isinstance(image_path, - str), "Only support single image input" - try: - multi_modal_data["image"] = Image.open(image_path).convert( - "RGB") - except FileNotFoundError: - # Ignore datapoint where asset is missing - continue - prompt = _get_prompt_for_image_model(question=prompt, model=model) - - request_tokenizer = tokenizer - lora_request: Optional[LoRARequest] = None - if args.enable_lora: - lora_request, lora_tokenizer = get_random_lora_request(args) - if lora_tokenizer: - request_tokenizer = lora_tokenizer - - # Tokenize the prompts and completions. - prompt_token_ids = request_tokenizer(prompt).input_ids - completion_token_ids = request_tokenizer(completion).input_ids - prompt_len = len(prompt_token_ids) - output_len = len(completion_token_ids - ) if fixed_output_len is None else fixed_output_len - if prompt_len < 4 or output_len < 4: - # Prune too short sequences. - continue - if prompt_len > 1024 or prompt_len + output_len > 2048: - # Prune too long sequences. 
- continue - filtered_dataset.append( - SampleRequest(prompt=prompt, - prompt_len=prompt_len, - expected_output_len=output_len, - multi_modal_data=multi_modal_data, - lora_request=lora_request)) - - return filtered_dataset - - def run_vllm( requests: list[SampleRequest], n: int, engine_args: EngineArgs, + disable_detokenize: bool = False, ) -> float: from vllm import LLM, SamplingParams llm = LLM(**dataclasses.asdict(engine_args)) @@ -178,10 +42,13 @@ def run_vllm( "Please ensure that max_model_len is greater than the sum of" " prompt_len and expected_output_len for all requests.") # Add the requests to the engine. - prompts: list[TextPrompt] = [] + prompts: list[Union[TextPrompt, TokensPrompt]] = [] sampling_params: list[SamplingParams] = [] for request in requests: prompts.append( + TokensPrompt(prompt_token_ids=request.prompt["prompt_token_ids"], + multi_modal_data=request.multi_modal_data) + if "prompt_token_ids" in request.prompt else \ TextPrompt(prompt=request.prompt, multi_modal_data=request.multi_modal_data)) sampling_params.append( @@ -191,6 +58,7 @@ def run_vllm( top_p=1.0, ignore_eos=True, max_tokens=request.expected_output_len, + detokenize=not disable_detokenize, )) lora_requests: Optional[list[LoRARequest]] = None if engine_args.enable_lora: @@ -229,6 +97,7 @@ async def run_vllm_async( n: int, engine_args: AsyncEngineArgs, disable_frontend_multiprocessing: bool = False, + disable_detokenize: bool = False, ) -> float: from vllm import SamplingParams @@ -242,11 +111,14 @@ async def run_vllm_async( " prompt_len and expected_output_len for all requests.") # Add the requests to the engine. - prompts: list[TextPrompt] = [] + prompts: list[Union[TextPrompt, TokensPrompt]] = [] sampling_params: list[SamplingParams] = [] lora_requests: list[Optional[LoRARequest]] = [] for request in requests: prompts.append( + TokensPrompt(prompt_token_ids=request.prompt["prompt_token_ids"], + multi_modal_data=request.multi_modal_data) + if "prompt_token_ids" in request.prompt else \ TextPrompt(prompt=request.prompt, multi_modal_data=request.multi_modal_data)) sampling_params.append( @@ -256,6 +128,7 @@ async def run_vllm_async( top_p=1.0, ignore_eos=True, max_tokens=request.expected_output_len, + detokenize=not disable_detokenize, )) lora_requests.append(request.lora_request) @@ -282,6 +155,7 @@ def run_hf( n: int, max_batch_size: int, trust_remote_code: bool, + disable_detokenize: bool = False, ) -> float: llm = AutoModelForCausalLM.from_pretrained( model, torch_dtype=torch.float16, trust_remote_code=trust_remote_code) @@ -321,8 +195,9 @@ def run_hf( use_cache=True, max_new_tokens=max_output_len, ) - # Include the decoding time. - tokenizer.batch_decode(llm_outputs, skip_special_tokens=True) + if not disable_detokenize: + # Include the decoding time. + tokenizer.batch_decode(llm_outputs, skip_special_tokens=True) pbar.update(len(batch)) # Clear the batch. @@ -369,56 +244,50 @@ def save_to_pytorch_benchmark_format(args: argparse.Namespace, write_to_json(pt_file, pt_records) +def get_requests(args, tokenizer): + # Common parameters for all dataset types. 
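+ # dataset_path may be None, in which case the random dataset is used; random_seed keeps sampling reproducible across runs.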
+ common_kwargs = { + "dataset_path": args.dataset_path, + "random_seed": args.seed, + } + sample_kwargs = { + "tokenizer": tokenizer, + "lora_path": args.lora_path, + "max_loras": args.max_loras, + "num_requests": args.num_prompts, + "input_len": args.input_len, + "output_len": args.output_len, + } + if args.dataset_path is None or args.dataset_name == "random": + sample_kwargs["range_ratio"] = args.random_range_ratio + sample_kwargs["prefix_len"] = args.prefix_len + dataset_cls = RandomDataset + elif args.dataset_name == "sharegpt": + dataset_cls = ShareGPTDataset + elif args.dataset_name == "sonnet": + assert tokenizer.chat_template or tokenizer.default_chat_template, ( + "Tokenizer/model must have chat template for sonnet dataset.") + dataset_cls = SonnetDataset + sample_kwargs["prefix_len"] = args.prefix_len + sample_kwargs["return_prompt_formatted"] = True + elif args.dataset_name == "burstgpt": + dataset_cls = BurstGPTDataset + else: + raise ValueError(f"Unknown dataset name: {args.dataset_name}") + # Remove None values + sample_kwargs = {k: v for k, v in sample_kwargs.items() if v is not None} + return dataset_cls(**common_kwargs).sample(**sample_kwargs) + + def main(args: argparse.Namespace): + if args.seed is None: + args.seed = 0 print(args) random.seed(args.seed) - # Sample the requests. tokenizer = AutoTokenizer.from_pretrained( args.tokenizer, trust_remote_code=args.trust_remote_code) - if args.dataset is None: - vocab_size = tokenizer.vocab_size - requests = [] - for _ in range(args.num_prompts): - - request_tokenizer = tokenizer - lora_request: Optional[LoRARequest] = None - if args.enable_lora: - lora_request, lora_tokenizer = get_random_lora_request(args) - if lora_tokenizer: - request_tokenizer = lora_tokenizer - - # Synthesize a prompt with the given input length. - candidate_ids = [ - random.randint(0, vocab_size - 1) - for _ in range(args.input_len) - ] - # As tokenizer may add additional tokens like BOS, we need to try - # different lengths to get the desired input length. 
- for _ in range(5): # Max attempts to correct - candidate_prompt = request_tokenizer.decode(candidate_ids) - tokenized_len = len(request_tokenizer.encode(candidate_prompt)) - - if tokenized_len == args.input_len: - break - - # Adjust length based on difference - diff = args.input_len - tokenized_len - if diff > 0: - candidate_ids.extend([ - random.randint(100, vocab_size - 100) - for _ in range(diff) - ]) - else: - candidate_ids = candidate_ids[:diff] - requests.append( - SampleRequest(prompt=candidate_prompt, - prompt_len=args.input_len, - expected_output_len=args.output_len, - lora_request=lora_request)) - else: - requests = sample_requests(tokenizer, args) - + requests = get_requests(args, tokenizer) is_multi_modal = any(request.multi_modal_data is not None for request in requests) if args.backend == "vllm": @@ -429,14 +298,17 @@ def main(args: argparse.Namespace): args.n, AsyncEngineArgs.from_cli_args(args), args.disable_frontend_multiprocessing, + args.disable_detokenize, )) else: elapsed_time = run_vllm(requests, args.n, - EngineArgs.from_cli_args(args)) + EngineArgs.from_cli_args(args), + args.disable_detokenize) elif args.backend == "hf": assert args.tensor_parallel_size == 1 elapsed_time = run_hf(requests, args.model, tokenizer, args.n, - args.hf_max_batch_size, args.trust_remote_code) + args.hf_max_batch_size, args.trust_remote_code, + args.disable_detokenize) elif args.backend == "mii": elapsed_time = run_mii(requests, args.model, args.tensor_parallel_size, args.output_len) @@ -450,7 +322,7 @@ def main(args: argparse.Namespace): print("\033[91mWARNING\033[0m: Multi-modal request detected. The " "following metrics are not accurate because image tokens are not" " counted. See vllm-project/vllm/issues/9778 for details.") - # TODO(vllm-project/vllm/issues/9778): Count molti-modal token length. + # TODO(vllm-project/vllm/issues/9778): Count multi-modal token length. print(f"Throughput: {len(requests) / elapsed_time:.2f} requests/s, " f"{total_num_tokens / elapsed_time:.2f} total tokens/s, " f"{total_output_tokens / elapsed_time:.2f} output tokens/s") @@ -475,12 +347,23 @@ def main(args: argparse.Namespace): type=str, choices=["vllm", "hf", "mii"], default="vllm") - parser.add_argument("--dataset", + parser.add_argument("--dataset-name", + type=str, + choices=["sharegpt", "random", "sonnet", "burstgpt"], + help="Name of the dataset to benchmark on.", + default="sharegpt") + parser.add_argument( + "--dataset", + type=str, + default=None, + help="Path to the ShareGPT dataset, will be deprecated in\ + the next release. The dataset is expected to " + "be a json in form of list[dict[..., conversations: " + "list[dict[..., value: ]]]]") + parser.add_argument("--dataset-path", type=str, default=None, - help="Path to the dataset. The dataset is expected to " - "be a json in form of list[dict[..., conversations: " - "list[dict[..., value: ]]]]") + help="Path to the dataset") parser.add_argument("--input-len", type=int, default=None, @@ -515,6 +398,11 @@ def main(args: argparse.Namespace): action='store_true', default=False, help="Disable decoupled async engine frontend.") + parser.add_argument( + "--disable-detokenize", + action="store_true", + help=("Do not detokenize the response (i.e. do not include " + "detokenization time in the measurement)")) # LoRA parser.add_argument( "--lora-path", @@ -522,14 +410,35 @@ def main(args: argparse.Namespace): default=None, help="Path to the lora adapters to use. 
This can be an absolute path, " "a relative path, or a Hugging Face model identifier.") + parser.add_argument("--prefix-len", + type=int, + default=None, + help="Number of prefix tokens per request." + "This is for the RandomDataset and SonnetDataset") + # random dataset + parser.add_argument( + "--random-range-ratio", + type=float, + default=1.0, + help="Range of sampled ratio of input/output length, " + "used only for RandomDataSet.", + ) parser = AsyncEngineArgs.add_cli_args(parser) args = parser.parse_args() if args.tokenizer is None: args.tokenizer = args.model - if args.dataset is None: - assert args.input_len is not None - assert args.output_len is not None + if args.dataset is not None: + warnings.warn( + "The '--dataset' argument will be deprecated in the next " + "release. Please use '--dataset-name' and " + "'--dataset-path' in the future runs.", + stacklevel=2) + args.dataset_path = args.dataset + if args.dataset is None and args.dataset_path is None: + # for random dataset, the default sampling setting is in + # benchmark_dataset.RandomDataset + print("When dataset is not set, it will default to random dataset") else: assert args.input_len is None if args.enable_lora: diff --git a/benchmarks/kernels/benchmark_layernorm.py b/benchmarks/kernels/benchmark_layernorm.py index d265c91bfeff..e12d74c01e43 100644 --- a/benchmarks/kernels/benchmark_layernorm.py +++ b/benchmarks/kernels/benchmark_layernorm.py @@ -40,7 +40,7 @@ def run_cuda_benchmark(num_iters: int, profile: bool = False) -> float: end_time = time.perf_counter() if profile: - torch.cuda.cudart().cudaProfilerStart() + torch.cuda.cudart().cudaProfilerStop() return (end_time - start_time) / num_iters # Warmup. diff --git a/benchmarks/kernels/benchmark_lora.py b/benchmarks/kernels/benchmark_lora.py index 5eaeec017053..3c4d6a6aa464 100644 --- a/benchmarks/kernels/benchmark_lora.py +++ b/benchmarks/kernels/benchmark_lora.py @@ -153,7 +153,6 @@ def ref_group_gemm(ref_out: torch.Tensor, input: torch.Tensor, result = torch.nn.functional.linear(x, w) result *= scaling out_list.append(result) - torch.cat(out_list, dim=0) cat_result = torch.cat(out_list, dim=0) diff --git a/benchmarks/kernels/benchmark_machete.py b/benchmarks/kernels/benchmark_machete.py index 3fa57bd7b233..a661ea9d7e60 100644 --- a/benchmarks/kernels/benchmark_machete.py +++ b/benchmarks/kernels/benchmark_machete.py @@ -45,7 +45,6 @@ def terse_type_name(dt): torch.float16: "fp16", torch.int8: "int8", torch.float8_e4m3fn: "fp8", - torch.bfloat16: "bf16", torch.float: "float", torch.int: "int", }[dt] @@ -259,7 +258,7 @@ def machete_create_bench_fn(bt: BenchmarkTensors, return lambda: ops.machete_mm( a=bt.a, - b_q=bt.w_q, + b_q=w_q, b_type=bt.wtype, b_group_scales=bt.w_g_s, b_group_zeros=w_g_zp, diff --git a/benchmarks/kernels/benchmark_moe.py b/benchmarks/kernels/benchmark_moe.py index c862dec81fcc..9de8d5af6242 100644 --- a/benchmarks/kernels/benchmark_moe.py +++ b/benchmarks/kernels/benchmark_moe.py @@ -1,7 +1,9 @@ # SPDX-License-Identifier: Apache-2.0 import argparse +import json import time +from contextlib import nullcontext from datetime import datetime from itertools import product from typing import Any, TypedDict @@ -40,6 +42,7 @@ def benchmark_config( use_fp8_w8a8: bool, use_int8_w8a16: bool, num_iters: int = 100, + block_quant_shape: List[int] = None, ) -> float: init_dtype = torch.float16 if use_fp8_w8a8 else dtype x = torch.randn(num_tokens, hidden_size, dtype=dtype) @@ -81,8 +84,24 @@ def benchmark_config( dtype=torch.float32) w2_scale = 
torch.randn((hidden_size, num_experts), dtype=torch.float32) if use_fp8_w8a8: - w1_scale = torch.randn(num_experts, dtype=torch.float32) - w2_scale = torch.randn(num_experts, dtype=torch.float32) + if block_quant_shape: + block_n, block_k = block_quant_shape[0], block_quant_shape[1] + E = num_experts + N = shard_intermediate_size // 2 + K = hidden_size + factor_for_scale = 1e-2 + n_tiles_w1 = (2 * N + block_n - 1) // block_n + n_tiles_w2 = (K + block_n - 1) // block_n + k_tiles_w1 = (K + block_k - 1) // block_k + k_tiles_w2 = (N + block_k - 1) // block_k + w1_scale = torch.rand((E, n_tiles_w1, k_tiles_w1), + dtype=torch.float32) * factor_for_scale + w2_scale = torch.rand((E, n_tiles_w2, k_tiles_w2), + dtype=torch.float32) * factor_for_scale + else: + w1_scale = torch.randn(num_experts, dtype=torch.float32) + w2_scale = torch.randn(num_experts, dtype=torch.float32) + a1_scale = torch.randn(1, dtype=torch.float32) a2_scale = torch.randn(1, dtype=torch.float32) @@ -111,6 +130,7 @@ def run(): w2_scale=w2_scale, a1_scale=a1_scale, a2_scale=a2_scale, + block_shape=block_quant_shape, ) # JIT compilation & warmup @@ -175,7 +195,8 @@ def get_rocm_tuning_space(use_fp16): return param_ranges -def get_configs_compute_bound(use_fp16) -> list[dict[str, int]]: +def get_configs_compute_bound(use_fp16, + block_quant_shape) -> list[dict[str, int]]: configs: list[BenchmarkConfig] = [] if current_platform.is_rocm(): @@ -204,17 +225,27 @@ def get_configs_compute_bound(use_fp16) -> list[dict[str, int]]: for config_values in product(*values): config = dict(zip(keys, config_values)) configs.append(config) + + # Remove configs that are not compatible with fp8 block quantization + # BLOCK_SIZE_K must be a multiple of block_k + # BLOCK_SIZE_N must be a multiple of block_n + if block_quant_shape is not None and not use_fp16: + block_n, block_k = block_quant_shape[0], block_quant_shape[1] + for config in configs[:]: + if config["BLOCK_SIZE_K"] % block_k != 0 or config[ + "BLOCK_SIZE_N"] % block_n != 0: + configs.remove(config) return configs def prune_rocm_search_space(num_tokens, shard_intermediate_size, hidden_size, - search_space, is_fp16): + search_space, is_fp16, topk): N1, K1 = shard_intermediate_size, hidden_size N2, K2 = hidden_size, shard_intermediate_size // 2 - pruned_space_1 = prune_rocm_configs(num_tokens * 2, N1, K1, search_space, - is_fp16) - pruned_space_2 = prune_rocm_configs(num_tokens * 2, N2, K2, search_space, - is_fp16) + pruned_space_1 = prune_rocm_configs(num_tokens * topk, N1, K1, + search_space, is_fp16) + pruned_space_2 = prune_rocm_configs(num_tokens * topk, N2, K2, + search_space, is_fp16) search_space = merge_unique_dicts(pruned_space_1, pruned_space_2) return search_space @@ -372,6 +403,7 @@ def tune( use_fp8_w8a8: bool, use_int8_w8a16: bool, search_space: list[dict[str, int]], + block_quant_shape: list[int], ) -> dict[str, int]: best_config = None best_time = float("inf") @@ -380,21 +412,24 @@ def tune( search_space = prune_rocm_search_space(num_tokens, shard_intermediate_size, hidden_size, search_space, - is_fp16) + is_fp16, topk) - with torch.cuda.device(self.device_id): + with torch.cuda.device(self.device_id) if current_platform.is_rocm( + ) else nullcontext(): for config in tqdm(search_space): try: - kernel_time = benchmark_config(config, - num_tokens, - num_experts, - shard_intermediate_size, - hidden_size, - topk, - dtype, - use_fp8_w8a8, - use_int8_w8a16, - num_iters=20) + kernel_time = benchmark_config( + config, + num_tokens, + num_experts, + shard_intermediate_size, + 
hidden_size, + topk, + dtype, + use_fp8_w8a8, + use_int8_w8a16, + num_iters=20, + block_quant_shape=block_quant_shape) except triton.runtime.autotuner.OutOfResources: # Some configurations may be invalid and fail to compile. continue @@ -436,8 +471,8 @@ def sort_config(config: BenchmarkConfig) -> BenchmarkConfig: def save_configs(configs: dict[int, BenchmarkConfig], num_experts: int, shard_intermediate_size: int, hidden_size: int, topk: int, - dtype: torch.dtype, use_fp8_w8a8: bool, - use_int8_w8a16: bool) -> None: + dtype: torch.dtype, use_fp8_w8a8: bool, use_int8_w8a16: bool, + block_quant_shape: List[int]) -> None: dtype_str = get_config_dtype_str(dtype, use_int8_w8a16=use_int8_w8a16, use_fp8_w8a8=use_fp8_w8a8) @@ -445,7 +480,7 @@ def save_configs(configs: dict[int, BenchmarkConfig], num_experts: int, # NOTE(woosuk): The current naming convention uses w2.shape[2], which # is the intermediate size after silu_and_mul. filename = get_config_file_name(num_experts, shard_intermediate_size // 2, - dtype_str) + dtype_str, block_quant_shape) print(f"Writing best config to {filename}...") with open(filename, "w") as f: @@ -455,7 +490,7 @@ def save_configs(configs: dict[int, BenchmarkConfig], num_experts: int, def main(args: argparse.Namespace): print(args) - + block_quant_shape = None config = AutoConfig.from_pretrained( args.model, trust_remote_code=args.trust_remote_code) if config.architectures[0] == "DbrxForCausalLM": @@ -474,6 +509,12 @@ def main(args: argparse.Namespace): topk = config.num_experts_per_tok intermediate_size = config.moe_intermediate_size shard_intermediate_size = 2 * intermediate_size // args.tp_size + block_quant_shape = config.quantization_config['weight_block_size'] + elif config.architectures[0] == "Qwen2MoeForCausalLM": + E = config.num_experts + topk = config.num_experts_per_tok + intermediate_size = config.moe_intermediate_size + shard_intermediate_size = 2 * intermediate_size // args.tp_size else: # Default: Mixtral. 
E = config.num_local_experts @@ -511,27 +552,30 @@ def _distribute(method: str, inputs: list[Any]) -> list[Any]: if args.tune: is_fp16 = not (use_fp8_w8a8 or use_int8_w8a16) - search_space = get_configs_compute_bound(is_fp16) + search_space = get_configs_compute_bound(is_fp16, block_quant_shape) print(f"Start tuning over {len(search_space)} configurations...") start = time.time() configs = _distribute( - "tune", [(batch_size, E, shard_intermediate_size, hidden_size, - topk, dtype, use_fp8_w8a8, use_int8_w8a16, search_space) - for batch_size in batch_sizes]) + "tune", + [(batch_size, E, shard_intermediate_size, hidden_size, topk, dtype, + use_fp8_w8a8, use_int8_w8a16, search_space, block_quant_shape) + for batch_size in batch_sizes]) best_configs = { M: sort_config(config) for M, config in zip(batch_sizes, configs) } save_configs(best_configs, E, shard_intermediate_size, hidden_size, - topk, dtype, use_fp8_w8a8, use_int8_w8a16) + topk, dtype, use_fp8_w8a8, use_int8_w8a16, + block_quant_shape) end = time.time() print(f"Tuning took {end - start:.2f} seconds") else: outputs = _distribute( - "benchmark", [(batch_size, E, shard_intermediate_size, hidden_size, - topk, dtype, use_fp8_w8a8, use_int8_w8a16) - for batch_size in batch_sizes]) + "benchmark", + [(batch_size, E, shard_intermediate_size, hidden_size, topk, dtype, + use_fp8_w8a8, use_int8_w8a16, block_quant_shape) + for batch_size in batch_sizes]) for batch_size, (config, kernel_time) in zip(batch_sizes, outputs): print(f"Batch size: {batch_size}, config: {config}") diff --git a/benchmarks/kernels/benchmark_paged_attention.py b/benchmarks/kernels/benchmark_paged_attention.py index 5445f7baf1d9..e8e4db875b7d 100644 --- a/benchmarks/kernels/benchmark_paged_attention.py +++ b/benchmarks/kernels/benchmark_paged_attention.py @@ -177,7 +177,7 @@ def run_cuda_benchmark(num_iters: int, profile: bool = False) -> float: end_time = time.perf_counter() if profile: - torch.cuda.cudart().cudaProfilerStart() + torch.cuda.cudart().cudaProfilerStop() return (end_time - start_time) / num_iters # Warmup. diff --git a/benchmarks/kernels/benchmark_quant.py b/benchmarks/kernels/benchmark_quant.py index 0ddea9390d77..b643897a60ee 100644 --- a/benchmarks/kernels/benchmark_quant.py +++ b/benchmarks/kernels/benchmark_quant.py @@ -40,7 +40,7 @@ def run_cuda_benchmark(num_iters: int, profile: bool = False) -> float: end_time = time.perf_counter() if profile: - torch.cuda.cudart().cudaProfilerStart() + torch.cuda.cudart().cudaProfilerStop() return (end_time - start_time) / num_iters # Warmup. diff --git a/benchmarks/kernels/deepgemm/README.md b/benchmarks/kernels/deepgemm/README.md new file mode 100644 index 000000000000..917e814010f8 --- /dev/null +++ b/benchmarks/kernels/deepgemm/README.md @@ -0,0 +1,129 @@ +# DeepSeek DeepGEMM Kernels Benchmark + +This directory includes benchmarks between DeepSeek's DeepGEMM block fp8 kernels against vLLM's existing triton and CUTLASS-based kernels. + +Currently this just includes dense GEMMs and only works on Hopper GPUs. + +## Setup + +You need to install vLLM in your usual fashion, then install DeepGEMM from source in its own directory: + +``` +git clone --recursive https://github.com/deepseek-ai/DeepGEMM +cd DeepGEMM +python setup.py install +uv pip install -e . +``` + +## Usage + +``` +python benchmark_fp8_block_dense_gemm.py +INFO 02-26 21:55:13 [__init__.py:207] Automatically detected platform cuda. 
+===== STARTING FP8 GEMM BENCHMARK ===== +PyTorch version: 2.5.1+cu124 +CUDA version: 12.4 +Triton version: 3.1.0 +Using device: NVIDIA H100 80GB HBM3 +WARNING 02-26 21:55:15 [fp8_utils.py:458] Using default W8A8 Block FP8 kernel config. Performance might be sub-optimal! Config file not found at /home/mgoin/code/vllm/vllm/model_executor/layers/quantization/utils/configs/N=4096,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +INFO 02-26 21:55:15 [fp8_utils.py:449] Using configuration from /home/mgoin/code/vllm/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json for W8A8 Block FP8 kernel. +WARNING 02-26 21:55:16 [fp8_utils.py:458] Using default W8A8 Block FP8 kernel config. Performance might be sub-optimal! Config file not found at /home/mgoin/code/vllm/vllm/model_executor/layers/quantization/utils/configs/N=18432,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +WARNING 02-26 21:55:17 [fp8_utils.py:458] Using default W8A8 Block FP8 kernel config. Performance might be sub-optimal! Config file not found at /home/mgoin/code/vllm/vllm/model_executor/layers/quantization/utils/configs/N=24576,K=1536,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +INFO 02-26 21:55:17 [fp8_utils.py:449] Using configuration from /home/mgoin/code/vllm/vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json for W8A8 Block FP8 kernel. +INFO 02-26 21:55:17 [fp8_utils.py:449] Using configuration from /home/mgoin/code/vllm/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json for W8A8 Block FP8 kernel. 
+ +===== PERFORMANCE COMPARISON ===== + +DeepGEMM Implementation: ++------+-------+-------+-----------+--------+--------+ +| m | n | k | Time (ฮผs) | TFLOPS | GB/s | ++------+-------+-------+-----------+--------+--------+ +| 8 | 4096 | 7168 | 102.9 | 4.6 | 286.4 | +| 8 | 7168 | 18432 | 70.8 | 29.8 | 1868.8 | +| 8 | 18432 | 7168 | 69.3 | 30.5 | 1911.8 | +| 64 | 4096 | 7168 | 69.1 | 54.4 | 439.0 | +| 64 | 7168 | 18432 | 69.4 | 243.6 | 1933.6 | +| 64 | 18432 | 7168 | 70.4 | 240.3 | 1917.2 | +| 64 | 24576 | 1536 | 70.1 | 68.9 | 584.6 | +| 64 | 32768 | 512 | 68.4 | 31.4 | 307.1 | +| 64 | 7168 | 16384 | 69.5 | 216.3 | 1718.5 | +| 128 | 4096 | 7168 | 141.1 | 53.3 | 222.1 | +| 128 | 7168 | 18432 | 71.9 | 470.5 | 1896.1 | +| 128 | 18432 | 7168 | 69.3 | 488.2 | 1988.2 | +| 1024 | 4096 | 7168 | 89.7 | 670.1 | 502.5 | +| 1024 | 18432 | 7168 | 279.0 | 969.8 | 635.2 | +| 2048 | 4096 | 7168 | 175.1 | 687.0 | 347.4 | +| 4096 | 4096 | 7168 | 335.4 | 717.0 | 275.1 | ++------+-------+-------+-----------+--------+--------+ + +vLLM Triton Implementation: ++------+-------+-------+-----------+--------+--------+--------------+ +| m | n | k | Time (ฮผs) | TFLOPS | GB/s | vs DeepGEMM | ++------+-------+-------+-----------+--------+--------+--------------+ +| 8 | 4096 | 7168 | 74.0 | 6.3 | 398.2 | 1.39x faster | +| 8 | 7168 | 18432 | 89.6 | 23.6 | 1478.1 | 0.79x slower | +| 8 | 18432 | 7168 | 113.2 | 18.7 | 1170.4 | 0.61x slower | +| 64 | 4096 | 7168 | 79.4 | 47.3 | 382.2 | 0.87x slower | +| 64 | 7168 | 18432 | 98.5 | 171.7 | 1363.0 | 0.70x slower | +| 64 | 18432 | 7168 | 119.5 | 141.5 | 1129.4 | 0.59x slower | +| 64 | 24576 | 1536 | 37.6 | 128.4 | 1089.7 | 1.86x faster | +| 64 | 32768 | 512 | 38.7 | 55.5 | 542.6 | 1.77x faster | +| 64 | 7168 | 16384 | 86.1 | 174.5 | 1386.4 | 0.81x slower | +| 128 | 4096 | 7168 | 90.7 | 82.9 | 345.4 | 1.56x faster | +| 128 | 7168 | 18432 | 144.0 | 234.9 | 946.9 | 0.50x slower | +| 128 | 18432 | 7168 | 229.5 | 147.4 | 600.1 | 0.30x slower | +| 1024 | 4096 | 7168 | 242.3 | 248.2 | 186.1 | 0.37x slower | +| 1024 | 18432 | 7168 | 897.8 | 301.4 | 197.4 | 0.31x slower | +| 2048 | 4096 | 7168 | 463.0 | 259.7 | 131.4 | 0.38x slower | +| 4096 | 4096 | 7168 | 901.8 | 266.7 | 102.3 | 0.37x slower | ++------+-------+-------+-----------+--------+--------+--------------+ + +vLLM CUTLASS Implementation: ++------+-------+-------+-----------+--------+--------+--------------+--------------+ +| m | n | k | Time (ฮผs) | TFLOPS | GB/s | vs DeepGEMM | vs Triton | ++------+-------+-------+-----------+--------+--------+--------------+--------------+ +| 8 | 4096 | 7168 | 34.6 | 13.6 | 852.3 | 2.98x faster | 2.14x faster | +| 8 | 7168 | 18432 | 78.9 | 26.8 | 1677.3 | 0.90x slower | 1.13x faster | +| 8 | 18432 | 7168 | 81.2 | 26.0 | 1631.1 | 0.85x slower | 1.39x faster | +| 64 | 4096 | 7168 | 36.9 | 101.9 | 822.9 | 1.87x faster | 2.15x faster | +| 64 | 7168 | 18432 | 87.4 | 193.4 | 1535.2 | 0.79x slower | 1.13x faster | +| 64 | 18432 | 7168 | 85.0 | 199.0 | 1587.6 | 0.83x slower | 1.41x faster | +| 64 | 24576 | 1536 | 28.0 | 172.8 | 1465.8 | 2.51x faster | 1.35x faster | +| 64 | 32768 | 512 | 28.8 | 74.5 | 728.5 | 2.37x faster | 1.34x faster | +| 64 | 7168 | 16384 | 77.9 | 193.0 | 1532.8 | 0.89x slower | 1.11x faster | +| 128 | 4096 | 7168 | 39.1 | 192.4 | 802.0 | 3.61x faster | 2.32x faster | +| 128 | 7168 | 18432 | 93.7 | 360.8 | 1454.2 | 0.77x slower | 1.54x faster | +| 128 | 18432 | 7168 | 85.7 | 394.8 | 1608.0 | 0.81x slower | 2.68x faster | +| 1024 | 4096 | 7168 | 99.7 | 603.1 | 452.2 | 0.90x slower | 
2.43x faster | +| 1024 | 18432 | 7168 | 331.3 | 816.7 | 534.9 | 0.84x slower | 2.71x faster | +| 2048 | 4096 | 7168 | 198.3 | 606.6 | 306.7 | 0.88x slower | 2.34x faster | +| 4096 | 4096 | 7168 | 392.2 | 613.2 | 235.3 | 0.86x slower | 2.30x faster | ++------+-------+-------+-----------+--------+--------+--------------+--------------+ + +===== AVERAGE PERFORMANCE ===== ++----------------+------------+----------+---------------+ +| Implementation | Avg TFLOPS | Avg GB/s | Avg Time (ms) | ++----------------+------------+----------+---------------+ +| DeepGEMM | 310.98 | 1052.10 | 0.11 | +| vLLM Triton | 144.30 | 715.60 | 0.23 | +| vLLM CUTLASS | 286.78 | 1076.67 | 0.11 | ++----------------+------------+----------+---------------+ + +===== AVERAGE SPEEDUPS ===== ++-----------------------------+--------------+ +| Comparison | Speedup | ++-----------------------------+--------------+ +| DeepGEMM vs vLLM Triton | 1.71x faster | +| DeepGEMM vs vLLM CUTLASS | 0.94x slower | +| vLLM CUTLASS vs vLLM Triton | 1.84x faster | ++-----------------------------+--------------+ + +===== ACCURACY COMPARISON ===== ++----------------+-----------------------+ +| Implementation | Avg Diff vs Reference | ++----------------+-----------------------+ +| DeepGEMM | 0.000684 | +| vLLM Triton | 0.000684 | +| vLLM CUTLASS | 0.000684 | ++----------------+-----------------------+ +``` diff --git a/benchmarks/kernels/deepgemm/benchmark_fp8_block_dense_gemm.py b/benchmarks/kernels/deepgemm/benchmark_fp8_block_dense_gemm.py new file mode 100644 index 000000000000..7892f126e7d6 --- /dev/null +++ b/benchmarks/kernels/deepgemm/benchmark_fp8_block_dense_gemm.py @@ -0,0 +1,464 @@ +# SPDX-License-Identifier: Apache-2.0 +# fmt: off +# ruff: noqa: E501 +import time + +# Import DeepGEMM functions +import deep_gemm +import torch +import triton +from deep_gemm import calc_diff, ceil_div, get_col_major_tma_aligned_tensor + +# Import vLLM functions +from vllm import _custom_ops as ops +from vllm.model_executor.layers.quantization.utils.fp8_utils import ( + per_token_group_quant_fp8, w8a8_block_fp8_matmul) + + +# Copied from +# https://github.com/deepseek-ai/DeepGEMM/blob/78cacf70d41d15d688bd493ebc85845f7f2a3d5d/tests/test_core.py#L9 +def per_token_cast_to_fp8( + x: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]: + """Convert tensor to FP8 format with per-token scaling.""" + assert x.dim() == 2 and x.size(1) % 128 == 0 + m, n = x.shape + x_view = x.view(m, -1, 128) + x_amax = x_view.abs().float().amax(dim=2).view(m, -1).clamp(1e-4) + return (x_view * (448.0 / x_amax.unsqueeze(2))).to( + torch.float8_e4m3fn).view(m, n), (x_amax / 448.0).view(m, -1) + + +# Copied from +# https://github.com/deepseek-ai/DeepGEMM/blob/78cacf70d41d15d688bd493ebc85845f7f2a3d5d/tests/test_core.py#L17 +def per_block_cast_to_fp8( + x: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]: + """Convert tensor to FP8 format with per-block scaling.""" + assert x.dim() == 2 + m, n = x.shape + x_padded = torch.zeros((ceil_div(m, 128) * 128, ceil_div(n, 128) * 128), + dtype=x.dtype, + device=x.device) + x_padded[:m, :n] = x + x_view = x_padded.view(-1, 128, x_padded.size(1) // 128, 128) + x_amax = x_view.abs().float().amax(dim=(1, 3), keepdim=True).clamp(1e-4) + x_scaled = (x_view * (448.0 / x_amax)).to(torch.float8_e4m3fn) + return x_scaled.view_as(x_padded)[:m, :n].contiguous(), ( + x_amax / 448.0).view(x_view.size(0), x_view.size(2)) + + +def benchmark_shape(m: int, + n: int, + k: int, + warmup: int = 100, + repeat: int = 10000, + verbose: bool = 
False) -> dict: + """Benchmark all implementations for a specific (m, n, k) shape.""" + if verbose: + print(f"\n=== Benchmarking shape: m={m}, n={n}, k={k} ===") + + # Create test tensors + A = torch.randn((m, k), device='cuda', dtype=torch.bfloat16) + B = torch.randn((n, k), device='cuda', dtype=torch.bfloat16) + + # Reference result in BF16 + torch.cuda.synchronize() + C_ref = A @ B.t() + + # Pre-quantize B for all implementations + # (weights can be pre-quantized offline) + B_deepgemm, B_scale_deepgemm = per_block_cast_to_fp8(B) + B_vllm, B_scale_vllm = per_block_cast_to_fp8(B) + + # Block size configuration + block_size = [128, 128] + + # Pre-quantize A for all implementations + A_deepgemm, A_scale_deepgemm = per_token_cast_to_fp8(A) + A_scale_deepgemm = get_col_major_tma_aligned_tensor(A_scale_deepgemm) + C_deepgemm = torch.empty((m, n), device='cuda', dtype=torch.bfloat16) + A_vllm, A_scale_vllm = per_token_group_quant_fp8(A, block_size[1]) + A_vllm_cutlass, A_scale_vllm_cutlass = per_token_group_quant_fp8( + A, block_size[1], column_major_scales=True) + + # === DeepGEMM Implementation === + def deepgemm_gemm(): + # A quantization is inside the loop as it depends on activations + # A_deepgemm, A_scale_deepgemm = per_token_cast_to_fp8(A) + # A_deepgemm, A_scale_deepgemm = per_token_group_quant_fp8( + # A, block_size[1]) + # A_scale_aligned = get_col_major_tma_aligned_tensor(A_scale_deepgemm) + # C_deepgemm = torch.empty((m, n), device='cuda', dtype=torch.bfloat16) + deep_gemm.gemm_fp8_fp8_bf16_nt((A_deepgemm, A_scale_deepgemm), + (B_deepgemm, B_scale_deepgemm), + C_deepgemm) + return C_deepgemm + + # === vLLM Triton Implementation === + def vllm_triton_gemm(): + # A quantization is inside the loop as it depends on activations + # A_vllm, A_scale_vllm = per_token_group_quant_fp8(A, block_size[1]) + return w8a8_block_fp8_matmul(A_vllm, + B_vllm, + A_scale_vllm, + B_scale_vllm, + block_size, + output_dtype=torch.bfloat16) + + # === vLLM CUTLASS Implementation === + def vllm_cutlass_gemm(): + # A quantization is inside the loop as it depends on activations + # A_vllm_cutlass, A_scale_vllm_cutlass = per_token_group_quant_fp8( + # A, block_size[1], column_major_scales=True) + return ops.cutlass_scaled_mm(A_vllm_cutlass, + B_vllm.T, + scale_a=A_scale_vllm_cutlass, + scale_b=B_scale_vllm.T, + out_dtype=torch.bfloat16) + + # Run correctness check first + if verbose: + print("Running correctness check...") + C_deepgemm = deepgemm_gemm() + C_vllm_triton = vllm_triton_gemm() + C_vllm_cutlass = vllm_cutlass_gemm() + + deepgemm_diff = calc_diff(C_deepgemm, C_ref) + vllm_triton_diff = calc_diff(C_vllm_triton, C_ref) + vllm_cutlass_diff = calc_diff(C_vllm_cutlass, C_ref) + + if verbose: + print(f"DeepGEMM vs Reference difference: {deepgemm_diff:.6f}") + print(f"vLLM Triton vs Reference difference: {vllm_triton_diff:.6f}") + print(f"vLLM CUTLASS vs Reference difference: {vllm_cutlass_diff:.6f}") + print("vLLM Triton vs DeepGEMM difference: " + f"{calc_diff(C_vllm_triton, C_deepgemm):.6f}") + print("vLLM CUTLASS vs DeepGEMM difference: " + f"{calc_diff(C_vllm_cutlass, C_deepgemm):.6f}") + + # Benchmark implementations + implementations = { + "DeepGEMM": deepgemm_gemm, + "vLLM Triton": vllm_triton_gemm, + "vLLM CUTLASS": vllm_cutlass_gemm + } + + benchmark_results = { + "shape": { + "m": m, + "n": n, + "k": k + }, + "implementations": {} + } + + for name, func in implementations.items(): + # Warmup + for _ in range(warmup): + func() + torch.cuda.synchronize() + + # Timing loop + torch.cuda.synchronize() 
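+ # Launch the kernel `repeat` times back-to-back, then synchronize so the end timestamp accounts for all queued work.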
+ start = time.time() + for _ in range(repeat): + func() + torch.cuda.synchronize() + end = time.time() + + # Calculate timing and TFLOPS + avg_time_ms = (end - start) / repeat * 1000 + avg_time_us = avg_time_ms * 1000 + tflops = 2 * m * n * k / (avg_time_ms * 1e-3) / 1e12 + gb_s = (m * k + k * n + m * n * 2) / 1e9 / (avg_time_ms * 1e-3) + + benchmark_results["implementations"][name] = { + "time_ms": avg_time_ms, + "time_us": avg_time_us, + "tflops": tflops, + "gb_s": gb_s, + "diff": { + "DeepGEMM": + 0.0 if name == "DeepGEMM" else calc_diff(func(), C_deepgemm), + "Reference": + deepgemm_diff if name == "DeepGEMM" else + (vllm_triton_diff + if name == "vLLM Triton" else vllm_cutlass_diff) + } + } + + if verbose: + print( + f"{name}: {avg_time_ms:.3f} ms, {tflops:.2f} TFLOPS, {gb_s:.2f} GB/s" + ) + + # Calculate speedups + baseline = benchmark_results["implementations"]["DeepGEMM"]["time_ms"] + for name, data in benchmark_results["implementations"].items(): + if name != "DeepGEMM": + speedup = baseline / data["time_ms"] + benchmark_results["implementations"][name][ + "speedup_vs_deepgemm"] = speedup + if verbose: + print(f"DeepGEMM is {1/speedup:.2f}x " + f"{'faster' if 1/speedup > 1 else 'slower'} than {name}") + + vllm_triton_time = benchmark_results["implementations"]["vLLM Triton"][ + "time_ms"] + vllm_cutlass_time = benchmark_results["implementations"]["vLLM CUTLASS"][ + "time_ms"] + cutlass_vs_triton = vllm_triton_time / vllm_cutlass_time + benchmark_results["implementations"]["vLLM CUTLASS"][ + "speedup_vs_triton"] = cutlass_vs_triton + if verbose: + print( + f"vLLM CUTLASS is {cutlass_vs_triton:.2f}x " + f"{'faster' if cutlass_vs_triton > 1 else 'slower'} than vLLM Triton" + ) + + return benchmark_results + + +def format_table_row(values, widths): + """Format a row with specified column widths.""" + return "| " + " | ".join(f"{val:{w}}" + for val, w in zip(values, widths)) + " |" + + +def print_table(headers, rows, title=None): + """Print a table with headers and rows.""" + if title: + print(f"\n{title}") + + # Calculate column widths based on headers and data + widths = [ + max(len(str(h)), max(len(str(row[i])) for row in rows)) + for i, h in enumerate(headers) + ] + + # Create separator line + separator = "+-" + "-+-".join("-" * w for w in widths) + "-+" + + # Print table + print(separator) + print(format_table_row(headers, widths)) + print(separator) + for row in rows: + print(format_table_row(row, widths)) + print(separator) + + +def format_speedup(value): + """Format speedup value with indicator if it's faster or slower.""" + return f"{value:.2f}x {'faster' if value > 1.0 else 'slower'}" + + +def run_benchmarks(verbose: bool = False): + """Run benchmarks for a set of common shapes.""" + print("===== STARTING FP8 GEMM BENCHMARK =====") + + # Make sure we're using the GPU + if not torch.cuda.is_available(): + print("CUDA not available! 
Tests require GPU.") + return + + # Print system information + print(f"PyTorch version: {torch.__version__}") + print(f"CUDA version: {torch.version.cuda}") + print(f"Triton version: {triton.__version__}") + print(f"Using device: {torch.cuda.get_device_name()}") + + # Enable TF32 for better performance + torch.backends.cuda.matmul.allow_tf32 = True + torch.backends.cudnn.allow_tf32 = True + + # Set seeds for reproducibility + torch.manual_seed(42) + torch.cuda.manual_seed(42) + + # Define benchmark shapes (m, n, k) + shapes = [ + (8, 4096, 7168), + (8, 7168, 18432), + (8, 18432, 7168), + (64, 4096, 7168), + (64, 7168, 18432), + (64, 18432, 7168), + (64, 24576, 1536), + (64, 32768, 512), + (64, 7168, 16384), + (128, 4096, 7168), + (128, 7168, 18432), + (128, 18432, 7168), + (1024, 4096, 7168), + (1024, 18432, 7168), + (2048, 4096, 7168), + (4096, 4096, 7168), + ] + shapes = [ + # (64, 2112, 7168), + (64, 24576, 1536), + (64, 32768, 512), + (64, 7168, 16384), + (64, 4096, 7168), + (64, 7168, 2048), + # (128, 2112, 7168), + (128, 24576, 1536), + (128, 32768, 512), + (128, 7168, 16384), + (128, 4096, 7168), + (128, 7168, 2048), + # (4096, 2112, 7168), + (4096, 24576, 1536), + (4096, 32768, 512), + (4096, 7168, 16384), + (4096, 4096, 7168), + (4096, 7168, 2048), + ] + + all_results = [] + for m, n, k in shapes: + result = benchmark_shape(m, n, k, verbose=verbose) + all_results.append(result) + + # Print results in a nicely formatted table + print("\n===== PERFORMANCE COMPARISON =====") + + # Print DeepGEMM table + deepgemm_headers = ["m", "n", "k", "Time (ฮผs)", "TFLOPS", "GB/s"] + deepgemm_rows = [] + for result in all_results: + shape = result["shape"] + impl_data = result["implementations"]["DeepGEMM"] + deepgemm_rows.append([ + shape["m"], shape["n"], shape["k"], f"{impl_data['time_us']:.1f}", + f"{impl_data['tflops']:.1f}", f"{impl_data['gb_s']:.1f}" + ]) + + print_table(deepgemm_headers, + deepgemm_rows, + title="DeepGEMM Implementation:") + + # Print vLLM Triton table + triton_headers = [ + "m", "n", "k", "Time (ฮผs)", "TFLOPS", "GB/s", "vs DeepGEMM" + ] + triton_rows = [] + for result in all_results: + shape = result["shape"] + impl_data = result["implementations"]["vLLM Triton"] + speedup = impl_data.get("speedup_vs_deepgemm", 1.0) + triton_rows.append([ + shape["m"], shape["n"], shape["k"], f"{impl_data['time_us']:.1f}", + f"{impl_data['tflops']:.1f}", f"{impl_data['gb_s']:.1f}", + format_speedup(speedup) + ]) + + print_table(triton_headers, + triton_rows, + title="vLLM Triton Implementation:") + + # Print vLLM CUTLASS table + cutlass_headers = [ + "m", "n", "k", "Time (ฮผs)", "TFLOPS", "GB/s", "vs DeepGEMM", + "vs Triton" + ] + cutlass_rows = [] + for result in all_results: + shape = result["shape"] + impl_data = result["implementations"]["vLLM CUTLASS"] + vs_deepgemm = impl_data.get("speedup_vs_deepgemm", 1.0) + vs_triton = impl_data.get("speedup_vs_triton", 1.0) + cutlass_rows.append([ + shape["m"], shape["n"], shape["k"], f"{impl_data['time_us']:.1f}", + f"{impl_data['tflops']:.1f}", f"{impl_data['gb_s']:.1f}", + format_speedup(vs_deepgemm), + format_speedup(vs_triton) + ]) + + print_table(cutlass_headers, + cutlass_rows, + title="vLLM CUTLASS Implementation:") + + # Calculate and print averages + print("\n===== AVERAGE PERFORMANCE =====") + + implementations = ["DeepGEMM", "vLLM Triton", "vLLM CUTLASS"] + avg_metrics = { + impl: { + "tflops": 0, + "gb_s": 0, + "time_ms": 0 + } + for impl in implementations + } + + for result in all_results: + for impl in implementations: + 
impl_data = result["implementations"][impl] + avg_metrics[impl]["tflops"] += impl_data["tflops"] + avg_metrics[impl]["gb_s"] += impl_data["gb_s"] + avg_metrics[impl]["time_ms"] += impl_data["time_ms"] + + num_shapes = len(all_results) + avg_headers = ["Implementation", "Avg TFLOPS", "Avg GB/s", "Avg Time (ms)"] + avg_rows = [] + + for impl in implementations: + avg_tflops = avg_metrics[impl]["tflops"] / num_shapes + avg_mem_bw = avg_metrics[impl]["gb_s"] / num_shapes + avg_time = avg_metrics[impl]["time_ms"] / num_shapes + avg_rows.append([ + impl, f"{avg_tflops:.2f}", f"{avg_mem_bw:.2f}", f"{avg_time:.2f}" + ]) + + print_table(avg_headers, avg_rows) + + # Calculate average speedups + avg_speedups = { + "DeepGEMM vs vLLM Triton": 0, + "DeepGEMM vs vLLM CUTLASS": 0, + "vLLM CUTLASS vs vLLM Triton": 0 + } + + for result in all_results: + deepgemm_time = result["implementations"]["DeepGEMM"]["time_ms"] + vllm_triton_time = result["implementations"]["vLLM Triton"]["time_ms"] + vllm_cutlass_time = result["implementations"]["vLLM CUTLASS"][ + "time_ms"] + + avg_speedups[ + "DeepGEMM vs vLLM Triton"] += vllm_triton_time / deepgemm_time + avg_speedups[ + "DeepGEMM vs vLLM CUTLASS"] += vllm_cutlass_time / deepgemm_time + avg_speedups[ + "vLLM CUTLASS vs vLLM Triton"] += vllm_triton_time / vllm_cutlass_time + + print("\n===== AVERAGE SPEEDUPS =====") + speedup_headers = ["Comparison", "Speedup"] + speedup_rows = [] + for comparison, total in avg_speedups.items(): + avg_speedup = total / num_shapes + status = "faster" if avg_speedup > 1 else "slower" + speedup_rows.append([comparison, f"{avg_speedup:.2f}x {status}"]) + + print_table(speedup_headers, speedup_rows) + + # Average accuracy comparison + print("\n===== ACCURACY COMPARISON =====") + avg_diff = {impl: 0 for impl in implementations} + + for result in all_results: + for impl in implementations: + avg_diff[impl] += result["implementations"][impl]["diff"][ + "Reference"] + + diff_headers = ["Implementation", "Avg Diff vs Reference"] + diff_rows = [] + for impl in implementations: + diff_rows.append([impl, f"{avg_diff[impl] / num_shapes:.6f}"]) + + print_table(diff_headers, diff_rows) + + +if __name__ == "__main__": + run_benchmarks(verbose=False) diff --git a/benchmarks/run_structured_output_benchmark.sh b/benchmarks/run_structured_output_benchmark.sh new file mode 100755 index 000000000000..8a777320f735 --- /dev/null +++ b/benchmarks/run_structured_output_benchmark.sh @@ -0,0 +1,64 @@ +#!/bin/bash + +# Define the model to use +MODEL=${1:-"Qwen/Qwen2.5-7B-Instruct"} + +# Define the backend to use +BACKEND=${2:-"vllm"} + +# Define the dataset to use +DATASET=${3:-"xgrammar_bench"} + +# Define the guided decoding backend +GUIDED_BACKEND=${4:-"xgrammar"} + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +OUTPUT_DIR=${5:-"$SCRIPT_DIR/structured_output_benchmark_results"} + +GUIDED_RATIO=${6:-0.5} + +# Create output directory if it doesn't exist +mkdir -p "$OUTPUT_DIR" + +# Define QPS values to test +QPS_VALUES=(70 60 50 25 20 15 10) + +# Common parameters +COMMON_PARAMS="--backend $BACKEND \ + --model $MODEL \ + --dataset $DATASET \ + --structured-output-backend $GUIDED_BACKEND \ + --structured-output-ratio $GUIDED_RATIO \ + --save-results \ + --result-dir $OUTPUT_DIR" + +echo "Starting structured output benchmark with model: $MODEL" +echo "Backend: $BACKEND" +echo "Dataset: $DATASET" +echo "Structured output backend: $GUIDED_BACKEND" +echo "Results will be saved to: $OUTPUT_DIR" +echo "----------------------------------------" + +# Run 
benchmarks with different QPS values +for qps in "${QPS_VALUES[@]}"; do + echo "Running benchmark with QPS: $qps" + + # Get git hash and branch for the filename + GIT_HASH=$(git rev-parse --short HEAD 2>/dev/null || echo "unknown") + GIT_BRANCH=$(git rev-parse --abbrev-ref HEAD 2>/dev/null || echo "unknown") + + # Construct filename for this run + FILENAME="${GUIDED_BACKEND}_${BACKEND}_${qps}qps_$(basename $MODEL)_${DATASET}_${GIT_HASH}.json" + + # Run the benchmark + python "$SCRIPT_DIR/benchmark_serving_structured_output.py" $COMMON_PARAMS \ + --request-rate $qps \ + --result-filename "$FILENAME" \ + --port ${PORT:-8000} + + echo "Completed benchmark with QPS: $qps" + echo "----------------------------------------" +done + +echo "All benchmarks completed!" +echo "Results saved to: $OUTPUT_DIR" diff --git a/benchmarks/structured_schemas/structured_schema_1.json b/benchmarks/structured_schemas/structured_schema_1.json index 6003698469e8..1bd189c9e704 100644 --- a/benchmarks/structured_schemas/structured_schema_1.json +++ b/benchmarks/structured_schemas/structured_schema_1.json @@ -1,113 +1,25 @@ { - "$schema": - "https://json-schema.org/draft/2020-12/schema", - "title": - "User Profile", - "type": - "object", + "type": "array", + "items": { + "type": "object", "properties": { - "userId": { - "type": "string", - "description": "Unique identifier for the user." - }, - "personalInfo": { - "type": "object", - "properties": { - "firstName": { - "type": "string", - "description": "The user's first name." - }, - "lastName": { - "type": "string", - "description": "The user's last name." - }, - "age": { - "type": "integer", - "minimum": 0, - "description": "The user's age." - }, - "phoneNumbers": { - "type": - "array", - "items": { - "type": "object", - "properties": { - "type": { - "type": "string", - "enum": ["home", "work", "mobile"], - "description": "Type of phone number." - }, - "number": { - "type": "string", - "pattern": "^\\+?[1-9]\\d{1,14}$", - "description": "Phone number in E.164 format." - } - }, - "required": ["type", "number"] - }, - "description": - "List of phone numbers associated with the user." - } - }, - "required": ["firstName", "lastName"] - }, - "address": { - "type": "object", - "properties": { - "street": { - "type": "string", - "description": "Street address." - }, - "city": { - "type": "string", - "description": "City name." - }, - "state": { - "type": "string", - "description": "State or province." - }, - "postalCode": { - "type": "string", - "pattern": "^\\d{5}(-\\d{4})?$", - "description": "Postal code." - }, - "country": { - "type": "string", - "description": "Country name." - } - }, - "required": ["street", "city", "state", "postalCode", "country"] - }, - "preferences": { - "type": "object", - "properties": { - "newsletterSubscribed": { - "type": - "boolean", - "description": - "Indicates if the user is subscribed to the newsletter." - }, - "favoriteCategories": { - "type": "array", - "items": { - "type": "string" - }, - "description": "List of user's favorite categories." - } - }, - "required": ["newsletterSubscribed"] - }, - "accountStatus": { - "type": "string", - "enum": ["active", "inactive", "suspended"], - "description": "Current status of the user's account." - }, - "registrationDate": { - "type": "string", - "format": "date-time", - "description": "ISO 8601 formatted date-time of user registration." 
- } + "name": { "type": "string" }, + "race": { "type": "string" }, + "class": { "type": "string" }, + "level": { "type": "integer" }, + "background": { "type": "string" }, + "alignment": { "type": "string" }, + "backstory": { "type": "string" } }, - "required": - ["userId", "personalInfo", "address", "accountStatus", "registrationDate"] -} \ No newline at end of file + "required": [ + "name", + "race", + "class", + "level", + "background", + "alignment", + "backstory" + ] + } +} + diff --git a/cmake/cpu_extension.cmake b/cmake/cpu_extension.cmake index 714abca2a5ff..ca2ffb1bc3c8 100644 --- a/cmake/cpu_extension.cmake +++ b/cmake/cpu_extension.cmake @@ -81,6 +81,7 @@ else() find_isa(${CPUINFO} "POWER9" POWER9_FOUND) find_isa(${CPUINFO} "asimd" ASIMD_FOUND) # Check for ARM NEON support find_isa(${CPUINFO} "bf16" ARM_BF16_FOUND) # Check for ARM BF16 support + find_isa(${CPUINFO} "S390" S390_FOUND) endif() @@ -129,8 +130,16 @@ elseif (ASIMD_FOUND) elseif(APPLE_SILICON_FOUND) message(STATUS "Apple Silicon Detected") set(ENABLE_NUMA OFF) +elseif (S390_FOUND) + message(STATUS "S390 detected") + # Check for S390 VXE support + list(APPEND CXX_COMPILE_FLAGS + "-mvx" + "-mzvector" + "-march=native" + "-mtune=native") else() - message(FATAL_ERROR "vLLM CPU backend requires AVX512, AVX2, Power9+ ISA or ARMv8 support.") + message(FATAL_ERROR "vLLM CPU backend requires AVX512, AVX2, Power9+ ISA, S390X ISA or ARMv8 support.") endif() # diff --git a/cmake/external_projects/vllm_flash_attn.cmake b/cmake/external_projects/vllm_flash_attn.cmake index ef6261fa6d9b..f2d01099097a 100644 --- a/cmake/external_projects/vllm_flash_attn.cmake +++ b/cmake/external_projects/vllm_flash_attn.cmake @@ -38,7 +38,7 @@ else() FetchContent_Declare( vllm-flash-attn GIT_REPOSITORY https://github.com/vllm-project/flash-attention.git - GIT_TAG 720c94869cf2e0ff5a706e9c7f1dce0939686ade + GIT_TAG 9bfa9869829d8c593527eb34c5271d0090f7ccc9 GIT_PROGRESS TRUE # Don't share the vllm-flash-attn build between build types BINARY_DIR ${CMAKE_BINARY_DIR}/vllm-flash-attn @@ -64,4 +64,4 @@ install( DESTINATION vllm_flash_attn COMPONENT _vllm_fa3_C FILES_MATCHING PATTERN "*.py" -) \ No newline at end of file +) diff --git a/csrc/cpu/attention.cpp b/csrc/cpu/attention.cpp index b9764056e8a2..0257d8ff16ba 100644 --- a/csrc/cpu/attention.cpp +++ b/csrc/cpu/attention.cpp @@ -24,8 +24,8 @@ struct KernelVecType { template <> struct KernelVecType { -#ifdef __powerpc64__ - // Power architecture-specific vector types +#if defined(__powerpc64__) || defined(__s390x__) + // Power and s390x architecture-specific vector types using q_load_vec_type = vec_op::FP32Vec8; using k_load_vec_type = vec_op::FP32Vec16; using v_load_vec_type = vec_op::FP32Vec16; diff --git a/csrc/cpu/cpu_types.hpp b/csrc/cpu/cpu_types.hpp index a71815106133..17bbe04eef94 100644 --- a/csrc/cpu/cpu_types.hpp +++ b/csrc/cpu/cpu_types.hpp @@ -7,6 +7,9 @@ #elif defined(__POWER9_VECTOR__) // ppc implementation #include "cpu_types_vsx.hpp" +#elif defined(__s390x__) + // s390 implementation + #include "cpu_types_vxe.hpp" #elif defined(__aarch64__) // arm implementation #include "cpu_types_arm.hpp" diff --git a/csrc/cpu/cpu_types_vxe.hpp b/csrc/cpu/cpu_types_vxe.hpp new file mode 100644 index 000000000000..ab8cbbbf4ec4 --- /dev/null +++ b/csrc/cpu/cpu_types_vxe.hpp @@ -0,0 +1,480 @@ + +#ifndef CPU_TYPES_VXE_HPP +#define CPU_TYPES_VXE_HPP + +#include +#include +#include +namespace vec_op { + +#define vec_neg(a) (-(a)) +#define vec_add(a, b) ((a) + (b)) +#define vec_sub(a, b) ((a) - (b)) 
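+// The remaining arithmetic and shift helpers below wrap the compiler's element-wise vector operators in the same way.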
+#define vec_mul(a, b) ((a) * (b)) +#define vec_div(a, b) ((a) / (b)) +#define vec_sr(a, b) ((a) >> (b)) // Vector Shift Right Algebaic +#define vec_sl(a, b) ((a) << (b)) // Vector Shift Left + +// FIXME: FP16 is not fully supported in Torch-CPU +#define VLLM_DISPATCH_CASE_FLOATING_TYPES(...) \ + AT_DISPATCH_CASE(at::ScalarType::Float, __VA_ARGS__) \ + AT_DISPATCH_CASE(at::ScalarType::BFloat16, __VA_ARGS__) + +#define VLLM_DISPATCH_FLOATING_TYPES(TYPE, NAME, ...) \ + AT_DISPATCH_SWITCH(TYPE, NAME, VLLM_DISPATCH_CASE_FLOATING_TYPES(__VA_ARGS__)) + +#ifndef CPU_OP_GUARD + #define CPU_KERNEL_GUARD_IN(NAME) + #define CPU_KERNEL_GUARD_OUT(NAME) +#else + #define CPU_KERNEL_GUARD_IN(NAME) \ + std::cout << #NAME << " invoked." << std::endl; + #define CPU_KERNEL_GUARD_OUT(NAME) \ + std::cout << #NAME << " exit." << std::endl; +#endif + +#define FORCE_INLINE __attribute__((always_inline)) inline + +namespace { +template +constexpr void unroll_loop_item(std::integer_sequence, F&& f) { + (f(std::integral_constant{}), ...); +} +}; // namespace + +template >> +constexpr void unroll_loop(F&& f) { + unroll_loop_item(std::make_integer_sequence{}, std::forward(f)); +} + +template +struct Vec { + constexpr static int get_elem_num() { return T::VEC_ELEM_NUM; } +}; + +typedef struct ss16x8x2_t { + __vector signed short val[2]; +} ss16x8x2_t; + +typedef struct ss16x8x4_t { + __vector signed short val[4]; +} ss16x8x4_t; + +typedef struct f32x4x2_t { + __vector float val[2]; +} f32x4x2_t; + +typedef struct f32x4x4_t { + __vector float val[4]; +} f32x4x4_t; + +struct FP32Vec8; +struct FP32Vec16; + +struct BF16Vec8 : public Vec { + constexpr static int VEC_ELEM_NUM = 8; + + __vector signed short reg; + + explicit BF16Vec8(const void* ptr) : reg(*(__vector signed short*)ptr) {} + explicit BF16Vec8(const FP32Vec8&); + + void save(void* ptr) const { + *reinterpret_cast<__vector signed short*>(ptr) = reg; + } +}; + +struct BF16Vec16 : public Vec { + constexpr static int VEC_ELEM_NUM = 16; + + ss16x8x2_t reg; + + explicit BF16Vec16(const void* ptr) { + // Load 256 bits in two parts + reg.val[0] = (__vector signed short)vec_xl(0, (signed short*)ptr); + reg.val[1] = (__vector signed short)vec_xl(16, (signed short*)ptr); + } + + explicit BF16Vec16(const FP32Vec16&); + + void save(void* ptr) const { + // Save 256 bits in two parts + vec_xst(reg.val[0], 0, (signed short*)ptr); + vec_xst(reg.val[1], 16, (signed short*)ptr); + } +}; + +const static __vector signed short zero = vec_splats((signed short)0); + +struct BF16Vec32 : public Vec { + constexpr static int VEC_ELEM_NUM = 32; + + ss16x8x4_t reg; + explicit BF16Vec32(const void* ptr) + : reg(*reinterpret_cast(ptr)) {} + + explicit BF16Vec32(ss16x8x4_t data) : reg(data) {} + + explicit BF16Vec32(const BF16Vec8& vec8_data) + : reg({vec8_data.reg, vec8_data.reg, vec8_data.reg, vec8_data.reg}) {} + + void save(void* ptr) const { *reinterpret_cast(ptr) = reg; } +}; + +struct FP32Vec4 : public Vec { + constexpr static int VEC_ELEM_NUM = 4; + union AliasReg { + __vector float reg; + float values[VEC_ELEM_NUM]; + }; + + __vector float reg; + + explicit FP32Vec4(float v) : reg(vec_splats(v)) {} + + explicit FP32Vec4() : reg(vec_splats(0.0f)) {} + + explicit FP32Vec4(const float* ptr) : reg(vec_xl(0, ptr)) {} + + explicit FP32Vec4(__vector float data) : reg(data) {} + + explicit FP32Vec4(const FP32Vec4& data) : reg(data.reg) {} +}; + +struct FP32Vec8 : public Vec { + constexpr static int VEC_ELEM_NUM = 8; + union AliasReg { + f32x4x2_t reg; + float values[VEC_ELEM_NUM]; + }; + + 
f32x4x2_t reg; + + explicit FP32Vec8(float v) { + reg.val[0] = vec_splats(v); + reg.val[1] = vec_splats(v); + } + + explicit FP32Vec8() { + reg.val[0] = vec_splats(0.0f); + reg.val[1] = vec_splats(0.0f); + } + + explicit FP32Vec8(const float* ptr) { + reg.val[0] = vec_xl(0, ptr); + reg.val[1] = vec_xl(16, ptr); + } + + explicit FP32Vec8(f32x4x2_t data) : reg(data) {} + + explicit FP32Vec8(const FP32Vec8& data) { + reg.val[0] = data.reg.val[0]; + reg.val[1] = data.reg.val[1]; + } + + explicit FP32Vec8(const BF16Vec8& v) { + reg.val[0] = (__vector float)vec_mergeh(zero, v.reg); + reg.val[1] = (__vector float)vec_mergel(zero, v.reg); + } + + float reduce_sum() const { + AliasReg ar; + ar.reg = reg; + float result = 0; + unroll_loop( + [&result, &ar](int i) { result += ar.values[i]; }); + + return result; + } + + FP32Vec8 exp() const { + // TODO: Vectorize this + AliasReg ar; + ar.reg = reg; + f32x4x4_t ret; + ret.val[0][0] = std::exp(ar.values[0]); + ret.val[0][1] = std::exp(ar.values[1]); + ret.val[0][2] = std::exp(ar.values[2]); + ret.val[0][3] = std::exp(ar.values[3]); + ret.val[1][0] = std::exp(ar.values[4]); + ret.val[1][1] = std::exp(ar.values[5]); + ret.val[1][2] = std::exp(ar.values[6]); + ret.val[1][3] = std::exp(ar.values[7]); + return FP32Vec8(f32x4x2_t({ret.val[0], ret.val[1]})); + } + + FP32Vec8 tanh() const { + // TODO: Vectorize this + AliasReg ar; + ar.reg = reg; + f32x4x4_t ret; + ret.val[0][0] = std::tanh(ar.values[0]); + ret.val[0][1] = std::tanh(ar.values[1]); + ret.val[0][2] = std::tanh(ar.values[2]); + ret.val[0][3] = std::tanh(ar.values[3]); + ret.val[1][0] = std::tanh(ar.values[4]); + ret.val[1][1] = std::tanh(ar.values[5]); + ret.val[1][2] = std::tanh(ar.values[6]); + ret.val[1][3] = std::tanh(ar.values[7]); + return FP32Vec8(f32x4x2_t({ret.val[0], ret.val[1]})); + } + + FP32Vec8 er() const { + // TODO: Vectorize this + AliasReg ar; + ar.reg = reg; + f32x4x4_t ret; + ret.val[0][0] = std::erf(ar.values[0]); + ret.val[0][1] = std::erf(ar.values[1]); + ret.val[0][2] = std::erf(ar.values[2]); + ret.val[0][3] = std::erf(ar.values[3]); + ret.val[1][0] = std::erf(ar.values[4]); + ret.val[1][1] = std::erf(ar.values[5]); + ret.val[1][2] = std::erf(ar.values[6]); + ret.val[1][3] = std::erf(ar.values[7]); + return FP32Vec8(f32x4x2_t({ret.val[0], ret.val[1]})); + } + + FP32Vec8 operator*(const FP32Vec8& b) const { + return FP32Vec8( + {vec_mul(reg.val[0], b.reg.val[0]), vec_mul(reg.val[1], b.reg.val[1])}); + } + + FP32Vec8 operator+(const FP32Vec8& b) const { + return FP32Vec8( + {vec_add(reg.val[0], b.reg.val[0]), vec_add(reg.val[1], b.reg.val[1])}); + } + + FP32Vec8 operator-(const FP32Vec8& b) const { + return FP32Vec8( + {vec_sub(reg.val[0], b.reg.val[0]), vec_sub(reg.val[1], b.reg.val[1])}); + } + + FP32Vec8 operator/(const FP32Vec8& b) const { + return FP32Vec8( + {vec_div(reg.val[0], b.reg.val[0]), vec_div(reg.val[1], b.reg.val[1])}); + } + + void save(float* ptr) const { + vec_xst(reg.val[0], 0, ptr); + vec_xst(reg.val[1], 16, ptr); + } +}; + +struct FP32Vec16 : public Vec { + constexpr static int VEC_ELEM_NUM = 16; + union AliasReg { + f32x4x4_t reg; + float values[VEC_ELEM_NUM]; + }; + + f32x4x4_t reg; + + explicit FP32Vec16(float v) { + reg.val[0] = vec_splats(v); + reg.val[1] = vec_splats(v); + reg.val[2] = vec_splats(v); + reg.val[3] = vec_splats(v); + } + + explicit FP32Vec16() { + reg.val[0] = vec_splats(0.0f); + reg.val[1] = vec_splats(0.0f); + reg.val[2] = vec_splats(0.0f); + reg.val[3] = vec_splats(0.0f); + } + + explicit FP32Vec16(const float* ptr) { + 
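+    // Load 64 bytes (16 floats) as four 128-bit chunks at byte offsets
+    // 0, 16, 32 and 48.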
reg.val[0] = vec_xl(0, ptr); + reg.val[1] = vec_xl(16, ptr); + reg.val[2] = vec_xl(32, ptr); + reg.val[3] = vec_xl(48, ptr); + } + + explicit FP32Vec16(f32x4x4_t data) : reg(data) {} + + explicit FP32Vec16(const FP32Vec16& data) { + reg.val[0] = data.reg.val[0]; + reg.val[1] = data.reg.val[1]; + reg.val[2] = data.reg.val[2]; + reg.val[3] = data.reg.val[3]; + } + + explicit FP32Vec16(const FP32Vec4& data) { + reg.val[0] = data.reg; + reg.val[1] = data.reg; + reg.val[2] = data.reg; + reg.val[3] = data.reg; + } + + explicit FP32Vec16(const FP32Vec8& data) { + reg.val[0] = data.reg.val[0]; + reg.val[1] = data.reg.val[1]; + reg.val[2] = data.reg.val[0]; + reg.val[3] = data.reg.val[1]; + } + + explicit FP32Vec16(const BF16Vec16& v) { + reg.val[0] = (__vector float)vec_mergeh(zero, v.reg.val[0]); + reg.val[1] = (__vector float)vec_mergel(zero, v.reg.val[0]); + reg.val[2] = (__vector float)vec_mergeh(zero, v.reg.val[1]); + reg.val[3] = (__vector float)vec_mergel(zero, v.reg.val[1]); + } + + explicit FP32Vec16(const BF16Vec8& v) : FP32Vec16(FP32Vec8(v)) {} + + FP32Vec16 operator*(const FP32Vec16& b) const { + return FP32Vec16(f32x4x4_t({vec_mul(reg.val[0], b.reg.val[0]), + vec_mul(reg.val[1], b.reg.val[1]), + vec_mul(reg.val[2], b.reg.val[2]), + vec_mul(reg.val[3], b.reg.val[3])})); + } + + FP32Vec16 operator+(const FP32Vec16& b) const { + return FP32Vec16(f32x4x4_t({vec_add(reg.val[0], b.reg.val[0]), + vec_add(reg.val[1], b.reg.val[1]), + vec_add(reg.val[2], b.reg.val[2]), + vec_add(reg.val[3], b.reg.val[3])})); + } + + FP32Vec16 operator-(const FP32Vec16& b) const { + return FP32Vec16(f32x4x4_t({vec_sub(reg.val[0], b.reg.val[0]), + vec_sub(reg.val[1], b.reg.val[1]), + vec_sub(reg.val[2], b.reg.val[2]), + vec_sub(reg.val[3], b.reg.val[3])})); + } + + FP32Vec16 operator/(const FP32Vec16& b) const { + return FP32Vec16(f32x4x4_t({vec_div(reg.val[0], b.reg.val[0]), + vec_div(reg.val[1], b.reg.val[1]), + vec_div(reg.val[2], b.reg.val[2]), + vec_div(reg.val[3], b.reg.val[3])})); + } + + float reduce_sum() const { + AliasReg ar; + ar.reg = reg; + float result = 0; + unroll_loop( + [&result, &ar](int i) { result += ar.values[i]; }); + + return result; + } + + template + float reduce_sub_sum(int idx) { + static_assert(VEC_ELEM_NUM % group_size == 0); + + AliasReg ar; + ar.reg = reg; + float result = 0; + const int start = idx * group_size; + unroll_loop( + [&result, &start, ar](int i) { result += ar.values[start + i]; }); + + return result; + } + + void save(float* ptr) const { + vec_xst(reg.val[0], 0, ptr); + vec_xst(reg.val[1], 16, ptr); + vec_xst(reg.val[2], 32, ptr); + vec_xst(reg.val[3], 48, ptr); + } +}; + +template +struct VecType { + using vec_type = void; +}; + +template +using vec_t = typename VecType::vec_type; + +template <> +struct VecType { + using vec_type = FP32Vec8; +}; + +template <> +struct VecType { + using vec_type = BF16Vec8; +}; + +template +void storeFP32(float v, T* ptr) { + *ptr = v; +} + +inline void fma(FP32Vec16& acc, FP32Vec16& a, FP32Vec16& b) { + acc = acc + a * b; +} + +namespace c10 { +struct BFloat16 { + uint16_t value; // Assume BFloat16 is defined as a struct containing a 16-bit + // value. 
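+  // Only the raw 16-bit storage matters here: the storeFP32 specialization
+  // below copies one half of a float's bit pattern into it (bf16 truncation).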
+}; +} // namespace c10 + +template <> +inline void storeFP32(float v, c10::BFloat16* ptr) { + c10::BFloat16 __attribute__((__may_alias__))* v_ptr = + reinterpret_cast(&v); + *ptr = *(v_ptr + 1); +} + +#ifndef __VEC_CLASS_FP_NAN + #define __VEC_CLASS_FP_NAN (1 << 6) +#endif + +const static __vector unsigned char omask = {2, 3, 6, 7, 10, 11, 14, 15, + 18, 19, 22, 23, 26, 27, 30, 31}; +const static __vector unsigned int bias = {0x00007fff, 0x00007fff, 0x00007fff, + 0x00007fff}; +const static __vector unsigned int nan = {0x7fc00000, 0x7fc00000, 0x7fc00000, + 0x7fc00000}; +const static __vector unsigned int sh16 = {16, 16, 16, 16}; +const static __vector unsigned int one = {1, 1, 1, 1}; + +inline BF16Vec8::BF16Vec8(const FP32Vec8& v) { + __vector unsigned int inp0 = (__vector unsigned int)(v.reg.val[0]); + __vector unsigned int inp1 = (__vector unsigned int)(v.reg.val[1]); + int cc; + __vector __bool int sel0 = + vec_fp_test_data_class(v.reg.val[0], __VEC_CLASS_FP_NAN, &cc); + __vector __bool int sel1 = + vec_fp_test_data_class(v.reg.val[1], __VEC_CLASS_FP_NAN, &cc); + inp0 = vec_sel(inp0, nan, sel0) >> sh16; + inp1 = vec_sel(inp1, nan, sel1) >> sh16; + reg = (__vector signed short)vec_perm(inp0, inp1, omask); +} + +inline BF16Vec16::BF16Vec16(const FP32Vec16& v) { + __vector unsigned int inp0 = (__vector unsigned int)(v.reg.val[0]); + __vector unsigned int inp1 = (__vector unsigned int)(v.reg.val[1]); + __vector unsigned int inp2 = (__vector unsigned int)(v.reg.val[2]); + __vector unsigned int inp3 = (__vector unsigned int)(v.reg.val[3]); + int cc; + __vector __bool int sel0 = + vec_fp_test_data_class(v.reg.val[0], __VEC_CLASS_FP_NAN, &cc); + __vector __bool int sel1 = + vec_fp_test_data_class(v.reg.val[1], __VEC_CLASS_FP_NAN, &cc); + __vector __bool int sel2 = + vec_fp_test_data_class(v.reg.val[2], __VEC_CLASS_FP_NAN, &cc); + __vector __bool int sel3 = + vec_fp_test_data_class(v.reg.val[3], __VEC_CLASS_FP_NAN, &cc); + inp0 = vec_sel(inp0, nan, sel0) >> sh16; + inp1 = vec_sel(inp1, nan, sel1) >> sh16; + inp2 = vec_sel(inp2, nan, sel2) >> sh16; + inp3 = vec_sel(inp3, nan, sel3) >> sh16; + reg.val[0] = (__vector signed short)vec_perm(inp0, inp1, omask); + reg.val[1] = (__vector signed short)vec_perm(inp2, inp3, omask); +} + +inline void prefetch(const void* addr) { void __dcbt(const void* addr); } + +}; // namespace vec_op + +#endif \ No newline at end of file diff --git a/csrc/cpu/quant.cpp b/csrc/cpu/quant.cpp index 33b163783288..6751e7e55fc5 100644 --- a/csrc/cpu/quant.cpp +++ b/csrc/cpu/quant.cpp @@ -25,7 +25,7 @@ struct KernelVecType { template <> struct KernelVecType { -#ifdef __powerpc64__ +#if defined(__powerpc64__) || defined(__s390x__) // Power architecture-specific vector type using load_vec_type = vec_op::FP32Vec16; #else diff --git a/csrc/cutlass_extensions/epilogue/scaled_mm_epilogues_c3x.hpp b/csrc/cutlass_extensions/epilogue/scaled_mm_epilogues_c3x.hpp index 1a0cd45f4e20..0a812dc56a99 100644 --- a/csrc/cutlass_extensions/epilogue/scaled_mm_epilogues_c3x.hpp +++ b/csrc/cutlass_extensions/epilogue/scaled_mm_epilogues_c3x.hpp @@ -22,7 +22,7 @@ struct identity { T operator()(T lhs) const { return lhs; } }; -template +template struct TrivialEpilogue { private: using Accum = cutlass::epilogue::fusion::Sm90AccFetch; @@ -44,32 +44,30 @@ struct TrivialEpilogue { * This class provides the common load descriptors for the * ScaledEpilogue[...] 
classes */ -template +template struct ScaledEpilogueBase { protected: using Accum = cutlass::epilogue::fusion::Sm90AccFetch; template using ColOrScalarLoad = cutlass::epilogue::fusion::Sm90ColOrScalarBroadcast< - 0 /*Stages*/, typename EpilogueDescriptor::TileShape, T, - Stride, Int<0>, Int<0>>>; + 0 /*Stages*/, TileShape, T, Stride, Int<0>, Int<0>>>; template using RowOrScalarLoad = cutlass::epilogue::fusion::Sm90RowOrScalarBroadcast< - 0 /*Stages*/, typename EpilogueDescriptor::TileShape, T, - Stride, Int<1>, Int<0>>>; + 0 /*Stages*/, TileShape, T, Stride, Int<1>, Int<0>>>; // Don't want to support nullptr by default template using ColLoad = cutlass::epilogue::fusion::Sm90ColBroadcast< - 0 /*Stages*/, typename EpilogueDescriptor::TileShape, T, T, - Stride, Int<0>, Int<0>>, 128 / sizeof_bits_v, EnableNullPtr>; + 0 /*Stages*/, TileShape, T, T, Stride, Int<0>, Int<0>>, + 128 / sizeof_bits_v, EnableNullPtr>; // Don't want to support nullptr by default template using RowLoad = cutlass::epilogue::fusion::Sm90RowBroadcast< - 0 /*Stages*/, typename EpilogueDescriptor::TileShape, T, T, - Stride, Int<1>, Int<0>>, 128 / sizeof_bits_v, EnableNullPtr>; + 0 /*Stages*/, TileShape, T, T, Stride, Int<1>, Int<0>>, + 128 / sizeof_bits_v, EnableNullPtr>; // This utility function constructs the arguments for the load descriptors // from a tensor. It can handle both row and column, as well as row/column or @@ -116,11 +114,11 @@ struct ScaledEpilogueBase { the A and B operands respectively. These scales may be either per-tensor or per row or column. */ -template +template struct ScaledEpilogue - : private ScaledEpilogueBase { + : private ScaledEpilogueBase { private: - using SUPER = ScaledEpilogueBase; + using SUPER = ScaledEpilogueBase; using Accum = typename SUPER::Accum; using ScaleA = typename SUPER::template ColOrScalarLoad; using ScaleB = typename SUPER::template RowOrScalarLoad; @@ -160,11 +158,11 @@ struct ScaledEpilogue * The bias tensor must be per-output channel. * ScaleA and ScaleB can be per-tensor or per-token/per-channel. */ -template +template struct ScaledEpilogueBias - : private ScaledEpilogueBase { + : private ScaledEpilogueBase { private: - using SUPER = ScaledEpilogueBase; + using SUPER = ScaledEpilogueBase; using Accum = typename SUPER::Accum; using ScaleA = typename SUPER::template ColOrScalarLoad; using ScaleB = typename SUPER::template RowOrScalarLoad; @@ -203,11 +201,11 @@ struct ScaledEpilogueBias * bias is a column vector instead of a row vector. Useful e.g. if we are * computing a GEMM via C^T += B^T A^T. This happens in the 2:4 sparse kernels. */ -template +template struct ScaledEpilogueColumnBias - : private ScaledEpilogueBase { + : private ScaledEpilogueBase { private: - using SUPER = ScaledEpilogueBase; + using SUPER = ScaledEpilogueBase; using Accum = typename SUPER::Accum; using ScaleA = typename SUPER::template ColOrScalarLoad; using ScaleB = typename SUPER::template RowOrScalarLoad; @@ -249,11 +247,11 @@ struct ScaledEpilogueColumnBias * * This epilogue also supports bias, which remains per-channel. */ -template +template struct ScaledEpilogueBiasAzp - : private ScaledEpilogueBase { + : private ScaledEpilogueBase { private: - using SUPER = ScaledEpilogueBase; + using SUPER = ScaledEpilogueBase; using Accum = typename SUPER::Accum; using ScaleA = typename SUPER::template ColOrScalarLoad; using ScaleB = typename SUPER::template RowOrScalarLoad; @@ -314,11 +312,11 @@ struct ScaledEpilogueBiasAzp * * This epilogue also supports bias, which remains per-channel. 
*/ -template +template struct ScaledEpilogueBiasAzpToken - : private ScaledEpilogueBase { + : private ScaledEpilogueBase { private: - using SUPER = ScaledEpilogueBase; + using SUPER = ScaledEpilogueBase; using Accum = typename SUPER::Accum; using ScaleA = typename SUPER::template ColOrScalarLoad; using ScaleB = typename SUPER::template RowOrScalarLoad; diff --git a/csrc/cutlass_extensions/gemm/collective/sm90_mma_tma_gmma_ss_warpspecialized_fp8_blockwise_scaling.hpp b/csrc/cutlass_extensions/gemm/collective/sm90_mma_tma_gmma_ss_warpspecialized_fp8_blockwise_scaling.hpp index 928a9500cbb0..d922a3349e1e 100644 --- a/csrc/cutlass_extensions/gemm/collective/sm90_mma_tma_gmma_ss_warpspecialized_fp8_blockwise_scaling.hpp +++ b/csrc/cutlass_extensions/gemm/collective/sm90_mma_tma_gmma_ss_warpspecialized_fp8_blockwise_scaling.hpp @@ -402,7 +402,7 @@ struct CollectiveMma< // TODO: test `scale_copy_a` with `ScaleMsPerTile` < 128 TiledCopy scale_copy_a = make_tiled_copy(SmemBlockScalingCopyAtomA{}, - Layout>{}, Layout>{}); // (1,1,1) + Layout>{}, Layout>{}); // (1,1,1) TiledCopy scale_copy_b = make_tiled_copy(SmemBlockScalingCopyAtomB{}, Layout>{}, Layout>{}); // (1,1,1) ThrCopy thr_scale_copy_a = scale_copy_a.get_slice(threadIdx.x); diff --git a/csrc/quantization/cutlass_w8a8/c3x/cutlass_gemm_caller.cuh b/csrc/quantization/cutlass_w8a8/c3x/cutlass_gemm_caller.cuh index 69a3f64cb0b0..26de32ce2b16 100644 --- a/csrc/quantization/cutlass_w8a8/c3x/cutlass_gemm_caller.cuh +++ b/csrc/quantization/cutlass_w8a8/c3x/cutlass_gemm_caller.cuh @@ -16,6 +16,7 @@ #include "cutlass/gemm/kernel/gemm_universal.hpp" #include "cutlass/epilogue/collective/collective_builder.hpp" #include "cutlass/gemm/collective/collective_builder.hpp" +#include "cutlass/util/packed_stride.hpp" #include "core/math.hpp" #include "cutlass_extensions/common.hpp" @@ -64,22 +65,28 @@ void cutlass_gemm_caller(torch::Tensor& out, torch::Tensor const& a, torch::Tensor const& b, EpilogueArgs&&... 
epilogue_params) { using ElementAB = typename Gemm::ElementAB; + using ElementC = typename Gemm::ElementC; using ElementD = typename Gemm::ElementD; using GemmKernel = typename Gemm::GemmKernel; - int64_t lda = a.stride(0); - int64_t ldb = b.stride(1); - int64_t ldc = out.stride(0); - - using StrideA = cute::Stride, int64_t>; - using StrideB = cute::Stride, int64_t>; - using StrideC = typename Gemm::StrideC; - - StrideA a_stride{lda, cute::Int<1>{}, 0}; - StrideB b_stride{ldb, cute::Int<1>{}, 0}; - StrideC c_stride{ldc, cute::Int<1>{}, cute::Int<0>{}}; + using StrideA = typename Gemm::GemmKernel::StrideA; + using StrideB = typename Gemm::GemmKernel::StrideB; + using StrideC = typename Gemm::GemmKernel::StrideC; + using StrideD = StrideC; + using StrideAux = StrideC; typename GemmKernel::ProblemShape prob_shape = get_problem_shape(a, b); + auto [M, N, K, L] = prob_shape; + + StrideA a_stride = + cutlass::make_cute_packed_stride(StrideA{}, cute::make_shape(M, K, L)); + StrideB b_stride = + cutlass::make_cute_packed_stride(StrideB{}, cute::make_shape(N, K, L)); + StrideC c_stride = + cutlass::make_cute_packed_stride(StrideC{}, cute::make_shape(M, N, L)); + StrideD d_stride = + cutlass::make_cute_packed_stride(StrideD{}, cute::make_shape(M, N, L)); + StrideAux aux_stride = d_stride; auto a_ptr = static_cast(a.data_ptr()); auto b_ptr = static_cast(b.data_ptr()); @@ -87,10 +94,11 @@ void cutlass_gemm_caller(torch::Tensor& out, torch::Tensor const& a, b_stride}; auto c_ptr = static_cast(out.data_ptr()); + // auto d_ptr = static_cast(out.data_ptr()); typename GemmKernel::EpilogueArguments epilogue_args{ Gemm::Epilogue::prepare_args( std::forward(epilogue_params)...), - c_ptr, c_stride, c_ptr, c_stride}; + c_ptr, c_stride, c_ptr, d_stride}; cutlass_gemm_caller(a.device(), prob_shape, mainloop_args, epilogue_args); diff --git a/csrc/quantization/cutlass_w8a8/c3x/scaled_mm.cuh b/csrc/quantization/cutlass_w8a8/c3x/scaled_mm.cuh index d2f43e2b7a89..8f4df836bcc8 100644 --- a/csrc/quantization/cutlass_w8a8/c3x/scaled_mm.cuh +++ b/csrc/quantization/cutlass_w8a8/c3x/scaled_mm.cuh @@ -40,12 +40,7 @@ struct cutlass_3x_gemm { typename std::conditional, int32_t, float>::type; - using EpilogueDescriptor = - cutlass::epilogue::collective::detail::EpilogueDescriptor< - TileShape, cutlass::epilogue::collective::EpilogueTileAuto, ElementD, - ElementD, EpilogueSchedule>; - - using Epilogue = Epilogue_; + using Epilogue = Epilogue_; using StrideD = Stride, Int<0>>; using ElementC = void; @@ -88,4 +83,65 @@ struct cutlass_3x_gemm { struct GemmKernel : public KernelType {}; }; +template typename Epilogue_, + typename TileShape, typename ClusterShape, typename KernelSchedule, + typename EpilogueSchedule> +struct cutlass_3x_gemm_sm100 { + using ElementAB = ElementAB_; + using LayoutA = cutlass::layout::RowMajor; + static constexpr int AlignmentA = + 128 / cutlass::sizeof_bits::value; + + using LayoutB = cutlass::layout::ColumnMajor; + static constexpr int AlignmentB = + 128 / cutlass::sizeof_bits::value; + + using ElementC = void; + using LayoutC = cutlass::layout::RowMajor; + static constexpr int AlignmentC = + 128 / cutlass::sizeof_bits::value; + + using ElementD = ElementD_; + using LayoutD = cutlass::layout::RowMajor; + static constexpr int AlignmentD = AlignmentC; + + using ElementAcc = + typename std::conditional, int32_t, + float>::type; + using Epilogue = Epilogue_; + + // MMA type + using ElementAccumulator = float; + + // Epilogue types + using ElementBias = cutlass::half_t; + using ElementCompute = float; + 
using ElementAux = ElementD; + using LayoutAux = LayoutD; + using ElementAmax = float; + + using EVTCompute = typename Epilogue::EVTCompute; + + using CollectiveEpilogue = + typename cutlass::epilogue::collective::CollectiveBuilder< + cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp, TileShape, + ClusterShape, cutlass::epilogue::collective::EpilogueTileAuto, + ElementAccumulator, ElementCompute, ElementC, LayoutC, AlignmentC, + ElementD, LayoutD, AlignmentD, EpilogueSchedule, + EVTCompute>::CollectiveOp; + + using CollectiveMainloop = + typename cutlass::gemm::collective::CollectiveBuilder< + cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp, ElementAB, + LayoutA, AlignmentA, ElementAB, LayoutB, AlignmentB, + ElementAccumulator, TileShape, ClusterShape, + cutlass::gemm::collective::StageCountAutoCarveout( + sizeof(typename CollectiveEpilogue::SharedStorage))>, + KernelSchedule>::CollectiveOp; + + using GemmKernel = cutlass::gemm::kernel::GemmUniversal< + Shape, CollectiveMainloop, CollectiveEpilogue, void>; +}; + } // namespace vllm diff --git a/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_kernels.hpp b/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_kernels.hpp index 7ede9e067477..85272804774d 100644 --- a/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_kernels.hpp +++ b/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_kernels.hpp @@ -30,4 +30,10 @@ void cutlass_scaled_mm_blockwise_sm90_fp8(torch::Tensor& out, torch::Tensor const& a_scales, torch::Tensor const& b_scales); +void cutlass_scaled_mm_sm100_fp8(torch::Tensor& out, torch::Tensor const& a, + torch::Tensor const& b, + torch::Tensor const& a_scales, + torch::Tensor const& b_scales, + std::optional const& bias); + } // namespace vllm diff --git a/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm100_fp8.cu b/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm100_fp8.cu new file mode 100644 index 000000000000..cf2cccc913f6 --- /dev/null +++ b/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm100_fp8.cu @@ -0,0 +1,24 @@ +#include "scaled_mm_kernels.hpp" +#include "scaled_mm_sm100_fp8_dispatch.cuh" +#include "cutlass_extensions/epilogue/scaled_mm_epilogues_c3x.hpp" + +namespace vllm { + +void cutlass_scaled_mm_sm100_fp8(torch::Tensor& out, torch::Tensor const& a, + torch::Tensor const& b, + torch::Tensor const& a_scales, + torch::Tensor const& b_scales, + std::optional const& bias) { + TORCH_CHECK(a_scales.is_contiguous() && b_scales.is_contiguous()); + if (bias) { + TORCH_CHECK(bias->dtype() == out.dtype(), + "currently bias dtype must match output dtype ", out.dtype()); + return cutlass_scaled_mm_sm100_fp8_epilogue( + out, a, b, a_scales, b_scales, *bias); + } else { + return cutlass_scaled_mm_sm100_fp8_epilogue( + out, a, b, a_scales, b_scales); + } +} + +} // namespace vllm diff --git a/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm100_fp8_dispatch.cuh b/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm100_fp8_dispatch.cuh new file mode 100644 index 000000000000..468b77d9593b --- /dev/null +++ b/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm100_fp8_dispatch.cuh @@ -0,0 +1,67 @@ +#pragma once + +#include "scaled_mm.cuh" +#include "cutlass_gemm_caller.cuh" + +/** + * This file defines Gemm kernel configurations for SM100 (fp8) based on the + * Gemm shape. 
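+ * Only a single default tile/cluster configuration is defined here for now;
+ * the dispatch helper below selects it for every problem shape.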
+ */ + +namespace vllm { + +using c3x::cutlass_gemm_caller; + +template typename Epilogue> +struct sm100_fp8_config_default { + static_assert(std::is_same()); + using KernelSchedule = cutlass::gemm::collective::KernelScheduleAuto; + using EpilogueSchedule = cutlass::epilogue::collective::EpilogueScheduleAuto; + using TileShape = Shape<_256, _128, _64>; + using ClusterShape = Shape<_2, _2, _1>; + using Cutlass3xGemm = + cutlass_3x_gemm_sm100; +}; + +template typename Epilogue, + typename... EpilogueArgs> +inline void cutlass_gemm_sm100_fp8_dispatch(torch::Tensor& out, + torch::Tensor const& a, + torch::Tensor const& b, + EpilogueArgs&&... args) { + static_assert(std::is_same()); + TORCH_CHECK(a.dtype() == torch::kFloat8_e4m3fn); + TORCH_CHECK(b.dtype() == torch::kFloat8_e4m3fn); + + using Cutlass3xGemmDefault = + typename sm100_fp8_config_default::Cutlass3xGemm; + return cutlass_gemm_caller( + out, a, b, std::forward(args)...); +} + +template