diff --git a/.buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh b/.buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh index 32bd34c431c8..4d01a314adc4 100644 --- a/.buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh +++ b/.buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh @@ -426,7 +426,7 @@ main() { pip install -U transformers - pip install -r requirements-dev.txt + pip install -r requirements/dev.txt which genai-perf # check storage diff --git a/.buildkite/run-amd-test.sh b/.buildkite/run-amd-test.sh index 731b71b6124e..706737e70d04 100755 --- a/.buildkite/run-amd-test.sh +++ b/.buildkite/run-amd-test.sh @@ -57,7 +57,7 @@ while true; do done echo "--- Pulling container" -image_name="rocm/vllm-ci:${BUILDKITE_COMMIT}" +image_name="rocm/vllm-ci-private:${BUILDKITE_COMMIT}" container_name="rocm_${BUILDKITE_COMMIT}_$(tr -dc A-Za-z0-9 < /dev/urandom | head -c 10; echo)" docker pull "${image_name}" @@ -93,7 +93,12 @@ if [[ $commands == *" kernels "* ]]; then --ignore=kernels/test_rand.py \ --ignore=kernels/test_sampler.py \ --ignore=kernels/test_cascade_flash_attn.py \ - --ignore=kernels/test_mamba_mixer2.py" + --ignore=kernels/test_mamba_mixer2.py \ + --ignore=kernels/test_aqlm.py \ + --ignore=kernels/test_machete_mm.py \ + --ignore=kernels/test_mha_attn.py \ + --ignore=kernels/test_block_fp8.py \ + --ignore=kernels/test_permute_cols.py" fi #ignore certain Entrypoints tests diff --git a/.buildkite/run-cpu-test.sh b/.buildkite/run-cpu-test.sh index 2ead1f51ed81..f6dad818ddc0 100644 --- a/.buildkite/run-cpu-test.sh +++ b/.buildkite/run-cpu-test.sh @@ -35,7 +35,7 @@ function cpu_tests() { # Run basic model test docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c " set -e - pip install -r vllm/requirements-test.txt + pip install -r vllm/requirements/test.txt pytest -v -s tests/models/decoder_only/language -m cpu_model pytest -v -s tests/models/embedding/language -m cpu_model pytest -v -s tests/models/encoder_decoder/language -m cpu_model diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index 38449c806464..5e11acd953cc 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -35,7 +35,7 @@ steps: fast_check: true no_gpu: True commands: - - pip install -r requirements-docs.txt + - pip install -r ../../requirements/docs.txt - SPHINXOPTS=\"-W\" make html # Check API reference (if it fails, you may have missing mock imports) - grep \"sig sig-object py\" build/html/api/inference_params.html @@ -78,6 +78,7 @@ steps: - tests/basic_correctness/test_preemption - tests/basic_correctness/test_cumem.py commands: + - export VLLM_WORKER_MULTIPROC_METHOD=spawn - pytest -v -s basic_correctness/test_cumem.py - pytest -v -s basic_correctness/test_basic_correctness.py - pytest -v -s basic_correctness/test_cpu_offload.py @@ -115,6 +116,7 @@ steps: - tests/entrypoints/test_chat_utils - tests/entrypoints/offline_mode commands: + - export VLLM_WORKER_MULTIPROC_METHOD=spawn - pytest -v -s entrypoints/llm --ignore=entrypoints/llm/test_lazy_outlines.py --ignore=entrypoints/llm/test_generate.py --ignore=entrypoints/llm/test_generate_multiple_loras.py --ignore=entrypoints/llm/test_guided_generate.py --ignore=entrypoints/llm/test_collective_rpc.py - pytest -v -s entrypoints/llm/test_lazy_outlines.py # it needs a clean process - pytest -v -s entrypoints/llm/test_generate.py # it needs a clean process @@ -146,8 +148,10 @@ steps: - pytest -v -s spec_decode/e2e/test_integration_dist_tp4.py # TODO: create a dedicated 
test section for multi-GPU example tests # when we have multiple distributed example tests - - python3 ../examples/offline_inference/rlhf.py - - RAY_DEDUP_LOGS=0 python3 ../examples/offline_inference/rlhf_colocate.py + - pushd ../examples/offline_inference + - python3 rlhf.py + - RAY_DEDUP_LOGS=0 python3 rlhf_colocate.py + - popd - label: Metrics, Tracing Test # 10min num_gpus: 2 @@ -204,6 +208,7 @@ steps: - VLLM_USE_V1=1 pytest -v -s v1/engine - VLLM_USE_V1=1 pytest -v -s v1/sample - VLLM_USE_V1=1 pytest -v -s v1/worker + - VLLM_USE_V1=1 pytest -v -s v1/structured_output - VLLM_USE_V1=1 pytest -v -s v1/test_stats.py - VLLM_USE_V1=1 pytest -v -s v1/test_utils.py # TODO: accuracy does not match, whether setting diff --git a/.buildkite/test-template.j2 b/.buildkite/test-template.j2 index 573675d67f86..3ecbd64a973b 100644 --- a/.buildkite/test-template.j2 +++ b/.buildkite/test-template.j2 @@ -1,5 +1,5 @@ {% set docker_image = "public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT" %} -{% set docker_image_amd = "rocm/vllm-ci:$BUILDKITE_COMMIT" %} +{% set docker_image_amd = "rocm/vllm-ci-private:$BUILDKITE_COMMIT" %} {% set default_working_dir = "vllm/tests" %} {% set hf_home = "/root/.cache/huggingface" %} @@ -7,7 +7,7 @@ steps: - label: ":docker: build image" depends_on: ~ commands: - - "docker build --build-arg max_jobs=16 --tag {{ docker_image_amd }} -f Dockerfile.rocm --target test --progress plain ." + - "docker build --build-arg max_jobs=16 --tag {{ docker_image_amd }} -f Dockerfile.rocm --build-arg ARG_PYTORCH_ROCM_ARCH='gfx90a;gfx942' --target test --progress plain ." - "docker push {{ docker_image_amd }}" key: "amd-build" env: @@ -27,7 +27,15 @@ steps: depends_on: - "amd-build" agents: +{% if step.amd_gpus and step.amd_gpus==8%} queue: amd_gpu +{% elif step.amd_gpus and step.amd_gpus==4%} + queue: amd_gpu +{% elif step.amd_gpus and step.amd_gpus==2%} + queue: amd_gpu +{% else%} + queue: amd_gpu +{% endif%} commands: - bash .buildkite/run-amd-test.sh "cd {{ (step.working_dir or default_working_dir) | safe }} ; {{ step.command or (step.commands | join(" && ")) | safe }}" env: diff --git a/.github/mergify.yml b/.github/mergify.yml index e41107ae0a01..54f56210b286 100644 --- a/.github/mergify.yml +++ b/.github/mergify.yml @@ -36,6 +36,21 @@ pull_request_rules: add: - frontend +- name: label-multi-modality + description: Automatically apply multi-modality label + conditions: + - or: + - files~=^vllm/multimodal/ + - files~=^tests/multimodal/ + - files~=^tests/models/multimodal/ + - files~=^tests/models/*/audio_language/ + - files~=^tests/models/*/vision_language/ + - files=tests/models/test_vision.py + actions: + label: + add: + - multi-modality + - name: label-structured-output description: Automatically apply structured-output label conditions: diff --git a/.github/workflows/scripts/build.sh b/.github/workflows/scripts/build.sh index f0a4e4baf1ae..2bb7b726194d 100644 --- a/.github/workflows/scripts/build.sh +++ b/.github/workflows/scripts/build.sh @@ -5,7 +5,7 @@ python_executable=python3 # Update paths # Install requirements -$python_executable -m pip install -r requirements-rocm.txt +$python_executable -m pip install -r requirements/rocm.txt # Limit the number of parallel jobs to avoid OOM export MAX_JOBS=1 diff --git a/.gitignore b/.gitignore index 89dab8f13bab..e40752f4dea0 100644 --- a/.gitignore +++ b/.gitignore @@ -197,7 +197,7 @@ _build/ hip_compat.h # Benchmark dataset -benchmarks/*.json +benchmarks/**/*.json # Linting actionlint diff --git a/.pre-commit-config.yaml 
b/.pre-commit-config.yaml index 23a38d49638f..074ac9d122bf 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -44,8 +44,8 @@ repos: rev: 0.6.2 hooks: - id: pip-compile - args: [requirements-test.in, -o, requirements-test.txt] - files: ^requirements-test\.(in|txt)$ + args: [requirements/test.in, -o, requirements/test.txt] + files: ^requirements/test\.(in|txt)$ - repo: local hooks: - id: mypy-local diff --git a/.readthedocs.yaml b/.readthedocs.yaml index 284196bc2d27..2781ec223b66 100644 --- a/.readthedocs.yaml +++ b/.readthedocs.yaml @@ -18,4 +18,4 @@ formats: [] # Optionally declare the Python requirements required to build your docs python: install: - - requirements: docs/requirements-docs.txt + - requirements: requirements/docs.txt diff --git a/CMakeLists.txt b/CMakeLists.txt index 11ecc5491997..557540e203f6 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -31,7 +31,7 @@ set(ignoreMe "${VLLM_PYTHON_PATH}") set(PYTHON_SUPPORTED_VERSIONS "3.9" "3.10" "3.11" "3.12") # Supported NVIDIA architectures. -set(CUDA_SUPPORTED_ARCHS "7.0;7.2;7.5;8.0;8.6;8.7;8.9;9.0") +set(CUDA_SUPPORTED_ARCHS "7.0;7.2;7.5;8.0;8.6;8.7;8.9;9.0;10.0;10.1;12.0") # Supported AMD GPU architectures. set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101;gfx1200;gfx1201") @@ -312,7 +312,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") # Only build Marlin kernels if we are building for at least some compatible archs. # Keep building Marlin for 9.0 as there are some group sizes and shapes that # are not supported by Machete yet. - cuda_archs_loose_intersection(MARLIN_ARCHS "8.0;8.6;8.7;8.9;9.0" "${CUDA_ARCHS}") + cuda_archs_loose_intersection(MARLIN_ARCHS "8.0;8.6;8.7;8.9;9.0;10.0;10.1;12.0" "${CUDA_ARCHS}") if (MARLIN_ARCHS) set(MARLIN_SRCS "csrc/quantization/fp8/fp8_marlin.cu" @@ -334,7 +334,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") # Only build AllSpark kernels if we are building for at least some compatible archs. cuda_archs_loose_intersection(ALLSPARK_ARCHS "8.0;8.6;8.7;8.9" "${CUDA_ARCHS}") - if (ALLSPARK_ARCHS) + if (${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0 AND ALLSPARK_ARCHS) set(ALLSPARK_SRCS "csrc/quantization/gptq_allspark/allspark_repack.cu" "csrc/quantization/gptq_allspark/allspark_qgemm_w8a16.cu") @@ -345,46 +345,74 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") message(STATUS "Building AllSpark kernels for archs: ${ALLSPARK_ARCHS}") else() message(STATUS "Not building AllSpark kernels as no compatible archs found" - " in CUDA target architectures") + " in CUDA target architectures, or CUDA not >= 12.0") endif() + + set(SCALED_MM_3X_ARCHS) # The cutlass_scaled_mm kernels for Hopper (c3x, i.e. CUTLASS 3.x) require - # CUDA 12.0 or later (and only work on Hopper, 9.0a for now). 
- cuda_archs_loose_intersection(SCALED_MM_3X_ARCHS "9.0a" "${CUDA_ARCHS}") - if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0 AND SCALED_MM_3X_ARCHS) + # CUDA 12.0 or later + cuda_archs_loose_intersection(SCALED_MM_ARCHS "9.0a;" "${CUDA_ARCHS}") + if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0 AND SCALED_MM_ARCHS) set(SRCS - "csrc/quantization/cutlass_w8a8/scaled_mm_c3x.cu" + "csrc/quantization/cutlass_w8a8/scaled_mm_c3x_sm90.cu" "csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm90_fp8.cu" "csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm90_int8.cu" "csrc/quantization/cutlass_w8a8/c3x/scaled_mm_azp_sm90_int8.cu" "csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm90_fp8.cu") set_gencode_flags_for_srcs( SRCS "${SRCS}" - CUDA_ARCHS "${SCALED_MM_3X_ARCHS}") + CUDA_ARCHS "${SCALED_MM_ARCHS}") list(APPEND VLLM_EXT_SRC "${SRCS}") - list(APPEND VLLM_GPU_FLAGS "-DENABLE_SCALED_MM_C3X=1") - message(STATUS "Building scaled_mm_c3x for archs: ${SCALED_MM_3X_ARCHS}") + list(APPEND VLLM_GPU_FLAGS "-DENABLE_SCALED_MM_SM90=1") + # Let scaled_mm_c2x know it doesn't need to build these arches + list(APPEND SCALED_MM_3X_ARCHS "${SCALED_MM_ARCHS}") + message(STATUS "Building scaled_mm_c3x_sm90 for archs: ${SCALED_MM_ARCHS}") else() - if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0 AND SCALED_MM_3X_ARCHS) - message(STATUS "Not building scaled_mm_c3x as CUDA Compiler version is " + if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0 AND SCALED_MM_ARCHS) + message(STATUS "Not building scaled_mm_c3x_sm90 as CUDA Compiler version is " "not >= 12.0, we recommend upgrading to CUDA 12.0 or " "later if you intend on running FP8 quantized models on " "Hopper.") else() - message(STATUS "Not building scaled_mm_c3x as no compatible archs found " + message(STATUS "Not building scaled_mm_c3x_sm90 as no compatible archs found " "in CUDA target architectures") endif() + endif() - # clear SCALED_MM_3X_ARCHS so the scaled_mm_c2x kernels know we didn't - # build any 3x kernels - set(SCALED_MM_3X_ARCHS) + # The cutlass_scaled_mm kernels for Blackwell (c3x, i.e. CUTLASS 3.x) require + # CUDA 12.8 or later + cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a;10.1a;12.0a" "${CUDA_ARCHS}") + if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.8 AND SCALED_MM_ARCHS) + set(SRCS + "csrc/quantization/cutlass_w8a8/scaled_mm_c3x_sm100.cu" + "csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm100_fp8.cu" + ) + set_gencode_flags_for_srcs( + SRCS "${SRCS}" + CUDA_ARCHS "${SCALED_MM_ARCHS}") + list(APPEND VLLM_EXT_SRC "${SRCS}") + list(APPEND VLLM_GPU_FLAGS "-DENABLE_SCALED_MM_SM100=1") + # Let scaled_mm_c2x know it doesn't need to build these arches + list(APPEND SCALED_MM_3X_ARCHS "${SCALED_MM_ARCHS}") + message(STATUS "Building scaled_mm_c3x_sm100 for archs: ${SCALED_MM_ARCHS}") + else() + if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.8 AND SCALED_MM_ARCHS) + message(STATUS "Not building scaled_mm_c3x_sm100 as CUDA Compiler version is " + "not >= 12.8, we recommend upgrading to CUDA 12.8 or " + "later if you intend on running FP8 quantized models on " + "Blackwell.") + else() + message(STATUS "Not building scaled_mm_c3x_100 as no compatible archs found " + "in CUDA target architectures") + endif() endif() # # For the cutlass_scaled_mm kernels we want to build the c2x (CUTLASS 2.x) # kernels for the remaining archs that are not already built for 3x. 
cuda_archs_loose_intersection(SCALED_MM_2X_ARCHS - "7.5;8.0;8.6;8.7;8.9;9.0" "${CUDA_ARCHS}") + "7.5;8.0;8.6;8.7;8.9;9.0;10.0;10.1;12.0" "${CUDA_ARCHS}") # subtract out the archs that are already built for 3x list(REMOVE_ITEM SCALED_MM_2X_ARCHS ${SCALED_MM_3X_ARCHS}) if (SCALED_MM_2X_ARCHS) @@ -409,17 +437,17 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") # 2:4 Sparse Kernels # The 2:4 sparse kernels cutlass_scaled_sparse_mm and cutlass_compressor - # require CUDA 12.2 or later (and only work on Hopper, 9.0a for now). - if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.2 AND SCALED_MM_3X_ARCHS) + # require CUDA 12.2 or later (and only work on Hopper and Blackwell). + if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.2 AND SCALED_MM_ARCHS) set(SRCS "csrc/sparse/cutlass/sparse_scaled_mm_c3x.cu") set_gencode_flags_for_srcs( SRCS "${SRCS}" - CUDA_ARCHS "${SCALED_MM_3X_ARCHS}") + CUDA_ARCHS "${SCALED_MM_ARCHS}") list(APPEND VLLM_EXT_SRC "${SRCS}") list(APPEND VLLM_GPU_FLAGS "-DENABLE_SPARSE_SCALED_MM_C3X=1") - message(STATUS "Building sparse_scaled_mm_c3x for archs: ${SCALED_MM_3X_ARCHS}") + message(STATUS "Building sparse_scaled_mm_c3x for archs: ${SCALED_MM_ARCHS}") else() - if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.2 AND SCALED_MM_3X_ARCHS) + if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.2 AND SCALED_MM_ARCHS) message(STATUS "Not building sparse_scaled_mm_c3x kernels as CUDA Compiler version is " "not >= 12.2, we recommend upgrading to CUDA 12.2 or later " "if you intend on running FP8 sparse quantized models on Hopper.") @@ -434,8 +462,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.8 AND FP4_ARCHS) set(SRCS "csrc/quantization/fp4/nvfp4_quant_kernels.cu" - "csrc/quantization/fp4/nvfp4_scaled_mm_kernels.cu" - ) + "csrc/quantization/fp4/nvfp4_scaled_mm_kernels.cu") set_gencode_flags_for_srcs( SRCS "${SRCS}" CUDA_ARCHS "${FP4_ARCHS}") @@ -534,6 +561,7 @@ define_gpu_extension_target( COMPILE_FLAGS ${VLLM_GPU_FLAGS} ARCHITECTURES ${VLLM_GPU_ARCHES} INCLUDE_DIRECTORIES ${CUTLASS_INCLUDE_DIR} + INCLUDE_DIRECTORIES ${CUTLASS_TOOLS_UTIL_INCLUDE_DIR} USE_SABI 3 WITH_SOABI) @@ -557,7 +585,7 @@ set_gencode_flags_for_srcs( CUDA_ARCHS "${CUDA_ARCHS}") if(VLLM_GPU_LANG STREQUAL "CUDA") - cuda_archs_loose_intersection(MARLIN_MOE_ARCHS "8.0;8.6;8.7;8.9;9.0" "${CUDA_ARCHS}") + cuda_archs_loose_intersection(MARLIN_MOE_ARCHS "8.0;8.6;8.7;8.9;9.0;10.0;10.1;12.0" "${CUDA_ARCHS}") if (MARLIN_MOE_ARCHS) set(MARLIN_MOE_SRC "csrc/moe/marlin_kernels/marlin_moe_kernel.h" diff --git a/Dockerfile b/Dockerfile index 63314b906f15..ff4a0839f6e0 100644 --- a/Dockerfile +++ b/Dockerfile @@ -14,22 +14,21 @@ ARG PYTHON_VERSION=3.12 ARG TARGETPLATFORM ENV DEBIAN_FRONTEND=noninteractive -# Install Python and other dependencies -RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \ - && echo 'tzdata tzdata/Zones/America select Los_Angeles' | debconf-set-selections \ - && apt-get update -y \ - && apt-get install -y ccache software-properties-common git curl sudo \ - && add-apt-repository ppa:deadsnakes/ppa \ - && apt-get update -y \ - && apt-get install -y python${PYTHON_VERSION} python${PYTHON_VERSION}-dev python${PYTHON_VERSION}-venv \ - && update-alternatives --install /usr/bin/python3 python3 /usr/bin/python${PYTHON_VERSION} 1 \ - && update-alternatives --set python3 /usr/bin/python${PYTHON_VERSION} \ - && ln -sf /usr/bin/python${PYTHON_VERSION}-config /usr/bin/python3-config \ - && curl -sS https://bootstrap.pypa.io/get-pip.py | 
python${PYTHON_VERSION} \ - && python3 --version && python3 -m pip --version -# Install uv for faster pip installs -RUN --mount=type=cache,target=/root/.cache/uv \ - python3 -m pip install uv +# Install minimal dependencies and uv +RUN apt-get update -y \ + && apt-get install -y ccache git curl wget sudo \ + && curl -LsSf https://astral.sh/uv/install.sh | sh + +# Add uv to PATH +ENV PATH="/root/.local/bin:$PATH" +# Create venv with specified Python and activate by placing at the front of path +ENV VIRTUAL_ENV="/opt/venv" +RUN uv venv --python ${PYTHON_VERSION} --seed ${VIRTUAL_ENV} +ENV PATH="$VIRTUAL_ENV/bin:$PATH" + +# This timeout (in seconds) is necessary when installing some dependencies via uv since it's likely to time out +# Reference: https://github.com/astral-sh/uv/pull/1694 +ENV UV_HTTP_TIMEOUT=500 # Upgrade to GCC 10 to avoid https://gcc.gnu.org/bugzilla/show_bug.cgi?id=92519 # as it was causing spam when compiling the CUTLASS kernels @@ -47,21 +46,19 @@ RUN ldconfig /usr/local/cuda-$(echo $CUDA_VERSION | cut -d. -f1,2)/compat/ WORKDIR /workspace -# install build and runtime dependencies - # arm64 (GH200) build follows the practice of "use existing pytorch" build, # we need to install torch and torchvision from the nightly builds first, # pytorch will not appear as a vLLM dependency in all of the following steps # after this step RUN --mount=type=cache,target=/root/.cache/uv \ if [ "$TARGETPLATFORM" = "linux/arm64" ]; then \ - uv pip install --system --index-url https://download.pytorch.org/whl/nightly/cu126 "torch==2.7.0.dev20250121+cu126" "torchvision==0.22.0.dev20250121"; \ + uv pip install --index-url https://download.pytorch.org/whl/nightly/cu126 "torch==2.7.0.dev20250121+cu126" "torchvision==0.22.0.dev20250121"; \ fi -COPY requirements-common.txt requirements-common.txt -COPY requirements-cuda.txt requirements-cuda.txt +COPY requirements/common.txt requirements/common.txt +COPY requirements/cuda.txt requirements/cuda.txt RUN --mount=type=cache,target=/root/.cache/uv \ - uv pip install --system -r requirements-cuda.txt + uv pip install -r requirements/cuda.txt # cuda arch list used by torch # can be useful for both `dev` and `test` @@ -79,15 +76,19 @@ FROM base AS build ARG TARGETPLATFORM # install build dependencies -COPY requirements-build.txt requirements-build.txt +COPY requirements/build.txt requirements/build.txt + +# This timeout (in seconds) is necessary when installing some dependencies via uv since it's likely to time out +# Reference: https://github.com/astral-sh/uv/pull/1694 +ENV UV_HTTP_TIMEOUT=500 RUN --mount=type=cache,target=/root/.cache/uv \ - uv pip install --system -r requirements-build.txt + uv pip install -r requirements/build.txt COPY . . 
ARG GIT_REPO_CHECK=0 RUN --mount=type=bind,source=.git,target=.git \ - if [ "$GIT_REPO_CHECK" != 0 ]; then bash tools/check_repo.sh ; fi + if [ "$GIT_REPO_CHECK" != "0" ]; then bash tools/check_repo.sh ; fi # max jobs used by Ninja to build extensions ARG max_jobs=2 @@ -124,6 +125,9 @@ RUN --mount=type=cache,target=/root/.cache/ccache \ --mount=type=cache,target=/root/.cache/uv \ --mount=type=bind,source=.git,target=.git \ if [ "$USE_SCCACHE" != "1" ]; then \ + # Clean any existing CMake artifacts + rm -rf .deps && \ + mkdir -p .deps && \ python3 setup.py bdist_wheel --dist-dir=dist --py-limited-api=cp38; \ fi @@ -143,11 +147,15 @@ RUN if [ "$RUN_WHEEL_CHECK" = "true" ]; then \ #################### DEV IMAGE #################### FROM base as dev -COPY requirements-lint.txt requirements-lint.txt -COPY requirements-test.txt requirements-test.txt -COPY requirements-dev.txt requirements-dev.txt +# This timeout (in seconds) is necessary when installing some dependencies via uv since it's likely to time out +# Reference: https://github.com/astral-sh/uv/pull/1694 +ENV UV_HTTP_TIMEOUT=500 + +COPY requirements/lint.txt requirements/lint.txt +COPY requirements/test.txt requirements/test.txt +COPY requirements/dev.txt requirements/dev.txt RUN --mount=type=cache,target=/root/.cache/uv \ - uv pip install --system -r requirements-dev.txt + uv pip install -r requirements/dev.txt #################### DEV IMAGE #################### #################### vLLM installation IMAGE #################### @@ -163,23 +171,22 @@ ARG TARGETPLATFORM RUN PYTHON_VERSION_STR=$(echo ${PYTHON_VERSION} | sed 's/\.//g') && \ echo "export PYTHON_VERSION_STR=${PYTHON_VERSION_STR}" >> /etc/environment -# Install Python and other dependencies -RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \ - && echo 'tzdata tzdata/Zones/America select Los_Angeles' | debconf-set-selections \ - && apt-get update -y \ - && apt-get install -y ccache software-properties-common git curl wget sudo vim python3-pip \ - && apt-get install -y ffmpeg libsm6 libxext6 libgl1 \ - && add-apt-repository ppa:deadsnakes/ppa \ - && apt-get update -y \ - && apt-get install -y python${PYTHON_VERSION} python${PYTHON_VERSION}-dev python${PYTHON_VERSION}-venv libibverbs-dev \ - && update-alternatives --install /usr/bin/python3 python3 /usr/bin/python${PYTHON_VERSION} 1 \ - && update-alternatives --set python3 /usr/bin/python${PYTHON_VERSION} \ - && ln -sf /usr/bin/python${PYTHON_VERSION}-config /usr/bin/python3-config \ - && curl -sS https://bootstrap.pypa.io/get-pip.py | python${PYTHON_VERSION} \ - && python3 --version && python3 -m pip --version -# Install uv for faster pip installs -RUN --mount=type=cache,target=/root/.cache/uv \ - python3 -m pip install uv +# Install minimal dependencies and uv +RUN apt-get update -y \ + && apt-get install -y ccache git curl wget sudo vim \ + && apt-get install -y ffmpeg libsm6 libxext6 libgl1 libibverbs-dev \ + && curl -LsSf https://astral.sh/uv/install.sh | sh + +# Add uv to PATH +ENV PATH="/root/.local/bin:$PATH" +# Create venv with specified Python and activate by placing at the front of path +ENV VIRTUAL_ENV="/opt/venv" +RUN uv venv --python ${PYTHON_VERSION} --seed ${VIRTUAL_ENV} +ENV PATH="$VIRTUAL_ENV/bin:$PATH" + +# This timeout (in seconds) is necessary when installing some dependencies via uv since it's likely to time out +# Reference: https://github.com/astral-sh/uv/pull/1694 +ENV UV_HTTP_TIMEOUT=500 # Workaround for https://github.com/openai/triton/issues/2507 and # 
https://github.com/pytorch/pytorch/issues/107960 -- hopefully @@ -193,13 +200,13 @@ RUN ldconfig /usr/local/cuda-$(echo $CUDA_VERSION | cut -d. -f1,2)/compat/ # after this step RUN --mount=type=cache,target=/root/.cache/uv \ if [ "$TARGETPLATFORM" = "linux/arm64" ]; then \ - uv pip install --system --index-url https://download.pytorch.org/whl/nightly/cu124 "torch==2.6.0.dev20241210+cu124" "torchvision==0.22.0.dev20241215"; \ + uv pip install --index-url https://download.pytorch.org/whl/nightly/cu124 "torch==2.6.0.dev20241210+cu124" "torchvision==0.22.0.dev20241215"; \ fi # Install vllm wheel first, so that torch etc will be installed. RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/dist \ --mount=type=cache,target=/root/.cache/uv \ - uv pip install --system dist/*.whl --verbose + uv pip install dist/*.whl --verbose # If we need to build FlashInfer wheel before its release: # $ export FLASHINFER_ENABLE_AOT=1 @@ -214,9 +221,8 @@ RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/dist # $ # upload the wheel to a public location, e.g. https://wheels.vllm.ai/flashinfer/524304395bd1d8cd7d07db083859523fcaa246a4/flashinfer_python-0.2.1.post1+cu124torch2.5-cp38-abi3-linux_x86_64.whl RUN --mount=type=cache,target=/root/.cache/uv \ -. /etc/environment && \ if [ "$TARGETPLATFORM" != "linux/arm64" ]; then \ - uv pip install --system https://github.com/flashinfer-ai/flashinfer/releases/download/v0.2.1.post1/flashinfer_python-0.2.1.post1+cu124torch2.5-cp38-abi3-linux_x86_64.whl ; \ + uv pip install https://github.com/flashinfer-ai/flashinfer/releases/download/v0.2.1.post1/flashinfer_python-0.2.1.post1+cu124torch2.5-cp38-abi3-linux_x86_64.whl ; \ fi COPY examples examples @@ -224,9 +230,9 @@ COPY examples examples # some issues w.r.t. JIT compilation. Therefore we need to # install build dependencies for JIT compilation. # TODO: Remove this once FlashInfer AOT wheel is fixed -COPY requirements-build.txt requirements-build.txt +COPY requirements/build.txt requirements/build.txt RUN --mount=type=cache,target=/root/.cache/uv \ - uv pip install --system -r requirements-build.txt + uv pip install -r requirements/build.txt #################### vLLM installation IMAGE #################### @@ -237,17 +243,21 @@ FROM vllm-base AS test ADD . 
/vllm-workspace/ +# This timeout (in seconds) is necessary when installing some dependencies via uv since it's likely to time out +# Reference: https://github.com/astral-sh/uv/pull/1694 +ENV UV_HTTP_TIMEOUT=500 + # install development dependencies (for testing) RUN --mount=type=cache,target=/root/.cache/uv \ - uv pip install --system -r requirements-dev.txt + uv pip install -r requirements/dev.txt # install development dependencies (for testing) RUN --mount=type=cache,target=/root/.cache/uv \ - uv pip install --system -e tests/vllm_test_utils + uv pip install -e tests/vllm_test_utils # enable fast downloads from hf (for testing) RUN --mount=type=cache,target=/root/.cache/uv \ - uv pip install --system hf_transfer + uv pip install hf_transfer ENV HF_HUB_ENABLE_HF_TRANSFER 1 # Copy in the v1 package for testing (it isn't distributed yet) @@ -265,12 +275,16 @@ RUN mv vllm test_docs/ # base openai image with additional requirements, for any subsequent openai-style images FROM vllm-base AS vllm-openai-base +# This timeout (in seconds) is necessary when installing some dependencies via uv since it's likely to time out +# Reference: https://github.com/astral-sh/uv/pull/1694 +ENV UV_HTTP_TIMEOUT=500 + # install additional dependencies for openai api server RUN --mount=type=cache,target=/root/.cache/uv \ if [ "$TARGETPLATFORM" = "linux/arm64" ]; then \ - uv pip install --system accelerate hf_transfer 'modelscope!=1.15.0' 'bitsandbytes>=0.42.0' 'timm==0.9.10' boto3 runai-model-streamer runai-model-streamer[s3]; \ + uv pip install accelerate hf_transfer 'modelscope!=1.15.0' 'bitsandbytes>=0.42.0' 'timm==0.9.10' boto3 runai-model-streamer runai-model-streamer[s3]; \ else \ - uv pip install --system accelerate hf_transfer 'modelscope!=1.15.0' 'bitsandbytes>=0.45.0' 'timm==0.9.10' boto3 runai-model-streamer runai-model-streamer[s3]; \ + uv pip install accelerate hf_transfer 'modelscope!=1.15.0' 'bitsandbytes>=0.45.0' 'timm==0.9.10' boto3 runai-model-streamer runai-model-streamer[s3]; \ fi ENV VLLM_USAGE_SOURCE production-docker-image diff --git a/Dockerfile.arm b/Dockerfile.arm index 093ee2209222..bad093684239 100644 --- a/Dockerfile.arm +++ b/Dockerfile.arm @@ -26,18 +26,18 @@ WORKDIR /workspace ARG PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu" ENV PIP_EXTRA_INDEX_URL=${PIP_EXTRA_INDEX_URL} RUN --mount=type=cache,target=/root/.cache/pip \ - --mount=type=bind,src=requirements-build.txt,target=requirements-build.txt \ + --mount=type=bind,src=requirements/build.txt,target=requirements/build.txt \ pip install --upgrade pip && \ - pip install -r requirements-build.txt + pip install -r requirements/build.txt FROM cpu-test-arm AS build WORKDIR /workspace/vllm RUN --mount=type=cache,target=/root/.cache/pip \ - --mount=type=bind,src=requirements-common.txt,target=requirements-common.txt \ - --mount=type=bind,src=requirements-cpu.txt,target=requirements-cpu.txt \ - pip install -v -r requirements-cpu.txt + --mount=type=bind,src=requirements/common.txt,target=requirements/common.txt \ + --mount=type=bind,src=requirements/cpu.txt,target=requirements/cpu.txt \ + pip install -v -r requirements/cpu.txt COPY . . 
ARG GIT_REPO_CHECK=0 diff --git a/Dockerfile.cpu b/Dockerfile.cpu index ebe226cf6d14..08a4e188f4c1 100644 --- a/Dockerfile.cpu +++ b/Dockerfile.cpu @@ -29,18 +29,18 @@ WORKDIR /workspace ARG PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu" ENV PIP_EXTRA_INDEX_URL=${PIP_EXTRA_INDEX_URL} RUN --mount=type=cache,target=/root/.cache/pip \ - --mount=type=bind,src=requirements-build.txt,target=requirements-build.txt \ + --mount=type=bind,src=requirements/build.txt,target=requirements/build.txt \ pip install --upgrade pip && \ - pip install -r requirements-build.txt + pip install -r requirements/build.txt FROM cpu-test-1 AS build WORKDIR /workspace/vllm RUN --mount=type=cache,target=/root/.cache/pip \ - --mount=type=bind,src=requirements-common.txt,target=requirements-common.txt \ - --mount=type=bind,src=requirements-cpu.txt,target=requirements-cpu.txt \ - pip install -v -r requirements-cpu.txt + --mount=type=bind,src=requirements/common.txt,target=requirements/common.txt \ + --mount=type=bind,src=requirements/cpu.txt,target=requirements/cpu.txt \ + pip install -v -r requirements/cpu.txt COPY . . ARG GIT_REPO_CHECK=0 diff --git a/Dockerfile.hpu b/Dockerfile.hpu index 66cf68c32f2c..48211c88f872 100644 --- a/Dockerfile.hpu +++ b/Dockerfile.hpu @@ -4,7 +4,7 @@ COPY ./ /workspace/vllm WORKDIR /workspace/vllm -RUN pip install -v -r requirements-hpu.txt +RUN pip install -v -r requirements/hpu.txt ENV no_proxy=localhost,127.0.0.1 ENV PT_HPU_ENABLE_LAZY_COLLECTIVES=true diff --git a/Dockerfile.neuron b/Dockerfile.neuron index 27658d836d98..067645906366 100644 --- a/Dockerfile.neuron +++ b/Dockerfile.neuron @@ -36,7 +36,7 @@ RUN --mount=type=bind,source=.git,target=.git \ RUN python3 -m pip install -U \ 'cmake>=3.26' ninja packaging 'setuptools-scm>=8' wheel jinja2 \ - -r requirements-neuron.txt + -r requirements/neuron.txt ENV VLLM_TARGET_DEVICE neuron RUN --mount=type=bind,source=.git,target=.git \ diff --git a/Dockerfile.openvino b/Dockerfile.openvino index 32bcbfa9cc16..445c70ab89d4 100644 --- a/Dockerfile.openvino +++ b/Dockerfile.openvino @@ -16,7 +16,7 @@ RUN --mount=type=bind,source=.git,target=.git \ RUN python3 -m pip install -U pip # install build requirements -RUN PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu" python3 -m pip install -r /workspace/requirements-build.txt +RUN PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu" python3 -m pip install -r /workspace/requirements/build.txt # build vLLM with OpenVINO backend RUN PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu" VLLM_TARGET_DEVICE="openvino" python3 -m pip install /workspace diff --git a/Dockerfile.ppc64le b/Dockerfile.ppc64le index c4c1f3e35797..c5ca20d76e3e 100644 --- a/Dockerfile.ppc64le +++ b/Dockerfile.ppc64le @@ -6,7 +6,7 @@ ENV PATH="/usr/local/cargo/bin:$PATH:/opt/conda/bin/" RUN apt-get update -y && apt-get install -y git wget kmod curl vim libnuma-dev libsndfile-dev libprotobuf-dev build-essential ffmpeg libsm6 libxext6 libgl1 libssl-dev -# Some packages in requirements-cpu are installed here +# Some packages in requirements/cpu are installed here # IBM provides optimized packages for ppc64le processors in the open-ce project for mamba # Currently these may not be available for venv or pip directly RUN micromamba install -y -n base -c https://ftp.osuosl.org/pub/open-ce/1.11.0-p10/ -c defaults python=3.10 rust && micromamba clean --all --yes @@ -21,7 +21,7 @@ RUN --mount=type=bind,source=.git,target=.git \ RUN --mount=type=cache,target=/root/.cache/pip \ RUSTFLAGS='-L /opt/conda/lib' pip 
install -v --prefer-binary --extra-index-url https://repo.fury.io/mgiessing \ 'cmake>=3.26' ninja packaging 'setuptools-scm>=8' wheel jinja2 \ - -r requirements-cpu.txt \ + -r requirements/cpu.txt \ xformers uvloop==0.20.0 RUN --mount=type=bind,source=.git,target=.git \ diff --git a/Dockerfile.rocm b/Dockerfile.rocm index 009e929ebace..d2c2d3b14678 100644 --- a/Dockerfile.rocm +++ b/Dockerfile.rocm @@ -38,14 +38,14 @@ FROM fetch_vllm AS build_vllm ARG USE_CYTHON # Build vLLM RUN cd vllm \ - && python3 -m pip install -r requirements-rocm.txt \ + && python3 -m pip install -r requirements/rocm.txt \ && python3 setup.py clean --all \ && if [ ${USE_CYTHON} -eq "1" ]; then python3 setup_cython.py build_ext --inplace; fi \ && python3 setup.py bdist_wheel --dist-dir=dist FROM scratch AS export_vllm ARG COMMON_WORKDIR COPY --from=build_vllm ${COMMON_WORKDIR}/vllm/dist/*.whl / -COPY --from=build_vllm ${COMMON_WORKDIR}/vllm/requirements*.txt / +COPY --from=build_vllm ${COMMON_WORKDIR}/vllm/requirements /requirements COPY --from=build_vllm ${COMMON_WORKDIR}/vllm/benchmarks /benchmarks COPY --from=build_vllm ${COMMON_WORKDIR}/vllm/tests /tests COPY --from=build_vllm ${COMMON_WORKDIR}/vllm/examples /examples @@ -60,7 +60,7 @@ RUN python3 -m pip install --upgrade pip && rm -rf /var/lib/apt/lists/* # Install vLLM RUN --mount=type=bind,from=export_vllm,src=/,target=/install \ cd /install \ - && pip install -U -r requirements-rocm.txt \ + && pip install -U -r requirements/rocm.txt \ && pip uninstall -y vllm \ && pip install *.whl @@ -99,7 +99,7 @@ RUN if [ ${BUILD_RPD} -eq "1" ]; then \ # Install vLLM RUN --mount=type=bind,from=export_vllm,src=/,target=/install \ cd /install \ - && pip install -U -r requirements-rocm.txt \ + && pip install -U -r requirements/rocm.txt \ && pip uninstall -y vllm \ && pip install *.whl diff --git a/Dockerfile.s390x b/Dockerfile.s390x new file mode 100644 index 000000000000..5a84dc12d8f7 --- /dev/null +++ b/Dockerfile.s390x @@ -0,0 +1,152 @@ +# Base UBI image for s390x architecture +ARG BASE_UBI_IMAGE_TAG=9.5-1736404155 +ARG PYTHON_VERSION=3.12 +FROM registry.access.redhat.com/ubi9/ubi-minimal:${BASE_UBI_IMAGE_TAG} AS base + +# Install basic dependencies +ARG PYTHON_VERSION +ENV PYTHON_VERSION=${PYTHON_VERSION} + +WORKDIR /workspace + +ENV LANG=C.UTF-8 \ + LC_ALL=C.UTF-8 + +# Install development utilities +RUN microdnf install -y \ + which procps findutils tar vim git gcc gcc-gfortran g++ make patch zlib-devel \ + libjpeg-turbo-devel libtiff-devel libpng-devel libwebp-devel freetype-devel harfbuzz-devel \ + openssl-devel openblas openblas-devel autoconf automake libtool cmake && \ + microdnf clean all + +# Python Installation +FROM base AS python-install +ARG PYTHON_VERSION + +ENV VIRTUAL_ENV=/opt/vllm +ENV PATH="$VIRTUAL_ENV/bin:$PATH" +ENV PYTHON_VERSION=${PYTHON_VERSION} +RUN microdnf install -y \ + python${PYTHON_VERSION}-devel python${PYTHON_VERSION}-pip python${PYTHON_VERSION}-wheel && \ + python${PYTHON_VERSION} -m venv $VIRTUAL_ENV && pip install --no-cache -U pip wheel uv && microdnf clean all + +FROM python-install AS pyarrow + +# Build Apache Arrow +WORKDIR /tmp +RUN --mount=type=cache,target=/root/.cache/uv \ + git clone https://github.com/apache/arrow.git && \ + cd arrow/cpp && \ + mkdir release && cd release && \ + cmake -DCMAKE_BUILD_TYPE=Release \ + -DCMAKE_INSTALL_PREFIX=/usr/local \ + -DARROW_PYTHON=ON \ + -DARROW_PARQUET=ON \ + -DARROW_ORC=ON \ + -DARROW_FILESYSTEM=ON \ + -DARROW_WITH_LZ4=ON \ + -DARROW_WITH_ZSTD=ON \ + -DARROW_WITH_SNAPPY=ON \ + 
-DARROW_JSON=ON \ + -DARROW_CSV=ON \ + -DARROW_DATASET=ON \ + -DPROTOBUF_PROTOC_EXECUTABLE=/usr/bin/protoc \ + -DARROW_DEPENDENCY_SOURCE=BUNDLED \ + .. && \ + make -j$(nproc) && \ + make install && \ + cd ../../python && \ + export PYARROW_PARALLEL=4 && \ + export ARROW_BUILD_TYPE=release && \ + uv pip install -r requirements/build.txt && \ + python setup.py build_ext --build-type=$ARROW_BUILD_TYPE --bundle-arrow-cpp bdist_wheel + +FROM python-install AS numa-build +# Install numactl (needed for numa.h dependency) +WORKDIR /tmp +RUN curl -LO https://github.com/numactl/numactl/archive/refs/tags/v2.0.16.tar.gz && \ + tar -xvzf v2.0.16.tar.gz && \ + cd numactl-2.0.16 && \ + ./autogen.sh && \ + ./configure && \ + make + +# Set include path +ENV C_INCLUDE_PATH="/usr/local/include:$C_INCLUDE_PATH" + +FROM python-install AS rust +ENV CARGO_HOME=/root/.cargo +ENV RUSTUP_HOME=/root/.rustup +ENV PATH="$CARGO_HOME/bin:$RUSTUP_HOME/bin:$PATH" + +RUN curl https://sh.rustup.rs -sSf | sh -s -- -y && \ + . "$CARGO_HOME/env" && \ + rustup default stable && \ + rustup show + +FROM python-install AS torch-vision +# Install torchvision +ARG TORCH_VERSION=2.7.0.dev20250304 +ARG TORCH_VISION_VERSION=v0.20.1 +WORKDIR /tmp +RUN --mount=type=cache,target=/root/.cache/uv \ + git clone https://github.com/pytorch/vision.git && \ + cd vision && \ + git checkout $TORCH_VISION_VERSION && \ + uv pip install -v torch==${TORCH_VERSION} --extra-index-url https://download.pytorch.org/whl/nightly/cpu && \ + python setup.py bdist_wheel + +# Final build stage +FROM python-install AS vllm-cpu +ARG PYTHON_VERSION + +# Set correct library path for torch and numactl +ENV LD_LIBRARY_PATH="/opt/vllm/lib64/python${PYTHON_VERSION}/site-packages/torch/lib:/usr/local/lib:$LD_LIBRARY_PATH" +ENV C_INCLUDE_PATH="/usr/local/include:$C_INCLUDE_PATH" +ENV UV_LINK_MODE=copy +ENV CARGO_HOME=/root/.cargo +ENV RUSTUP_HOME=/root/.rustup +ENV PATH="$CARGO_HOME/bin:$RUSTUP_HOME/bin:$PATH" + +COPY . 
/workspace/vllm +WORKDIR /workspace/vllm + +RUN --mount=type=bind,from=numa-build,src=/tmp/numactl-2.0.16,target=/numactl \ + make -C /numactl install + +# Install dependencies, including PyTorch and Apache Arrow +RUN --mount=type=cache,target=/root/.cache/uv \ + --mount=type=bind,from=rust,source=/root/.cargo,target=/root/.cargo,rw \ + --mount=type=bind,from=rust,source=/root/.rustup,target=/root/.rustup,rw \ + --mount=type=bind,from=pyarrow,source=/tmp/arrow/python/dist,target=/tmp/arrow-wheels \ + --mount=type=bind,from=torch-vision,source=/tmp/vision/dist,target=/tmp/vision-wheels/ \ + sed -i '/^torch/d' requirements/build.txt && \ + ARROW_WHL_FILE=$(ls /tmp/arrow-wheels/pyarrow-*.whl | head -n 1) && \ + VISION_WHL_FILE=$(ls /tmp/vision-wheels/*.whl | head -n 1) && \ + uv pip install -v \ + $ARROW_WHL_FILE \ + $VISION_WHL_FILE \ + --extra-index-url https://download.pytorch.org/whl/nightly/cpu \ + --index-strategy unsafe-best-match \ + -r requirements/build.txt \ + -r requirements/cpu.txt + +# Build and install vllm +RUN --mount=type=cache,target=/root/.cache/uv \ + VLLM_TARGET_DEVICE=cpu python setup.py bdist_wheel && \ + uv pip install "$(echo dist/*.whl)[tensorizer]" + +# setup non-root user for vllm +RUN umask 002 && \ + useradd --uid 2000 --gid 0 vllm && \ + mkdir -p /home/vllm && \ + chmod g+rwx /home/vllm + +COPY LICENSE /licenses/vllm.md +COPY examples/*.jinja /app/data/template/ + +USER 2000 +WORKDIR /home/vllm + +# Set the default entrypoint +ENTRYPOINT ["python", "-m", "vllm.entrypoints.openai.api_server"] \ No newline at end of file diff --git a/Dockerfile.tpu b/Dockerfile.tpu index e268b3947666..960dc8e9ed9b 100644 --- a/Dockerfile.tpu +++ b/Dockerfile.tpu @@ -19,7 +19,7 @@ ENV VLLM_TARGET_DEVICE="tpu" RUN --mount=type=cache,target=/root/.cache/pip \ --mount=type=bind,source=.git,target=.git \ python3 -m pip install \ - -r requirements-tpu.txt + -r requirements/tpu.txt RUN python3 setup.py develop # install development dependencies (for testing) diff --git a/Dockerfile.xpu b/Dockerfile.xpu index a374f20d7d94..530809bcd4df 100644 --- a/Dockerfile.xpu +++ b/Dockerfile.xpu @@ -25,12 +25,12 @@ RUN apt-get update -y && \ wget WORKDIR /workspace/vllm -COPY requirements-xpu.txt /workspace/vllm/requirements-xpu.txt -COPY requirements-common.txt /workspace/vllm/requirements-common.txt +COPY requirements/xpu.txt /workspace/vllm/requirements/xpu.txt +COPY requirements/common.txt /workspace/vllm/requirements/common.txt RUN --mount=type=cache,target=/root/.cache/pip \ pip install --no-cache-dir \ - -r requirements-xpu.txt + -r requirements/xpu.txt RUN git clone https://github.com/intel/pti-gpu && \ cd pti-gpu/sdk && \ diff --git a/MANIFEST.in b/MANIFEST.in index 82be639ef4d7..82fd22b845f0 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -1,9 +1,9 @@ include LICENSE -include requirements-common.txt -include requirements-cuda.txt -include requirements-rocm.txt -include requirements-neuron.txt -include requirements-cpu.txt +include requirements/common.txt +include requirements/cuda.txt +include requirements/rocm.txt +include requirements/neuron.txt +include requirements/cpu.txt include CMakeLists.txt recursive-include cmake * diff --git a/README.md b/README.md index f22a1f9c5c80..49d6d525161d 100644 --- a/README.md +++ b/README.md @@ -15,12 +15,17 @@ Easy, fast, and cheap LLM serving for everyone --- -We are excited to invite you to our Menlo Park meetup with Meta, evening of Thursday, February 27! 
Meta engineers will discuss the improvements on top of vLLM, and vLLM contributors will share updates from the v0.7.x series of releases. [Register Now](https://lu.ma/h7g3kuj9) +We're excited to invite you to the first **vLLM China Meetup** on **March 16** in **Beijing**! + +Join us to connect with the **vLLM team** and explore how vLLM is leveraged in **post-training, fine-tuning, and deployment**, including [verl](https://github.com/volcengine/verl), [LLaMA-Factory](https://github.com/hiyouga/LLaMA-Factory), and [vllm-ascend](https://github.com/vllm-project/vllm-ascend). + +👉 **[Register Now](https://mp.weixin.qq.com/s/n77GibL2corAtQHtVEAzfg)** to be part of the discussion! --- *Latest News* 🔥 +- [2025/02] We hosted [the ninth vLLM meetup](https://lu.ma/h7g3kuj9) with Meta! Please find the meetup slides from vLLM team [here](https://docs.google.com/presentation/d/1jzC_PZVXrVNSFVCW-V4cFXb6pn7zZ2CyP_Flwo05aqg/edit?usp=sharing) and AMD [here](https://drive.google.com/file/d/1Zk5qEJIkTmlQ2eQcXQZlljAx3m9s7nwn/view?usp=sharing). The slides from Meta will not be posted. - [2025/01] We are excited to announce the alpha release of vLLM V1: A major architectural upgrade with 1.7x speedup! Clean code, optimized execution loop, zero-overhead prefix caching, enhanced multimodal support, and more. Please check out our blog post [here](https://blog.vllm.ai/2025/01/27/v1-alpha-release.html). - [2025/01] We hosted [the eighth vLLM meetup](https://lu.ma/zep56hui) with Google Cloud! Please find the meetup slides from vLLM team [here](https://docs.google.com/presentation/d/1epVkt4Zu8Jz_S5OhEHPc798emsYh2BwYfRuDDVEF7u4/edit?usp=sharing), and Google Cloud team [here](https://drive.google.com/file/d/1h24pHewANyRL11xy5dXUbvRC9F9Kkjix/view?usp=sharing). - [2024/12] vLLM joins [pytorch ecosystem](https://pytorch.org/blog/vllm-joins-pytorch)! Easy, Fast, and Cheap LLM Serving for Everyone! diff --git a/benchmarks/backend_request_func.py b/benchmarks/backend_request_func.py index 158705769b5e..d53428d219e7 100644 --- a/benchmarks/backend_request_func.py +++ b/benchmarks/backend_request_func.py @@ -27,7 +27,6 @@ class RequestFuncInput: output_len: int model: str model_name: Optional[str] = None - best_of: int = 1 logprobs: Optional[int] = None extra_body: Optional[dict] = None multi_modal_content: Optional[dict] = None @@ -58,7 +57,6 @@ async def async_request_tgi( async with aiohttp.ClientSession(trust_env=True, timeout=AIOHTTP_TIMEOUT) as session: params = { - "best_of": request_func_input.best_of, "max_new_tokens": request_func_input.output_len, "do_sample": True, "temperature": 0.01, # TGI does not accept 0.0 temperature. 
@@ -130,7 +128,6 @@ async def async_request_trt_llm( async with aiohttp.ClientSession(trust_env=True, timeout=AIOHTTP_TIMEOUT) as session: - assert request_func_input.best_of == 1 payload = { "accumulate_tokens": True, "text_input": request_func_input.prompt, @@ -195,7 +192,6 @@ async def async_request_deepspeed_mii( ) -> RequestFuncOutput: async with aiohttp.ClientSession(trust_env=True, timeout=AIOHTTP_TIMEOUT) as session: - assert request_func_input.best_of == 1 payload = { "prompt": request_func_input.prompt, @@ -249,7 +245,6 @@ async def async_request_openai_completions( if request_func_input.model_name else request_func_input.model, "prompt": request_func_input.prompt, "temperature": 0.0, - "best_of": request_func_input.best_of, "max_tokens": request_func_input.output_len, "logprobs": request_func_input.logprobs, "stream": True, diff --git a/benchmarks/benchmark_dataset.py b/benchmarks/benchmark_dataset.py new file mode 100644 index 000000000000..30fffdda491d --- /dev/null +++ b/benchmarks/benchmark_dataset.py @@ -0,0 +1,667 @@ +# SPDX-License-Identifier: Apache-2.0 +""" +This module defines a framework for sampling benchmark requests from various +datasets. Each dataset subclass of BenchmarkDataset must implement sample +generation. Supported dataset types include: + - ShareGPT + - Random (synthetic) + - Sonnet + - BurstGPT + - HuggingFace + - VisionArena + +TODO: Implement CustomDataset to parse a JSON file and convert its contents into +SampleRequest instances, similar to the approach used in ShareGPT. +""" + +import base64 +import io +import json +import random +from abc import ABC, abstractmethod +from collections.abc import Mapping +from dataclasses import dataclass +from functools import cache +from typing import Any, Optional, Union + +import numpy as np +import pandas as pd +from datasets import load_dataset +from PIL import Image +from transformers import PreTrainedTokenizerBase + +from vllm.lora.request import LoRARequest +from vllm.lora.utils import get_adapter_absolute_path +from vllm.multimodal import MultiModalDataDict +from vllm.transformers_utils.tokenizer import AnyTokenizer, get_lora_tokenizer + +# ----------------------------------------------------------------------------- +# Data Classes +# ----------------------------------------------------------------------------- + + +@dataclass +class SampleRequest: + """ + Represents a single inference request for benchmarking. + """ + + prompt: str + prompt_len: int + expected_output_len: int + multi_modal_data: Optional[Union[MultiModalDataDict, dict]] = None + lora_request: Optional[LoRARequest] = None + + +# ----------------------------------------------------------------------------- +# Benchmark Dataset Base Class +# ----------------------------------------------------------------------------- + + +class BenchmarkDataset(ABC): + DEFAULT_SEED = 0 + + # num_requests has default 1000 in both the benchmark_serving.py and + # benchmark_throughput.py + + def __init__( + self, + dataset_path: Optional[str] = None, + random_seed: int = DEFAULT_SEED, + ) -> None: + """ + Initialize the BenchmarkDataset with an optional dataset path and random + seed. Args: + dataset_path (Optional[str]): Path to the dataset. If None, it + indicates that a default or random dataset might be used. + random_seed (int): Seed value for reproducible shuffling or + sampling. Defaults to DEFAULT_SEED. + """ + self.dataset_path = dataset_path + # Set the random seed, ensuring that a None value is replaced with the + # default seed. 
+ self.random_seed = (random_seed + if random_seed is not None else self.DEFAULT_SEED) + self.data = None + + def load_data(self) -> None: + """ + Load data from the dataset path into self.data. + + This method must be overridden by subclasses since the method to load + data will vary depending on the dataset format and source. + + Raises: + NotImplementedError: If a subclass does not implement this method. + """ + # TODO (jenniferzhao): add support for downloading data + raise NotImplementedError( + "load_data must be implemented in subclasses.") + + def get_random_lora_request( + self, + tokenizer: PreTrainedTokenizerBase, + max_loras: Optional[int] = None, + lora_path: Optional[str] = None, + ) -> tuple[Optional[LoRARequest], AnyTokenizer]: + """ + Optionally select a random LoRA request and return its associated + tokenizer. + + This method is used when LoRA parameters are provided. It randomly + selects a LoRA based on max_loras and retrieves a cached tokenizer for + that LoRA if available. Otherwise, it returns the base tokenizer. + + Args: + tokenizer (PreTrainedTokenizerBase): The base tokenizer to use if no + LoRA is selected. max_loras (Optional[int]): The maximum number of + LoRAs available. If None, LoRA is not used. lora_path + (Optional[str]): Path to the LoRA parameters on disk. If None, LoRA + is not used. + + Returns: + tuple[Optional[LoRARequest], AnyTokenizer]: A tuple where the first + element is a LoRARequest (or None if not applicable) and the second + element is the tokenizer associated with the LoRA request (or the + base tokenizer). + """ + if max_loras is None or lora_path is None: + return None, tokenizer + + # Generate a random LoRA ID in the range [1, max_loras]. + lora_id = random.randint(1, max_loras) + lora_request = LoRARequest( + lora_name=str(lora_id), + lora_int_id=lora_id, + lora_path=lora_path_on_disk(lora_path), + ) + if lora_id not in lora_tokenizer_cache: + lora_tokenizer_cache[lora_id] = get_lora_tokenizer(lora_request) + # Return lora_request and the cached tokenizer if available; otherwise, + # return the base tokenizer + return lora_request, lora_tokenizer_cache[lora_id] or tokenizer + + @abstractmethod + def sample(self, tokenizer: PreTrainedTokenizerBase, + num_requests: int) -> list[SampleRequest]: + """ + Abstract method to generate sample requests from the dataset. + + Subclasses must override this method to implement dataset-specific logic + for generating a list of SampleRequest objects. + + Args: + tokenizer (PreTrainedTokenizerBase): The tokenizer to be used + for processing the dataset's text. + num_requests (int): The number of sample requests to generate. + + Returns: + list[SampleRequest]: A list of sample requests generated from the + dataset. + """ + raise NotImplementedError("sample must be implemented in subclasses.") + + +# ----------------------------------------------------------------------------- +# Utility Functions and Global Caches +# ----------------------------------------------------------------------------- + + +def is_valid_sequence( + prompt_len: int, + output_len: int, + min_len: int = 4, + max_prompt_len: int = 1024, + max_total_len: int = 2048, + skip_min_output_len_check: bool = False, +) -> bool: + """ + Validate a sequence based on prompt and output lengths. + + Default pruning criteria are copied from the original `sample_hf_requests` + and `sample_sharegpt_requests` functions in benchmark_serving.py, as well as + from `sample_requests` in benchmark_throughput.py. 
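+
+    As a concrete illustration of the default limits (an example only, not an
+    exhaustive specification): a (prompt_len=512, output_len=64) pair passes;
+    a (prompt_len=2048, output_len=64) pair is rejected because the prompt
+    exceeds max_prompt_len (and the combined length exceeds max_total_len);
+    and a (prompt_len=512, output_len=2) pair is rejected for a too-short
+    output unless skip_min_output_len_check=True.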
+ """ + # Check for invalid conditions + prompt_too_short = prompt_len < min_len + output_too_short = (not skip_min_output_len_check) and (output_len + < min_len) + prompt_too_long = prompt_len > max_prompt_len + combined_too_long = (prompt_len + output_len) > max_total_len + + # Return True if none of the invalid conditions are met + return not (prompt_too_short or output_too_short or prompt_too_long + or combined_too_long) + + +@cache +def lora_path_on_disk(lora_path: str) -> str: + return get_adapter_absolute_path(lora_path) + + +# Global cache for LoRA tokenizers. +lora_tokenizer_cache: dict[int, AnyTokenizer] = {} + + +def process_image(image: Any) -> Mapping[str, Any]: + """ + Process a single image input and return a multimedia content dictionary. + + For a PIL.Image.Image input: + - Converts the image to RGB. + - Saves the image as a JPEG in-memory. + - Encodes the JPEG data as a base64 string. + - Returns a dictionary with the image as a base64 data URL. + + For a string input: + - Treats the string as a URL or file path. + - Prepends "file://" if the string doesn't start with "http://" or + "file://". + - Returns a dictionary with the image URL. + + Raises: + ValueError: If the input is neither a PIL.Image.Image nor a string. + """ + if isinstance(image, Image.Image): + image = image.convert("RGB") + with io.BytesIO() as image_data: + image.save(image_data, format="JPEG") + image_base64 = base64.b64encode( + image_data.getvalue()).decode("utf-8") + return { + "type": "image_url", + "image_url": { + "url": f"data:image/jpeg;base64,{image_base64}" + }, + } + + if isinstance(image, str): + image_url = (image if image.startswith( + ("http://", "file://")) else f"file://{image}") + return {"type": "image_url", "image_url": {"url": image_url}} + + raise ValueError( + f"Invalid image input {image}. Must be a PIL.Image.Image or str.") + + +# ----------------------------------------------------------------------------- +# Random Dataset Implementation (Synthetic Data) +# ----------------------------------------------------------------------------- + + +class RandomDataset(BenchmarkDataset): + # Default values copied from benchmark_serving.py for the random dataset. 
+ DEFAULT_PREFIX_LEN = 0 + DEFAULT_RANGE_RATIO = 1.0 + DEFAULT_INPUT_LEN = 1024 + DEFAULT_OUTPUT_LEN = 128 + + def __init__( + self, + **kwargs, + ) -> None: + super().__init__(**kwargs) + + def sample(self, + tokenizer: PreTrainedTokenizerBase, + num_requests: int, + prefix_len: int = DEFAULT_PREFIX_LEN, + range_ratio: float = DEFAULT_RANGE_RATIO, + input_len: int = DEFAULT_INPUT_LEN, + output_len: int = DEFAULT_OUTPUT_LEN, + **kwargs) -> list[SampleRequest]: + + vocab_size = tokenizer.vocab_size + + prefix_token_ids = (np.random.randint( + 0, vocab_size, size=prefix_len).tolist() if prefix_len > 0 else []) + + input_low = int(input_len * range_ratio) + output_low = int(output_len * range_ratio) + + input_lens = np.random.randint(input_low, + input_len + 1, + size=num_requests) + output_lens = np.random.randint(output_low, + output_len + 1, + size=num_requests) + offsets = np.random.randint(0, vocab_size, size=num_requests) + + requests = [] + for i in range(num_requests): + inner_seq = ((offsets[i] + i + np.arange(input_lens[i])) % + vocab_size).tolist() + token_sequence = prefix_token_ids + inner_seq + prompt = tokenizer.decode(token_sequence) + total_input_len = prefix_len + int(input_lens[i]) + requests.append( + SampleRequest( + prompt=prompt, + prompt_len=total_input_len, + expected_output_len=int(output_lens[i]), + )) + return requests + + +# ----------------------------------------------------------------------------- +# ShareGPT Dataset Implementation +# ----------------------------------------------------------------------------- + + +class ShareGPTDataset(BenchmarkDataset): + """ + Implements the ShareGPT dataset. Loads data from a JSON file and generates + sample requests based on conversation turns. + """ + + def __init__(self, **kwargs) -> None: + super().__init__(**kwargs) + self.load_data() + + def load_data(self) -> None: + if self.dataset_path is None: + raise ValueError("dataset_path must be provided for loading data.") + + with open(self.dataset_path, encoding="utf-8") as f: + self.data = json.load(f) + # Filter entries with at least two conversation turns. 
+ self.data = [ + entry for entry in self.data + if "conversations" in entry and len(entry["conversations"]) >= 2 + ] + random.seed(self.random_seed) + random.shuffle(self.data) + + def sample(self, + tokenizer: PreTrainedTokenizerBase, + num_requests: int, + lora_path: Optional[str] = None, + max_loras: Optional[int] = None, + output_len: Optional[int] = None, + **kwargs) -> list: + samples: list = [] + for entry in self.data: + if len(samples) >= num_requests: + break + prompt, completion = entry["conversations"][0]["value"],\ + entry["conversations"][1]["value"] + + lora_request, tokenizer = self.get_random_lora_request( + tokenizer=tokenizer, max_loras=max_loras, lora_path=lora_path) + prompt_ids = tokenizer(prompt).input_ids + completion_ids = tokenizer(completion).input_ids + prompt_len = len(prompt_ids) + new_output_len = (len(completion_ids) + if output_len is None else output_len) + if not is_valid_sequence(prompt_len, + new_output_len, + skip_min_output_len_check=output_len + is not None): + continue + samples.append( + SampleRequest( + prompt=prompt, + prompt_len=prompt_len, + expected_output_len=new_output_len, + lora_request=lora_request, + )) + return samples + + +# ----------------------------------------------------------------------------- +# Sonnet Dataset Implementation +# ----------------------------------------------------------------------------- + + +class SonnetDataset(BenchmarkDataset): + """ + Simplified implementation of the Sonnet dataset. Loads poem lines from a + text file and generates sample requests. Default values here copied from + `benchmark_serving.py` for the sonnet dataset. + """ + + DEFAULT_PREFIX_LEN = 200 + DEFAULT_INPUT_LEN = 550 + DEFAULT_OUTPUT_LEN = 150 + + def __init__( + self, + **kwargs, + ) -> None: + super().__init__(**kwargs) + self.load_data() + + def load_data(self) -> None: + if not self.dataset_path: + raise ValueError("dataset_path must be provided.") + with open(self.dataset_path, encoding="utf-8") as f: + self.data = f.readlines() + + def sample(self, + tokenizer, + num_requests: int, + prefix_len: int = DEFAULT_PREFIX_LEN, + input_len: int = DEFAULT_INPUT_LEN, + output_len: int = DEFAULT_OUTPUT_LEN, + return_prompt_formatted: bool = False, + **kwargs) -> list: + # Calculate average token length for a poem line. + tokenized_lines = [tokenizer(line).input_ids for line in self.data] + avg_len = sum(len(tokens) + for tokens in \ + tokenized_lines) / len(tokenized_lines) + + # Build the base prompt. + base_prompt = "Pick as many lines as you can from these poem lines:\n" + base_msg = [{"role": "user", "content": base_prompt}] + base_fmt = tokenizer.apply_chat_template(base_msg, + add_generation_prompt=True, + tokenize=False) + base_offset = len(tokenizer(base_fmt).input_ids) + if input_len <= base_offset: + raise ValueError( + f"'input_len' must be higher than the base prompt length " + f"({base_offset}).") + + # Determine how many poem lines to use. 
+ num_input_lines = round((input_len - base_offset) / avg_len) + num_prefix_lines = round((prefix_len - base_offset) / avg_len) + prefix_lines = self.data[:num_prefix_lines] + + samples = [] + for _ in range(num_requests): + extra_lines = random.choices(self.data, + k=num_input_lines - num_prefix_lines) + prompt = f"{base_prompt}{''.join(prefix_lines + extra_lines)}" + msg = [{"role": "user", "content": prompt}] + prompt_formatted = tokenizer.apply_chat_template( + msg, add_generation_prompt=True, tokenize=False) + prompt_len = len(tokenizer(prompt_formatted).input_ids) + samples.append( + SampleRequest( + prompt=prompt_formatted + if return_prompt_formatted else prompt, + prompt_len=prompt_len, + expected_output_len=output_len, + )) + return samples + + +# ----------------------------------------------------------------------------- +# BurstGPT Dataset Implementation +# ----------------------------------------------------------------------------- + + +class BurstGPTDataset(BenchmarkDataset): + """ + Implements the BurstGPT dataset. Loads data from a CSV file and generates + sample requests based on synthetic prompt generation. Only rows with Model + "GPT-4" and positive response tokens are used. + """ + + def __init__(self, **kwargs) -> None: + super().__init__(**kwargs) + self.load_data() + + def load_data(self, ): + if self.dataset_path is None: + raise ValueError("dataset_path must be provided for loading data.") + + df = pd.read_csv(self.dataset_path) + # Filter to keep only GPT-4 rows. + gpt4_df = df[df["Model"] == "GPT-4"] + # Remove failed requests (where Response tokens is 0 or less). + gpt4_df = gpt4_df[gpt4_df["Response tokens"] > 0] + # Sample the desired number of rows. + self.data = gpt4_df + + def _sample_loaded_data(self, num_requests: int) -> list: + if num_requests <= len(self.data): + data = self.data.sample(n=num_requests, + random_state=self.random_seed) + else: + data = self.data.sample( + n=num_requests, + random_state=self.random_seed, + replace=True, + ) + # Convert the dataframe to a list of lists. + return data.values.tolist() + + def sample(self, + tokenizer: PreTrainedTokenizerBase, + num_requests: int, + max_loras: Optional[int] = None, + lora_path: Optional[str] = None, + **kwargs) -> list[SampleRequest]: + samples = [] + data = self._sample_loaded_data(num_requests=num_requests) + for i in range(num_requests): + input_len = int(data[i][2]) + output_len = int(data[i][3]) + lora_req, tokenizer = self.get_random_lora_request( + tokenizer=tokenizer, max_loras=max_loras, lora_path=lora_path) + vocab_size = tokenizer.vocab_size + # Generate a synthetic prompt: a list of token IDs computed as (i + + # j) modulo vocab_size. + token_ids = [(i + j) % vocab_size for j in range(input_len)] + prompt = tokenizer.decode(token_ids) + samples.append( + SampleRequest( + prompt=prompt, + prompt_len=input_len, + expected_output_len=output_len, + lora_request=lora_req, + )) + return samples + + +# ----------------------------------------------------------------------------- +# HuggingFace Dataset Implementation +# ----------------------------------------------------------------------------- + + +class HuggingFaceDataset(BenchmarkDataset): + """ + Dataset class for processing a HuggingFace dataset with conversation data + and optional images. 
+ """ + DEFAULT_NUM_REQUESTS = 1000 + + def __init__( + self, + dataset_split: str, + dataset_subset: Optional[str] = None, + **kwargs, + ) -> None: + super().__init__(**kwargs) + self.dataset_split = dataset_split + self.dataset_subset = dataset_subset + + self.load_data() + + def load_data(self) -> None: + if not self.dataset_path: + raise ValueError("dataset_path must be provided for loading data.") + + self.data = load_dataset( + self.dataset_path, + name=self.dataset_subset, + split=self.dataset_split, + streaming=True, + ) + + if "conversations" not in self.data.features: + raise ValueError("HF Dataset must have a 'conversations' column.") + + # Shuffle and filter examples with at least 2 conversations. + self.data = self.data.shuffle(seed=self.random_seed).filter( + lambda x: len(x["conversations"]) >= 2) + + def sample(self, + tokenizer: PreTrainedTokenizerBase, + num_requests: int, + lora_path: Optional[str] = None, + max_loras: Optional[int] = None, + output_len: Optional[int] = None, + **kwargs) -> list: + sampled_requests = [] + dynamic_output = output_len is None + + for item in self.data: + if len(sampled_requests) >= num_requests: + break + + conv = item["conversations"] + prompt, completion = conv[0]["value"], conv[1]["value"] + + lora_request, tokenizer = self.get_random_lora_request( + tokenizer, lora_path=lora_path, max_loras=max_loras) + + prompt_ids = tokenizer(prompt).input_ids + completion_ids = tokenizer(completion).input_ids + prompt_len = len(prompt_ids) + completion_len = len(completion_ids) + output_len = completion_len if dynamic_output else output_len + assert isinstance(output_len, int) and output_len > 0 + if dynamic_output and not is_valid_sequence( + prompt_len, completion_len): + continue + + mm_content = process_image( + item["image"]) if "image" in item else None + sampled_requests.append( + SampleRequest( + prompt=prompt, + prompt_len=prompt_len, + expected_output_len=output_len, + multi_modal_data=mm_content, + lora_request=lora_request, + )) + return sampled_requests + + +# ----------------------------------------------------------------------------- +# Vision Arena Dataset Implementation +# ----------------------------------------------------------------------------- + + +class VisionArenaDataset(BenchmarkDataset): + """ + Vision Arena Dataset. 
+ """ + + DEFAULT_OUTPUT_LEN = 128 + DEFAULT_NUM_REQUESTS = 1000 + VISION_ARENA_DATASET_PATH = "lmarena-ai/vision-arena-bench-v0.1" + + def __init__( + self, + dataset_split: str, + dataset_subset: Optional[str] = None, + **kwargs, + ) -> None: + super().__init__(**kwargs) + self.dataset_split = dataset_split + self.dataset_subset = dataset_subset + + if self.dataset_path != self.VISION_ARENA_DATASET_PATH: + raise ValueError(f"Only support Vision Arena dataset.\ + This data path {self.dataset_path} is not valid.") + if self.dataset_subset is None and self.dataset_split != "train": + raise ValueError("Dataset split must be 'train'.") + + self.load_data() + + def load_data(self) -> None: + dataset = load_dataset( + self.dataset_path, + name=self.dataset_subset, + split=self.dataset_split, + streaming=True, + ) + self.data = dataset.shuffle(seed=self.random_seed) + + def sample(self, + tokenizer: PreTrainedTokenizerBase, + num_requests: int, + output_len: int = DEFAULT_OUTPUT_LEN, + **kwargs) -> list: + # TODO (jenniferzhao): Add support for offline benchmark sampling + output_len = (output_len + if output_len is not None else self.DEFAULT_OUTPUT_LEN) + sampled_requests = [] + for item in self.data: + if len(sampled_requests) >= num_requests: + break + prompt = item["turns"][0][0]["content"] + prompt_len = len(tokenizer(prompt).input_ids) + mm_content = process_image(item["images"][0]) + sampled_requests.append( + SampleRequest( + prompt=prompt, + prompt_len=prompt_len, + expected_output_len=output_len, + multi_modal_data=mm_content, + )) + return sampled_requests diff --git a/benchmarks/benchmark_guided.py b/benchmarks/benchmark_guided.py deleted file mode 100644 index 2e0f6c6b5d20..000000000000 --- a/benchmarks/benchmark_guided.py +++ /dev/null @@ -1,507 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -"""Benchmark guided decoding throughput.""" -import argparse -import dataclasses -import json -import os -import random -import time - -import datasets -import pandas as pd -import uvloop -from transformers import AutoTokenizer, PreTrainedTokenizerBase - -from vllm.engine.arg_utils import AsyncEngineArgs, EngineArgs -from vllm.entrypoints.openai.api_server import ( - build_async_engine_client_from_engine_args) -from vllm.sampling_params import GuidedDecodingParams -from vllm.utils import FlexibleArgumentParser, merge_async_iterators - - -@dataclasses.dataclass -class SampleRequest: - """A class representing a single inference request for benchmarking. - - Attributes: - prompt: The input text prompt for the model. - multi_modal_data: Optional dictionary containing multi-modal data (e.g. - images). - prompt_len: The length of the prompt in tokens. - expected_output_len: The expected length of the output in tokens. - """ - prompt: str - prompt_len: int - expected_output_len: int - schema: dict - structure_type: str = 'json' - completion: str = None - - -def run_vllm(requests: list[SampleRequest], - engine_args: EngineArgs, - n: int, - guided_decoding_rate: float = 1.0, - warmup: bool = False) -> float: - from vllm import LLM, SamplingParams - llm = LLM(**vars(engine_args)) - assert all( - llm.llm_engine.model_config.max_model_len >= ( - request.prompt_len + request.expected_output_len) - for request in requests), ( - "Please ensure that max_model_len is greater than the sum of" - " prompt_len and expected_output_len for all requests.") - - # Add the requests to the engine. 
- prompts: list[str] = [] - sampling_params: list[SamplingParams] = [] - # create a list containing random selected true or false - guided_decoding_req_idx = random.sample( - range(len(requests)), int(len(requests) * guided_decoding_rate)) - - if warmup: - print(">>>>> Running warmup prompt, for the first 5") - # We setup the first 5 requests to warmup FSM - # if using xgrammar dataset, we will skip warmup - warmup_requests = requests[:5] - for i, request in enumerate(warmup_requests): - prompts.append(request.prompt) - sampling_params.append( - SamplingParams( - n=n, - temperature=1.0, - top_p=1.0, - ignore_eos=True, - max_tokens=request.expected_output_len, - guided_decoding=GuidedDecodingParams(json=request.schema) - if guided_decoding_rate > 0 else None, - )) - llm.generate(prompts, sampling_params, use_tqdm=False) - - print(">>>>> Benchmark started...") - prompts = [] - sampling_params = [] - for i, request in enumerate(requests): - prompts.append(request.prompt) - sampling_params.append( - SamplingParams( - n=n, - temperature=1.0, - top_p=1.0, - ignore_eos=True, - max_tokens=request.expected_output_len, - guided_decoding=GuidedDecodingParams( - **{request.structure_type: request.schema}) - if i in guided_decoding_req_idx else None, - )) - - start = time.perf_counter() - outputs = llm.generate(prompts, sampling_params, use_tqdm=False) - ret = [] - for output, request in zip(outputs, requests): - generated_text = output.outputs[0].text - ret.append({ - "generated": generated_text, - "expected": request.completion - }) - end = time.perf_counter() - return end - start, ret - - -async def run_vllm_async( - requests: list[SampleRequest], - engine_args: AsyncEngineArgs, - n: int, - guided_decoding_rate: float = 1.0, - warmup: bool = False, - disable_frontend_multiprocessing: bool = False) -> float: - from vllm import SamplingParams - - async with build_async_engine_client_from_engine_args( - engine_args, disable_frontend_multiprocessing) as llm: - - assert all( - llm.model_config.max_model_len >= (request.prompt_len + - request.expected_output_len) - for request in requests), ( - "Please ensure that max_model_len is greater than the sum of" - " prompt_len and expected_output_len for all requests.") - - # Add the requests to the engine. 
- prompts: list[str] = [] - sampling_params: list[SamplingParams] = [] - guided_decoding_req_idx = random.sample( - range(len(requests)), int(len(requests) * guided_decoding_rate)) - - if warmup: - print(">>>>>> Running warmup prompt, for the first 5") - # We setup the first 5 requests to warmup FSM - # if using xgrammar dataset, we will skip warmup - warmup_requests = requests[:5] - for i, request in enumerate(warmup_requests): - prompts.append(request.prompt) - sampling_params.append( - SamplingParams( - n=n, - temperature=1.0, - top_p=1.0, - ignore_eos=True, - max_tokens=request.expected_output_len, - guided_decoding=GuidedDecodingParams( - json=request.schema) - if guided_decoding_rate > 0 else None, - )) - generators = [] - for i, (prompt, sp) in enumerate(zip(prompts, sampling_params)): - generator = llm.generate(prompt, sp, request_id=f"test{i}") - generators.append(generator) - all_gens = merge_async_iterators(*generators) - async for i, res in all_gens: - pass - - print(">>>>> Benchmark started...") - prompts = [] - sampling_params = [] - for i, request in enumerate(requests): - prompts.append(request.prompt) - sampling_params.append( - SamplingParams( - n=n, - temperature=1.0, - top_p=1.0, - ignore_eos=True, - max_tokens=request.expected_output_len, - guided_decoding=GuidedDecodingParams(json=request.schema) - if i in guided_decoding_req_idx else None, - )) - - generators = [] - start_time = [] - latencies = [] - start = time.perf_counter() - for i, (prompt, sp) in enumerate(zip(prompts, sampling_params)): - generator = llm.generate(prompt, sp, request_id=f"test{i}") - generators.append(generator) - start_time.append(time.perf_counter()) - latencies.append([]) - all_gens = merge_async_iterators(*generators) - generated_texts = [''] * len(requests) - async for i, res in all_gens: - generated_texts[i] = res.outputs[0].text - lat = time.perf_counter() - start_time[i] - latencies[i].append(lat) - ret = [{ - 'generated': gt, - 'expected': req.completion - } for gt, req in zip(generated_texts, requests)] - end = time.perf_counter() - first_latency = pd.Series([lat[0] * 1000 for lat in latencies]) - next_latency = pd.Series([(lat[-1] - lat[0]) / len(lat[1:]) * 1000 - for lat in latencies]) - return end - start, ret, (first_latency, next_latency) - - -def sample_requests(tokenizer: PreTrainedTokenizerBase, - args: argparse.Namespace) -> list[SampleRequest]: - if args.dataset == 'json': - if args.json_schema_path is None: - dir_path = os.path.dirname(os.path.realpath(__file__)) - args.json_schema_path = os.path.join(dir_path, - "structured_schemas", - "structured_schema_1.json") - with open(args.json_schema_path) as f: - schema = json.load(f) - prompt = f"Generate an example of a user profile given the following schema: {json.dumps(schema)}" # noqa: E501 - input_len = len(tokenizer(prompt).input_ids) - print(f"Input length of the prompt: {input_len} tokens") - requests = [ - SampleRequest(prompt=prompt, - prompt_len=input_len, - expected_output_len=args.output_len, - schema=schema, - structure_type=args.structure_type) - for _ in range(args.num_prompts) - ] - - elif args.dataset == "grammar": - schema = """ - ?start: select_statement - - ?select_statement: "SELECT " column_list " FROM " table_name - - ?column_list: column_name ("," column_name)* - - ?table_name: identifier - - ?column_name: identifier - - ?identifier: /[a-zA-Z_][a-zA-Z0-9_]*/ - """ - prompt = "Generate an SQL query to show the 'username' \ - and 'email' from the 'users' table." 
- - input_len = len(tokenizer(prompt).input_ids) - print(f"Input length of the prompt: {input_len} tokens") - requests = [ - SampleRequest(prompt=prompt, - prompt_len=input_len, - expected_output_len=args.output_len, - schema=schema, - structure_type=args.structure_type) - for _ in range(args.num_prompts) - ] - - elif args.dataset == "regex": - regex = r"\w+@\w+\.com\n" - args.regex = regex - prompt = "Generate an email address for Alan Turing, \ - who works in Enigma. End in .com and new line. \ - Example result: alan.turing@enigma.com\n" - - input_len = len(tokenizer(prompt).input_ids) - print(f"Input length of the prompt: {input_len} tokens") - requests = [ - SampleRequest(prompt=prompt, - prompt_len=input_len, - expected_output_len=args.output_len, - schema=regex, - structure_type=args.structure_type) - for _ in range(args.num_prompts) - ] - - elif args.dataset == "choice": - choice = ["Positive", "Negative"] - args.choice = choice - prompt = "Classify this sentiment: vLLM is wonderful!" - input_len = len(tokenizer(prompt).input_ids) - print(f"Input length of the prompt: {input_len} tokens") - requests = [ - SampleRequest(prompt=prompt, - prompt_len=input_len, - expected_output_len=args.output_len, - schema=choice, - structure_type=args.structure_type) - for _ in range(args.num_prompts) - ] - - elif args.dataset == "xgrammar_bench": - args.warmup = False - requests: list[SampleRequest] = [] - dataset = datasets.load_dataset("NousResearch/json-mode-eval", - split="train") - print(f"dataset has {len(dataset)} entries") - len_dataset = len(dataset) - for data_point_idx in range(args.num_prompts): - idx = data_point_idx - while idx >= len_dataset: - idx -= len_dataset - schema = dataset["schema"][idx] - prompt = tokenizer.apply_chat_template(dataset["prompt"][idx], - tokenize=False) - input_len = len(tokenizer(prompt).input_ids) - completion = dataset["completion"][idx] - - requests.append( - SampleRequest(prompt=prompt, - prompt_len=input_len, - expected_output_len=args.output_len, - schema=schema, - completion=completion)) - - return requests - - -def evaluate(ret, args): - - def _eval_correctness_json(expected, actual): - # extract json string from string using regex - import re - actual = actual.replace('\n', '').replace(' ', '').strip() - try: - actual = re.search(r'\{.*\}', actual).group() - actual = json.loads(actual) - except Exception: - return False - - return True - - def _eval_correctness_choice(expected, actual): - return actual in args.choice - - def _eval_correctness_regex(expected, actual): - import re - return re.match(args.regex, actual) is not None - - def _eval_correctness(expected, actual): - if args.structure_type == 'json': - return _eval_correctness_json(expected, actual) - elif args.structure_type == 'regex': - return _eval_correctness_regex(expected, actual) - elif args.structure_type == 'choice': - return _eval_correctness_choice(expected, actual) - else: - return None - - scores = [] - for res in ret: - score = _eval_correctness(res['expected'], res['generated']) - res['correctness'] = score - scores.append(score) - - not_none_scores = [score for score in scores if score is not None] - - return (sum(not_none_scores) / len(not_none_scores) * - 100) if len(not_none_scores) > 0 else None - - -def main(args: argparse.Namespace): - print(args) - random.seed(args.seed) - - # async engine is working for 'regex', 'choice' and 'grammar' - if args.dataset == 'grammar': - args.structure_type = 'grammar' - args.async_engine = False - elif args.dataset == 'regex': - 
args.structure_type = 'regex' - args.async_engine = False - elif args.dataset == 'choice': - args.structure_type = 'choice' - args.async_engine = False - else: - args.structure_type = 'json' - - if args.no_guided_decoding: - args.guided_decoding_ratio = 0 - if args.save_results: - result_file_name = f'{args.guided_decoding_ratio}guided' - result_file_name += f"_{args.model.split('/')[-1]}" - result_file_name += f"_{args.dataset}" - result_file_name += f"_{args.num_prompts}" - result_file_name += f"_out{args.output_len}" - result_file_name += f"_async{args.async_engine}" - result_file_name += f"_warmup{args.warmup}" - result_file_name += f"_chunkedprefill{args.enable_chunked_prefill}" - result_file_name += ".txt" - else: - result_file_name = None - - # Synthesize a prompt with the given input length. - tokenizer = AutoTokenizer.from_pretrained( - args.tokenizer, trust_remote_code=args.trust_remote_code) - requests = sample_requests(tokenizer, args) - - if args.async_engine: - engine_args = AsyncEngineArgs.from_cli_args(args) - elapsed_time, ret, (first_latency, next_latency) = uvloop.run( - run_vllm_async(requests, engine_args, args.n, - args.guided_decoding_ratio, args.warmup, - args.disable_frontend_multiprocessing)) - else: - engine_args = EngineArgs.from_cli_args(args) - elapsed_time, ret = run_vllm(requests, engine_args, args.n, - args.guided_decoding_ratio, args.warmup) - first_latency, next_latency = None, None - - score = evaluate(ret, args) - total_num_tokens = sum(request.prompt_len + request.expected_output_len - for request in requests) - total_output_tokens = sum(request.expected_output_len - for request in requests) - if first_latency is not None: - latency_breakdown = "\nFirst token latency(msecs):\n" - latency_breakdown += f"{first_latency.describe()}" - latency_breakdown += "\nNext token latency(msecs):\n" - latency_breakdown += f"{next_latency.describe()}" - print( - f"Throughput: {len(requests) / elapsed_time:.2f} requests/s, " - f"{total_num_tokens / elapsed_time:.2f} total tokens/s, " - f"{total_output_tokens / elapsed_time:.2f} output tokens/s", - f"Correct rate is {score} %", - f"{latency_breakdown if first_latency is not None else ''}") - - # Output JSON results if specified - if args.output_json or result_file_name: - results = { - "elapsed_time": elapsed_time, - "num_requests": len(requests), - "total_num_tokens": total_num_tokens, - "total_output_tokens": total_output_tokens, - "requests_per_second": len(requests) / elapsed_time, - "tokens_per_second": f"{total_num_tokens / elapsed_time:.2f}", - "output_tokens_per_second": - f"{total_output_tokens / elapsed_time:.2f}", - "correct_rate(%)": score - } - results = {"outputs": ret, **results} - if first_latency is not None: - results["first_token_latency(msecs)"] = first_latency.describe( - ).to_dict() - results["next_token_latency(msecs)"] = next_latency.describe( - ).to_dict() - if args.output_json: - with open(args.output_json, "w") as f: - json.dump(results, f, indent=4) - elif result_file_name: - with open(result_file_name, "w") as f: - json.dump(results, f, indent=4) - - -if __name__ == "__main__": - parser = FlexibleArgumentParser(description="Benchmark guided decoding.") - parser = AsyncEngineArgs.add_cli_args(parser) - - parser.add_argument("--output-len", - type=int, - default=512, - help="Output length for each request. 
Overrides the " - "output length from the dataset.") - parser.add_argument( - "--dataset", - default='json', - choices=['json', 'grammar', 'regex', 'choice', 'xgrammar_bench']) - parser.add_argument("--json_schema_path", - type=str, - default=None, - help="Path to json schema.") - parser.add_argument("--n", - type=int, - default=1, - help="Number of generated sequences per prompt.") - parser.add_argument("--num-prompts", - type=int, - default=10, - help="Number of prompts to process.") - parser.add_argument( - '--output-json', - type=str, - default=None, - help='Path to save the throughput results in JSON format.') - parser.add_argument("--async-engine", - action='store_true', - default=False, - help="Use vLLM async engine rather than LLM class.") - parser.add_argument("--no-guided-decoding", - action='store_true', - default=False, - help="Whether to disable JSON decoding or not.") - parser.add_argument("--guided-decoding-ratio", - type=float, - default=1.0, - help="Ratio of Guided Decoding requests") - parser.add_argument("--disable-frontend-multiprocessing", - action='store_true', - default=False, - help="Disable decoupled async engine frontend.") - parser.add_argument("--warmup", - action="store_true", - default=False, - help="Run warmup prompts before benchmark.") - parser.add_argument("--save-results", - action="store_true", - default=False, - help="save output results.") - args = parser.parse_args() - if args.tokenizer is None: - args.tokenizer = args.model - main(args) diff --git a/benchmarks/benchmark_latency.py b/benchmarks/benchmark_latency.py index d7f39f50f6ca..dfd9bb1e6a4d 100644 --- a/benchmarks/benchmark_latency.py +++ b/benchmarks/benchmark_latency.py @@ -52,6 +52,7 @@ def main(args: argparse.Namespace): top_p=1.0, ignore_eos=True, max_tokens=args.output_len, + detokenize=not args.disable_detokenize, ) print(sampling_params) dummy_prompt_token_ids = np.random.randint(10000, @@ -173,6 +174,12 @@ def run_to_completion(profile_dir: Optional[str] = None): default=None, help="Path to save the latency results in JSON format.", ) + parser.add_argument( + "--disable-detokenize", + action="store_true", + help=("Do not detokenize responses (i.e. do not include " + "detokenization time in the latency measurement)"), + ) parser = EngineArgs.add_cli_args(parser) args = parser.parse_args() diff --git a/benchmarks/benchmark_prefix_caching.py b/benchmarks/benchmark_prefix_caching.py index fba32520442f..4fff7a8fc8ed 100644 --- a/benchmarks/benchmark_prefix_caching.py +++ b/benchmarks/benchmark_prefix_caching.py @@ -194,7 +194,9 @@ def main(args): llm = LLM(**dataclasses.asdict(engine_args)) - sampling_params = SamplingParams(temperature=0, max_tokens=args.output_len) + sampling_params = SamplingParams(temperature=0, + max_tokens=args.output_len, + detokenize=not args.disable_detokenize) print("Testing filtered requests") prompts = repeat_and_sort_requests(filtered_requests, @@ -243,6 +245,12 @@ def main(args): "subtract this length when filtering prompts. Only used " "when dataset-path is not provided.", ) + parser.add_argument( + '--disable-detokenize', + action='store_true', + help=("Do not detokenize responses (i.e. 
do not include " + "detokenization time in the latency measurement)"), + ) parser = EngineArgs.add_cli_args(parser) args = parser.parse_args() diff --git a/benchmarks/benchmark_prioritization.py b/benchmarks/benchmark_prioritization.py index 43b2c1b03323..76fe00ede249 100644 --- a/benchmarks/benchmark_prioritization.py +++ b/benchmarks/benchmark_prioritization.py @@ -23,7 +23,7 @@ def sample_requests( num_requests: int, tokenizer: PreTrainedTokenizerBase, fixed_output_len: Optional[int], -) -> list[tuple[str, int, int]]: +) -> list[tuple[str, int, int, int]]: if fixed_output_len is not None and fixed_output_len < 4: raise ValueError("output_len too small") @@ -71,6 +71,7 @@ def run_vllm( requests: list[tuple[str, int, int]], n: int, engine_args: EngineArgs, + disable_detokenize: bool = False, ) -> float: from vllm import LLM, SamplingParams llm = LLM(**dataclasses.asdict(engine_args)) @@ -95,6 +96,7 @@ def run_vllm( top_p=1.0, ignore_eos=True, max_tokens=output_len, + detokenize=not disable_detokenize, )) start = time.perf_counter() @@ -121,7 +123,8 @@ def main(args: argparse.Namespace): if args.backend == "vllm": elapsed_time = run_vllm(requests, args.n, - EngineArgs.from_cli_args(args)) + EngineArgs.from_cli_args(args), + args.disable_detokenize) else: raise ValueError(f"Unknown backend: {args.backend}") total_num_tokens = sum(prompt_len + output_len @@ -174,6 +177,12 @@ def main(args: argparse.Namespace): type=str, default=None, help='Path to save the throughput results in JSON format.') + parser.add_argument( + '--disable-detokenize', + action='store_true', + help=("Do not detokenize responses (i.e. do not include " + "detokenization time in the latency measurement)"), + ) parser = EngineArgs.add_cli_args(parser) args = parser.parse_args() diff --git a/benchmarks/benchmark_serving.py b/benchmarks/benchmark_serving.py index 16ec0a4817a2..1dd01ca96867 100644 --- a/benchmarks/benchmark_serving.py +++ b/benchmarks/benchmark_serving.py @@ -25,25 +25,20 @@ """ import argparse import asyncio -import base64 import gc -import io import json import os import random import time import warnings -from collections.abc import AsyncGenerator, Collection +from collections.abc import AsyncGenerator, Iterable from dataclasses import dataclass from datetime import datetime from typing import Any, Optional import numpy as np -import pandas as pd from backend_request_func import (ASYNC_REQUEST_FUNCS, RequestFuncInput, RequestFuncOutput) -from datasets import load_dataset -from PIL.Image import Image from tqdm.asyncio import tqdm from transformers import PreTrainedTokenizerBase @@ -57,6 +52,9 @@ except ImportError: from argparse import ArgumentParser as FlexibleArgumentParser +from benchmark_dataset import (BurstGPTDataset, HuggingFaceDataset, + RandomDataset, SampleRequest, ShareGPTDataset, + SonnetDataset, VisionArenaDataset) from benchmark_utils import convert_to_pytorch_benchmark_format, write_to_json MILLISECONDS_TO_SECONDS_CONVERSION = 1000 @@ -92,325 +90,18 @@ class BenchmarkMetrics: percentiles_e2el_ms: list[tuple[float, float]] -def sample_sharegpt_requests( - dataset_path: str, - num_requests: int, - tokenizer: PreTrainedTokenizerBase, - fixed_output_len: Optional[int] = None, -) -> list[tuple[str, int, int, None]]: - # Load the dataset. - with open(dataset_path, encoding='utf-8') as f: - dataset = json.load(f) - # Filter out the conversations with less than 2 turns. - dataset = [data for data in dataset if len(data["conversations"]) >= 2] - # Only keep the first two turns of each conversation. 
- dataset = [(data["conversations"][0]["value"], - data["conversations"][1]["value"]) for data in dataset] - - # Shuffle the dataset. - random.shuffle(dataset) - - # Filter out sequences that are too long or too short - filtered_dataset: list[tuple[str, int, int]] = [] - for i in range(len(dataset)): - if len(filtered_dataset) == num_requests: - break - - # Tokenize the prompts and completions. - prompt = dataset[i][0] - prompt_token_ids = tokenizer(prompt).input_ids - completion = dataset[i][1] - completion_token_ids = tokenizer(completion).input_ids - prompt_len = len(prompt_token_ids) - output_len = len(completion_token_ids - ) if fixed_output_len is None else fixed_output_len - if prompt_len < 4 or (fixed_output_len is None and output_len < 4): - # Prune too short sequences. - continue - if prompt_len > 1024 or prompt_len + output_len > 2048: - # Prune too long sequences. - continue - filtered_dataset.append((prompt, prompt_len, output_len, None)) - - return filtered_dataset - - -def sample_burstgpt_requests( - dataset_path: str, - num_requests: int, - random_seed: int, - tokenizer: PreTrainedTokenizerBase, -) -> list[tuple[str, int, int, None]]: - df = pd.read_csv(dataset_path) - gpt4_df = df[df["Model"] == "GPT-4"] - # Remove the failed requests (i.e., response length is 0) - gpt4_df = gpt4_df[gpt4_df["Response tokens"] > 0] - # Randomly sample num_requests from the dataset - if num_requests <= len(gpt4_df): - gpt4_df = gpt4_df.sample(n=num_requests, random_state=random_seed) - else: - gpt4_df = gpt4_df.sample(n=num_requests, - random_state=random_seed, - replace=True) - # Convert the dataframe to a list of tuples - dataset = gpt4_df.values.tolist() - input_requests = [] - for i in range(num_requests): - input_len = int(dataset[i][2]) - output_len = int(dataset[i][3]) - prompt = tokenizer.decode([(i + j) % tokenizer.vocab_size - for j in range(input_len)]) - input_requests.append((prompt, input_len, output_len, None)) - return input_requests - - -def sample_sonnet_requests( - dataset_path: str, - num_requests: int, - input_len: int, - output_len: int, - prefix_len: int, - tokenizer: PreTrainedTokenizerBase, -) -> list[tuple[str, str, int, int, None]]: - assert ( - input_len > prefix_len - ), "'args.sonnet-input-len' must be greater than 'args.prefix-input-len'." - - # Load the dataset. - with open(dataset_path, encoding='utf-8') as f: - poem_lines = f.readlines() - - # Tokenize the poem lines. - poem_token_ids = tokenizer(poem_lines).input_ids - average_poem_len = sum( - len(token_ids) for token_ids in poem_token_ids) / len(poem_token_ids) - - # Base prefix for all requests. - base_prompt = "Pick as many lines as you can from these poem lines:\n" - base_message = [{ - "role": "user", - "content": base_prompt, - }] - base_prompt_formatted = tokenizer.apply_chat_template( - base_message, add_generation_prompt=True, tokenize=False) - base_prompt_offset = len(tokenizer(base_prompt_formatted).input_ids) - - assert ( - input_len > base_prompt_offset - ), f"Please set 'args.sonnet-input-len' higher than {base_prompt_offset}." - num_input_lines = round( - (input_len - base_prompt_offset) / average_poem_len) - - # First approximately `prefix_len` number of tokens in the - # prompt are fixed poem lines. - assert ( - prefix_len > base_prompt_offset - ), f"Please set 'args.sonnet-prefix-len' higher than {base_prompt_offset}." 
- - num_prefix_lines = round( - (prefix_len - base_prompt_offset) / average_poem_len) - prefix_lines = poem_lines[:num_prefix_lines] - - # Sample the rest of lines per request. - sampled_requests: list[tuple[str, int, int]] = [] - for _ in range(num_requests): - num_lines_needed = num_input_lines - num_prefix_lines - sampled_lines = "".join(prefix_lines + - random.choices(poem_lines, k=num_lines_needed)) - - prompt = f"{base_prompt}{sampled_lines}" - message = [ - { - "role": "user", - "content": prompt, - }, - ] - prompt_formatted = tokenizer.apply_chat_template( - message, add_generation_prompt=True, tokenize=False) - prompt_len = len(tokenizer(prompt_formatted).input_ids) - sampled_requests.append( - (prompt, prompt_formatted, prompt_len, output_len, None)) - - return sampled_requests - - -def sample_vision_arena_requests( - dataset, - num_requests: int, - tokenizer: PreTrainedTokenizerBase, - fixed_output_len: Optional[int] = None, -) -> list[tuple[str, str, int, Optional[dict[str, Collection[str]]]]]: - sampled_requests: list[tuple[str, int, int, dict[str, - Collection[str]]]] = [] - for data in dataset: - if len(sampled_requests) == num_requests: - break - - prompt = data["turns"][0][0]['content'] - - prompt_token_ids = tokenizer(prompt).input_ids - if fixed_output_len is None: - # Default max output len is set to 128 - print("--hf-output-len is not provided. Using default value 128.") - fixed_output_len = 128 - - prompt_len = len(prompt_token_ids) - output_len = fixed_output_len - - assert isinstance( - data["images"][0], - Image), ("Input image format must be `PIL.Image.Image`, " - f"given {type(data['image'])}.") - image: Image = data["images"][0] - image = image.convert("RGB") - image_data = io.BytesIO() - image.save(image_data, format='JPEG') - image_base64 = base64.b64encode(image_data.getvalue()).decode("utf-8") - mm_content = { - "type": "image_url", - "image_url": { - "url": f"data:image/jpeg;base64,{image_base64}" - }, - } - - sampled_requests.append((prompt, prompt_len, output_len, mm_content)) - - return sampled_requests - - -def sample_hf_requests( - dataset_path: str, - dataset_subset: Optional[str], - dataset_split: str, - num_requests: int, - tokenizer: PreTrainedTokenizerBase, - random_seed: int, - fixed_output_len: Optional[int] = None, -) -> list[tuple[str, str, int, Optional[dict[str, Collection[str]]]]]: - - # Special case for vision_arena dataset - if dataset_path == 'lmarena-ai/vision-arena-bench-v0.1' \ - and dataset_subset is None: - assert dataset_split == "train" - dataset = load_dataset(dataset_path, - name=dataset_subset, - split=dataset_split, - streaming=True) - dataset = dataset.shuffle(seed=random_seed) - return sample_vision_arena_requests(dataset, num_requests, tokenizer, - fixed_output_len) - - dataset = load_dataset(dataset_path, - name=dataset_subset, - split=dataset_split, - streaming=True) - assert "conversations" in dataset.features, ( - "HF Dataset must have 'conversations' column.") - filter_func = lambda x: len(x["conversations"]) >= 2 - filtered_dataset = dataset.shuffle(seed=random_seed).filter(filter_func) - sampled_requests: list[tuple[str, int, int, dict[str, - Collection[str]]]] = [] - for data in filtered_dataset: - if len(sampled_requests) == num_requests: - break - - # Tokenize the prompts and completions. 
- prompt = data["conversations"][0]["value"] - prompt_token_ids = tokenizer(prompt).input_ids - completion = data["conversations"][1]["value"] - completion_token_ids = tokenizer(completion).input_ids - prompt_len = len(prompt_token_ids) - output_len = len(completion_token_ids - ) if fixed_output_len is None else fixed_output_len - if fixed_output_len is None and (prompt_len < 4 or output_len < 4): - # Prune too short sequences. - continue - if fixed_output_len is None and \ - (prompt_len > 1024 or prompt_len + output_len > 2048): - # Prune too long sequences. - continue - - if "image" in data and isinstance(data["image"], Image): - image: Image = data["image"] - image = image.convert("RGB") - image_data = io.BytesIO() - image.save(image_data, format='JPEG') - image_base64 = base64.b64encode( - image_data.getvalue()).decode("utf-8") - mm_content = { - "type": "image_url", - "image_url": { - "url": f"data:image/jpeg;base64,{image_base64}" - }, - } - elif "image" in data and isinstance(data["image"], str): - if (data["image"].startswith("http://") or \ - data["image"].startswith("file://")): - image_url = data["image"] - else: - image_url = f"file://{data['image']}" - - mm_content = { - "type": "image_url", - "image_url": { - "url": image_url - }, - } - else: - mm_content = None - - sampled_requests.append((prompt, prompt_len, output_len, mm_content)) - - return sampled_requests - - -def sample_random_requests( - prefix_len: int, - input_len: int, - output_len: int, - num_prompts: int, - range_ratio: float, - tokenizer: PreTrainedTokenizerBase, -) -> list[tuple[str, int, int]]: - prefix_token_ids = np.random.randint(0, - tokenizer.vocab_size, - size=prefix_len).tolist() - - input_lens = np.random.randint( - int(input_len * range_ratio), - input_len + 1, - size=num_prompts, - ) - output_lens = np.random.randint( - int(output_len * range_ratio), - output_len + 1, - size=num_prompts, - ) - offsets = np.random.randint(0, tokenizer.vocab_size, size=num_prompts) - input_requests = [] - for i in range(num_prompts): - prompt = tokenizer.decode(prefix_token_ids + - [(offsets[i] + i + j) % tokenizer.vocab_size - for j in range(input_lens[i])]) - - input_requests.append((prompt, int(prefix_len + input_lens[i]), - int(output_lens[i]), None)) - - return input_requests - - async def get_request( - input_requests: list[tuple[str, int, int]], + input_requests: list[SampleRequest], request_rate: float, burstiness: float = 1.0, -) -> AsyncGenerator[tuple[str, int, int], None]: +) -> AsyncGenerator[SampleRequest, None]: """ Asynchronously generates requests at a specified rate with OPTIONAL burstiness. Args: input_requests: - A list of input requests, each represented as a tuple. + A list of input requests, each represented as a SampleRequest. request_rate: The rate at which requests are generated (requests/s). burstiness (optional): @@ -422,7 +113,7 @@ async def get_request( in more bursty requests, while a higher burstiness value (burstiness > 1) results in a more uniform arrival of requests. """ - input_requests = iter(input_requests) + input_requests: Iterable[SampleRequest] = iter(input_requests) # Calculate scale parameter theta to maintain the desired request_rate. 
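# Sketch of one way to realize the arrival process documented in the docstring
# above (an assumption; the exact sampling code is not shown here): draw each
# inter-arrival gap from a gamma distribution whose shape is the burstiness
# factor and whose scale keeps the mean gap at 1 / request_rate.
import numpy as np

request_rate, burstiness = 10.0, 0.5        # illustrative values
theta = 1.0 / (request_rate * burstiness)   # scale parameter
gaps = np.random.gamma(shape=burstiness, scale=theta, size=1000)
print(gaps.mean())                          # ~= 1 / request_rate = 0.1 s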
assert burstiness > 0, ( @@ -444,7 +135,7 @@ async def get_request( def calculate_metrics( - input_requests: list[tuple[str, int, int]], + input_requests: list[SampleRequest], outputs: list[RequestFuncOutput], dur_s: float, tokenizer: PreTrainedTokenizerBase, @@ -475,7 +166,7 @@ def calculate_metrics( tokenizer(outputs[i].generated_text, add_special_tokens=False).input_ids) actual_output_lens.append(output_len) - total_input += input_requests[i][1] + total_input += input_requests[i].prompt_len tpot = 0 if output_len > 1: latency_minus_ttft = outputs[i].latency - outputs[i].ttft @@ -558,19 +249,18 @@ async def benchmark( model_id: str, model_name: str, tokenizer: PreTrainedTokenizerBase, - input_requests: list[tuple[str, int, int]], + input_requests: list[SampleRequest], logprobs: Optional[int], - best_of: int, request_rate: float, burstiness: float, disable_tqdm: bool, profile: bool, selected_percentile_metrics: list[str], - selected_percentiles: list[str], + selected_percentiles: list[float], ignore_eos: bool, goodput_config_dict: dict[str, float], max_concurrency: Optional[int], - lora_modules: Optional[list[str]], + lora_modules: Optional[Iterable[str]], ): if backend in ASYNC_REQUEST_FUNCS: request_func = ASYNC_REQUEST_FUNCS[backend] @@ -578,12 +268,16 @@ async def benchmark( raise ValueError(f"Unknown backend: {backend}") print("Starting initial single prompt test run...") - test_prompt, test_prompt_len, test_output_len, test_mm_content = ( - input_requests[0]) + test_prompt, test_prompt_len, test_output_len, test_mm_content = \ + input_requests[0].prompt, input_requests[0].prompt_len, \ + input_requests[0].expected_output_len, \ + input_requests[0].multi_modal_data + if backend != "openai-chat" and test_mm_content is not None: # multi-modal benchmark is only available on OpenAI Chat backend. raise ValueError( "Multi-modal content is only supported on 'openai-chat' backend.") + assert test_mm_content is None or isinstance(test_mm_content, dict) test_input = RequestFuncInput( model=model_id, model_name=model_name, @@ -592,7 +286,6 @@ async def benchmark( prompt_len=test_prompt_len, output_len=test_output_len, logprobs=logprobs, - best_of=best_of, multi_modal_content=test_mm_content, ignore_eos=ignore_eos, ) @@ -608,7 +301,8 @@ async def benchmark( if lora_modules: # For each input request, choose a LoRA module at random. 
lora_modules = iter( - [random.choice(lora_modules) for _ in range(len(input_requests))]) + [random.choice(lora_modules) \ + for _ in range(len(input_requests))]) if profile: print("Starting profiler...") @@ -619,7 +313,6 @@ async def benchmark( prompt_len=test_prompt_len, output_len=test_output_len, logprobs=logprobs, - best_of=best_of, multi_modal_content=test_mm_content, ignore_eos=ignore_eos) profile_output = await request_func(request_func_input=profile_input) @@ -655,7 +348,9 @@ async def limited_request_func(request_func_input, pbar): benchmark_start_time = time.perf_counter() tasks: list[asyncio.Task] = [] async for request in get_request(input_requests, request_rate, burstiness): - prompt, prompt_len, output_len, mm_content = request + prompt, prompt_len, output_len, mm_content = request.prompt, \ + request.prompt_len, request.expected_output_len, \ + request.multi_modal_data req_model_id, req_model_name = model_id, model_name if lora_modules: req_lora_module = next(lora_modules) @@ -668,7 +363,6 @@ async def limited_request_func(request_func_input, pbar): prompt_len=prompt_len, output_len=output_len, logprobs=logprobs, - best_of=best_of, multi_modal_content=mm_content, ignore_eos=ignore_eos) tasks.append( @@ -686,7 +380,6 @@ async def limited_request_func(request_func_input, pbar): prompt_len=test_prompt_len, output_len=test_output_len, logprobs=logprobs, - best_of=best_of, ) profile_output = await request_func(request_func_input=profile_input) if profile_output.success: @@ -872,76 +565,72 @@ def main(args: argparse.Namespace): "Please specify '--dataset-name' and the corresponding " "'--dataset-path' if required.") - elif args.dataset_name == "sharegpt": - input_requests = sample_sharegpt_requests( - dataset_path=args.dataset_path, - num_requests=args.num_prompts, - tokenizer=tokenizer, - fixed_output_len=args.sharegpt_output_len, - ) - - elif args.dataset_name == "burstgpt": - input_requests = sample_burstgpt_requests( - dataset_path=args.dataset_path, - num_requests=args.num_prompts, - random_seed=args.seed, - tokenizer=tokenizer, - ) - - elif args.dataset_name == "sonnet": - # Do not format the prompt, pass to message directly + if args.dataset_name == "sonnet": + dataset = SonnetDataset(dataset_path=args.dataset_path) + # For the "sonnet" dataset, formatting depends on the backend. if args.backend == "openai-chat": - input_requests = sample_sonnet_requests( - dataset_path=args.dataset_path, - num_requests=args.num_prompts, - input_len=args.sonnet_input_len, - output_len=args.sonnet_output_len, - prefix_len=args.sonnet_prefix_len, - tokenizer=tokenizer, - ) - input_requests = [(prompt, prompt_len, output_len, None) - for prompt, prompt_formatted, prompt_len, - output_len, _ in input_requests] + input_requests = dataset.sample(num_requests=args.num_prompts, + input_len=args.sonnet_input_len, + output_len=args.sonnet_output_len, + prefix_len=args.sonnet_prefix_len, + tokenizer=tokenizer, + return_prompt_formatted=False) else: - assert ( - tokenizer.chat_template or tokenizer.default_chat_template - ), "Tokenizer/model must have chat template for sonnet dataset." 
- input_requests = sample_sonnet_requests( - dataset_path=args.dataset_path, - num_requests=args.num_prompts, - input_len=args.sonnet_input_len, - output_len=args.sonnet_output_len, - prefix_len=args.sonnet_prefix_len, - tokenizer=tokenizer, - ) - input_requests = [(prompt_formatted, prompt_len, output_len, None) - for prompt, prompt_formatted, prompt_len, - output_len, _ in input_requests] + assert tokenizer.chat_template or tokenizer.default_chat_template, ( + "Tokenizer/model must have chat template for sonnet dataset.") + input_requests = dataset.sample(num_requests=args.num_prompts, + input_len=args.sonnet_input_len, + output_len=args.sonnet_output_len, + prefix_len=args.sonnet_prefix_len, + tokenizer=tokenizer, + return_prompt_formatted=True) elif args.dataset_name == "hf": - input_requests = sample_hf_requests( + # Choose between VisionArenaDataset + # and HuggingFaceDataset based on provided parameters. + dataset_class = (VisionArenaDataset if args.dataset_path + == VisionArenaDataset.VISION_ARENA_DATASET_PATH + and args.hf_subset is None else HuggingFaceDataset) + input_requests = dataset_class( dataset_path=args.dataset_path, dataset_subset=args.hf_subset, dataset_split=args.hf_split, + ).sample( num_requests=args.num_prompts, tokenizer=tokenizer, random_seed=args.seed, - fixed_output_len=args.hf_output_len, - ) - - elif args.dataset_name == "random": - input_requests = sample_random_requests( - prefix_len=args.random_prefix_len, - input_len=args.random_input_len, - output_len=args.random_output_len, - num_prompts=args.num_prompts, - range_ratio=args.random_range_ratio, - tokenizer=tokenizer, + output_len=args.hf_output_len, ) else: - raise ValueError(f"Unknown dataset: {args.dataset_name}") + # For datasets that follow a similar structure, use a mapping. + dataset_mapping = { + "sharegpt": + lambda: ShareGPTDataset(random_seed=args.seed, + dataset_path=args.dataset_path).sample( + tokenizer=tokenizer, + num_requests=args.num_prompts, + output_len=args.sharegpt_output_len, + ), + "burstgpt": + lambda: BurstGPTDataset(random_seed=args.seed, + dataset_path=args.dataset_path). + sample(tokenizer=tokenizer, num_requests=args.num_prompts), + "random": + lambda: RandomDataset(dataset_path=args.dataset_path).sample( + tokenizer=tokenizer, + num_requests=args.num_prompts, + prefix_len=args.random_prefix_len, + input_len=args.random_input_len, + output_len=args.random_output_len, + range_ratio=args.random_range_ratio, + ) + } + try: + input_requests = dataset_mapping[args.dataset_name]() + except KeyError as err: + raise ValueError(f"Unknown dataset: {args.dataset_name}") from err goodput_config_dict = check_goodput_args(args) # Avoid GC processing "static" data - reduce pause times. 
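# Standalone usage sketch of the benchmark_dataset classes consumed by the
# refactored main() above. The ShareGPT file path and the tokenizer name are
# placeholders, not values taken from this patch.
from transformers import AutoTokenizer
from benchmark_dataset import RandomDataset, ShareGPTDataset

tokenizer = AutoTokenizer.from_pretrained("gpt2")
sharegpt_requests = ShareGPTDataset(
    random_seed=0,
    dataset_path="sharegpt.json",            # hypothetical local file
).sample(tokenizer=tokenizer, num_requests=100, output_len=128)
random_requests = RandomDataset(dataset_path=None).sample(
    tokenizer=tokenizer,
    num_requests=100,
    prefix_len=0,
    input_len=1024,
    output_len=128,
    range_ratio=1.0,
)
# Both calls return list[SampleRequest] with prompt, prompt_len and
# expected_output_len populated, ready to be fed to get_request()/benchmark().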
@@ -958,7 +647,6 @@ def main(args: argparse.Namespace): tokenizer=tokenizer, input_requests=input_requests, logprobs=args.logprobs, - best_of=args.best_of, request_rate=args.request_rate, burstiness=args.burstiness, disable_tqdm=args.disable_tqdm, @@ -983,7 +671,6 @@ def main(args: argparse.Namespace): result_json["backend"] = backend result_json["model_id"] = model_id result_json["tokenizer_id"] = tokenizer_id - result_json["best_of"] = args.best_of result_json["num_prompts"] = args.num_prompts # Metadata @@ -1081,13 +768,6 @@ def main(args: argparse.Namespace): help= "Name or path of the tokenizer, if not using the default tokenizer.", # noqa: E501 ) - parser.add_argument( - "--best-of", - type=int, - default=1, - help="Generates `best_of` sequences per prompt and " - "returns the best one.", - ) parser.add_argument("--use-beam-search", action="store_true") parser.add_argument( "--num-prompts", @@ -1312,4 +992,5 @@ def main(args: argparse.Namespace): "script chooses a LoRA module at random.") args = parser.parse_args() + main(args) diff --git a/benchmarks/benchmark_serving_guided.py b/benchmarks/benchmark_serving_structured_output.py similarity index 90% rename from benchmarks/benchmark_serving_guided.py rename to benchmarks/benchmark_serving_structured_output.py index 6c132d05f1b6..dccef9d96d05 100644 --- a/benchmarks/benchmark_serving_guided.py +++ b/benchmarks/benchmark_serving_structured_output.py @@ -1,5 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 -r"""Benchmark online serving throughput with guided decoding. +r"""Benchmark online serving throughput with structured outputs. On the server side, run one of the following commands: (vLLM OpenAI API server) @@ -9,12 +9,12 @@ ./launch_tgi_server.sh On the client side, run: - python benchmarks/benchmark_serving_guided.py \ + python benchmarks/benchmark_serving_structured_output.py \ --backend \ --model \ --dataset json \ - --guided-decoding-ratio 1.0 \ - --guided-decoding-backend xgrammar \ + --structured-output-ratio 1.0 \ + --structured-output-backend xgrammar \ --request-rate 10 \ --num-prompts 1000 @@ -24,11 +24,13 @@ """ import argparse import asyncio +import copy import dataclasses import json import os import random import time +import uuid import warnings from collections.abc import AsyncGenerator from dataclasses import dataclass @@ -52,6 +54,9 @@ except ImportError: from argparse import ArgumentParser as FlexibleArgumentParser +from vllm.v1.structured_output.utils import ( + has_xgrammar_unsupported_json_features) + MILLISECONDS_TO_SECONDS_CONVERSION = 1000 @@ -106,24 +111,43 @@ class SampleRequest: def sample_requests(tokenizer: PreTrainedTokenizerBase, args: argparse.Namespace) -> list[SampleRequest]: - if args.dataset == 'json': + if args.dataset == 'json' or args.dataset == 'json-unique': if args.json_schema_path is None: dir_path = os.path.dirname(os.path.realpath(__file__)) args.json_schema_path = os.path.join(dir_path, "structured_schemas", "structured_schema_1.json") + json_schemas = [] with open(args.json_schema_path) as f: schema = json.load(f) - prompt = f"Generate an example of a user profile given the following schema: {json.dumps(schema)}" # noqa: E501 - input_len = len(tokenizer(prompt).input_ids) - print(f"Input length of the prompt: {input_len} tokens") + + if args.dataset == 'json-unique': + json_schemas = [ + copy.deepcopy(schema) for _ in range(args.num_prompts) + ] + for i in range(len(json_schemas)): + json_schemas[i]["properties"][ + f"__optional_field_{uuid.uuid4()}"] = { + "type": + "string", + 
"description": + "An unique optional field to avoid cached schemas" + } + + def gen_prompt(index: int): + schema = json_schemas[index % len(json_schemas)] + return f"Generate an example of a user profile given the following schema: {json.dumps(schema)}" # noqa: E501 + + def get_schema(index: int): + return json_schemas[index % len(json_schemas)] + requests = [ - SampleRequest(prompt=prompt, - prompt_len=input_len, + SampleRequest(prompt=gen_prompt(i), + prompt_len=len(tokenizer(gen_prompt(i)).input_ids), expected_output_len=args.output_len, - schema=schema, + schema=get_schema(i), structure_type=args.structure_type) - for _ in range(args.num_prompts) + for i in range(args.num_prompts) ] elif args.dataset == "grammar": @@ -191,7 +215,17 @@ def sample_requests(tokenizer: PreTrainedTokenizerBase, requests: list[SampleRequest] = [] dataset = datasets.load_dataset("NousResearch/json-mode-eval", split="train") - print(f"dataset has {len(dataset)} entries") + full_dataset_len = len(dataset) + + def _filter_func(item): + import json + schema = json.loads(item["schema"]) + return not has_xgrammar_unsupported_json_features(schema) + + dataset = dataset.filter(_filter_func) + num_filtered_out = full_dataset_len - len(dataset) + print(f"dataset has {len(dataset)} entries after filtering " + f"out {num_filtered_out} entries with unsupported features") len_dataset = len(dataset) for data_point_idx in range(args.num_prompts): idx = data_point_idx @@ -220,21 +254,21 @@ async def get_request( burstiness: float = 1.0, ) -> AsyncGenerator[tuple[int, SampleRequest], None]: """ - Asynchronously generates requests at a specified rate + Asynchronously generates requests at a specified rate with OPTIONAL burstiness. - + Args: - input_requests: + input_requests: A list of input requests, each represented as a tuple. - request_rate: + request_rate: The rate at which requests are generated (requests/s). - burstiness (optional): - The burstiness factor of the request generation. + burstiness (optional): + The burstiness factor of the request generation. Only takes effect when request_rate is not inf. Default value is 1, which follows a Poisson process. Otherwise, the request intervals follow a gamma distribution. - A lower burstiness value (0 < burstiness < 1) results - in more bursty requests, while a higher burstiness value + A lower burstiness value (0 < burstiness < 1) results + in more bursty requests, while a higher burstiness value (burstiness > 1) results in a more uniform arrival of requests. 
""" input_requests = iter(input_requests) @@ -378,8 +412,8 @@ async def benchmark( selected_percentiles: list[str], ignore_eos: bool, max_concurrency: Optional[int], - guided_decoding_ratio: float, - guided_decoding_backend: str, + structured_output_ratio: float, + structured_output_backend: str, goodput_config_dict: Optional[dict[str, float]] = None, ): if backend in ASYNC_REQUEST_FUNCS: @@ -391,16 +425,18 @@ def prepare_extra_body(request) -> dict: extra_body = {} # Add the schema to the extra_body extra_body[request.structure_type] = request.schema - # Add the specific guided_decoding_backend - extra_body["guided_decoding_backend"] = guided_decoding_backend + # Add the specific structured_output_backend + extra_body["guided_decoding_backend"] = structured_output_backend return extra_body print("Starting initial single prompt test run...") - guided_decoding_req_idx = random.sample( + structured_output_req_idx = random.sample( range(len(input_requests)), - int(len(input_requests) * guided_decoding_ratio)) + int(len(input_requests) * structured_output_ratio)) test_request = input_requests[0] + test_req_extra_body = (prepare_extra_body(test_request) + if 0 in structured_output_req_idx else None) test_input = RequestFuncInput( model=model_id, prompt=test_request.prompt, @@ -408,7 +444,7 @@ def prepare_extra_body(request) -> dict: prompt_len=test_request.prompt_len, output_len=test_request.expected_output_len, ignore_eos=ignore_eos, - extra_body=prepare_extra_body(test_request), + extra_body=test_req_extra_body, ) test_output = await request_func(request_func_input=test_input) if not test_output.success: @@ -427,7 +463,7 @@ def prepare_extra_body(request) -> dict: prompt_len=test_request.prompt_len, output_len=test_request.expected_output_len, ignore_eos=ignore_eos, - extra_body=prepare_extra_body(test_request), + extra_body=test_req_extra_body, ) profile_output = await request_func(request_func_input=profile_input) if profile_output.success: @@ -465,7 +501,7 @@ async def limited_request_func(request_func_input, pbar): async for i, request in get_request(input_requests, request_rate, burstiness): extra_body = prepare_extra_body( - request) if i in guided_decoding_req_idx else None + request) if i in structured_output_req_idx else None request_func_input = RequestFuncInput( model=model_id, prompt=request.prompt, @@ -708,10 +744,10 @@ def main(args: argparse.Namespace): else: args.structure_type = 'guided_json' - if args.no_guided_decoding: - args.guided_decoding_ratio = 0 + if args.no_structured_output: + args.structured_output_ratio = 0 if args.save_results: - result_file_name = f'{args.guided_decoding_ratio}guided' + result_file_name = f'{args.structured_output_ratio}guided' result_file_name += f"_{backend}" result_file_name += f"_{args.request_rate}qps" result_file_name += f"_{args.model.split('/')[-1]}" @@ -744,8 +780,8 @@ def main(args: argparse.Namespace): ], ignore_eos=args.ignore_eos, max_concurrency=args.max_concurrency, - guided_decoding_ratio=args.guided_decoding_ratio, - guided_decoding_backend=args.guided_decoding_backend, + structured_output_ratio=args.structured_output_ratio, + structured_output_backend=args.structured_output_backend, goodput_config_dict=goodput_config_dict, )) @@ -806,10 +842,12 @@ def main(args: argparse.Namespace): default="/v1/completions", help="API endpoint.", ) - parser.add_argument( - "--dataset", - default='json', - choices=['json', 'grammar', 'regex', 'choice', 'xgrammar_bench']) + parser.add_argument("--dataset", + default='json', + choices=[ + 
'json', 'json-unique', 'grammar', 'regex', + 'choice', 'xgrammar_bench' + ]) parser.add_argument("--json_schema_path", type=str, default=None, @@ -943,19 +981,20 @@ def main(args: argparse.Namespace): "goodput, refer to DistServe paper: https://arxiv.org/pdf/2401.09670 " "and the blog: https://hao-ai-lab.github.io/blogs/distserve") - parser.add_argument("--no-guided-decoding", + parser.add_argument("--no-structured-output", action='store_true', default=False, help="Whether to disable JSON decoding or not.") - parser.add_argument("--guided-decoding-ratio", + parser.add_argument("--structured-output-ratio", type=float, default=1.0, - help="Ratio of Guided Decoding requests") - parser.add_argument("--guided-decoding-backend", - type=str, - choices=["outlines", "lm-format-enforcer", "xgrammar"], - default="xgrammar", - help="Backend to use for guided decoding") + help="Ratio of Structured Outputs requests") + parser.add_argument( + "--structured-output-backend", + type=str, + choices=["outlines", "lm-format-enforcer", "xgrammar", "json-unique"], + default="xgrammar", + help="Backend to use for structured outputs") args = parser.parse_args() main(args) diff --git a/benchmarks/benchmark_throughput.py b/benchmarks/benchmark_throughput.py index aabce64ff776..7e6556733b28 100644 --- a/benchmarks/benchmark_throughput.py +++ b/benchmarks/benchmark_throughput.py @@ -6,13 +6,14 @@ import os import random import time -from functools import cache -from typing import Any, Optional +import warnings +from typing import Any, Optional, Union import torch import uvloop +from benchmark_dataset import (BurstGPTDataset, RandomDataset, SampleRequest, + ShareGPTDataset, SonnetDataset) from benchmark_utils import convert_to_pytorch_benchmark_format, write_to_json -from PIL import Image from tqdm import tqdm from transformers import (AutoModelForCausalLM, AutoTokenizer, PreTrainedTokenizerBase) @@ -20,154 +21,17 @@ from vllm.engine.arg_utils import AsyncEngineArgs, EngineArgs from vllm.entrypoints.openai.api_server import ( build_async_engine_client_from_engine_args) -from vllm.inputs import TextPrompt +from vllm.inputs import TextPrompt, TokensPrompt from vllm.lora.request import LoRARequest -from vllm.lora.utils import get_adapter_absolute_path -from vllm.multimodal import MultiModalDataDict from vllm.sampling_params import BeamSearchParams -from vllm.transformers_utils.tokenizer import AnyTokenizer, get_lora_tokenizer from vllm.utils import FlexibleArgumentParser, merge_async_iterators -@dataclasses.dataclass -class SampleRequest: - """A class representing a single inference request for benchmarking. - - Attributes: - prompt: The input text prompt for the model. - prompt_len: The length of the prompt in tokens. - expected_output_len: The expected length of the output in tokens. - multi_modal_data: Optional dictionary containing multi-modal data (e.g. - images). - lora_request: Optional LoRARequest specifying the LoRA to use. - """ - prompt: str - prompt_len: int - expected_output_len: int - multi_modal_data: Optional[MultiModalDataDict] = None - lora_request: Optional[LoRARequest] = None - - -def _get_prompt_for_image_model(question: str, *, model: str) -> str: - """Prepend and append special tokens around the question to form a prompt. 
- - Args: - question: The input question text to wrap with special tokens - model: The name of the model being used, to determine which special - tokens to add - - Returns: - The formatted prompt string with appropriate special tokens for the - model - - Raises: - ValueError: If an unsupported model name is provided - """ - model = model.lower() - if "pixtral" in model: - return f"[INST]{question}\n[IMG][/INST]" - raise ValueError(f"Unsupported model {model}") - - -@cache -def lora_path_on_disk(lora_path: str) -> str: - return get_adapter_absolute_path(lora_path) - - -lora_tokenizer_cache: dict[int, AnyTokenizer] = {} - - -def get_random_lora_request( - args: argparse.Namespace -) -> tuple[LoRARequest, Optional[AnyTokenizer]]: - global lora_tokenizer_cache - lora_id = random.randint(1, args.max_loras) - lora_request = LoRARequest(lora_name=str(lora_id), - lora_int_id=lora_id, - lora_path=lora_path_on_disk(args.lora_path)) - if lora_id not in lora_tokenizer_cache: - lora_tokenizer_cache[lora_id] = get_lora_tokenizer(lora_request) - return lora_request, lora_tokenizer_cache[lora_id] - - -def sample_requests(tokenizer: PreTrainedTokenizerBase, - args: argparse.Namespace) -> list[SampleRequest]: - - dataset_path: str = args.dataset - num_requests: int = args.num_prompts - fixed_output_len: Optional[int] = args.output_len - model: str = args.model - if fixed_output_len is not None and fixed_output_len < 4: - raise ValueError("output_len too small") - - # Load the dataset. - with open(dataset_path) as f: - dataset = json.load(f) - # Filter out the conversations with less than 2 turns. - dataset = [data for data in dataset if len(data["conversations"]) >= 2] - # Shuffle the dataset. - random.shuffle(dataset) - - # Filter out sequences that are too long or too short - filtered_dataset: list[SampleRequest] = [] - for data in tqdm(dataset, - total=len(filtered_dataset), - desc="sampling requests"): - if len(filtered_dataset) == num_requests: - break - - # Only keep the first two turns of each conversation. - prompt = data["conversations"][0]["value"] - completion = data["conversations"][1]["value"] - - multi_modal_data: Optional[MultiModalDataDict] = None - if "image" in data: - multi_modal_data = multi_modal_data or {} - image_path = data["image"] - # TODO(vllm-project/vllm/issues/9778): Support multiple images. - assert isinstance(image_path, - str), "Only support single image input" - try: - multi_modal_data["image"] = Image.open(image_path).convert( - "RGB") - except FileNotFoundError: - # Ignore datapoint where asset is missing - continue - prompt = _get_prompt_for_image_model(question=prompt, model=model) - - request_tokenizer = tokenizer - lora_request: Optional[LoRARequest] = None - if args.enable_lora: - lora_request, lora_tokenizer = get_random_lora_request(args) - if lora_tokenizer: - request_tokenizer = lora_tokenizer - - # Tokenize the prompts and completions. - prompt_token_ids = request_tokenizer(prompt).input_ids - completion_token_ids = request_tokenizer(completion).input_ids - prompt_len = len(prompt_token_ids) - output_len = len(completion_token_ids - ) if fixed_output_len is None else fixed_output_len - if prompt_len < 4 or output_len < 4: - # Prune too short sequences. - continue - if prompt_len > 1024 or prompt_len + output_len > 2048: - # Prune too long sequences. 
- continue - filtered_dataset.append( - SampleRequest(prompt=prompt, - prompt_len=prompt_len, - expected_output_len=output_len, - multi_modal_data=multi_modal_data, - lora_request=lora_request)) - - return filtered_dataset - - def run_vllm( requests: list[SampleRequest], n: int, engine_args: EngineArgs, + disable_detokenize: bool = False, ) -> float: from vllm import LLM, SamplingParams llm = LLM(**dataclasses.asdict(engine_args)) @@ -178,10 +42,13 @@ def run_vllm( "Please ensure that max_model_len is greater than the sum of" " prompt_len and expected_output_len for all requests.") # Add the requests to the engine. - prompts: list[TextPrompt] = [] + prompts: list[Union[TextPrompt, TokensPrompt]] = [] sampling_params: list[SamplingParams] = [] for request in requests: prompts.append( + TokensPrompt(prompt_token_ids=request.prompt["prompt_token_ids"], + multi_modal_data=request.multi_modal_data) + if "prompt_token_ids" in request.prompt else \ TextPrompt(prompt=request.prompt, multi_modal_data=request.multi_modal_data)) sampling_params.append( @@ -191,6 +58,7 @@ def run_vllm( top_p=1.0, ignore_eos=True, max_tokens=request.expected_output_len, + detokenize=not disable_detokenize, )) lora_requests: Optional[list[LoRARequest]] = None if engine_args.enable_lora: @@ -229,6 +97,7 @@ async def run_vllm_async( n: int, engine_args: AsyncEngineArgs, disable_frontend_multiprocessing: bool = False, + disable_detokenize: bool = False, ) -> float: from vllm import SamplingParams @@ -242,11 +111,14 @@ async def run_vllm_async( " prompt_len and expected_output_len for all requests.") # Add the requests to the engine. - prompts: list[TextPrompt] = [] + prompts: list[Union[TextPrompt, TokensPrompt]] = [] sampling_params: list[SamplingParams] = [] lora_requests: list[Optional[LoRARequest]] = [] for request in requests: prompts.append( + TokensPrompt(prompt_token_ids=request.prompt["prompt_token_ids"], + multi_modal_data=request.multi_modal_data) + if "prompt_token_ids" in request.prompt else \ TextPrompt(prompt=request.prompt, multi_modal_data=request.multi_modal_data)) sampling_params.append( @@ -256,6 +128,7 @@ async def run_vllm_async( top_p=1.0, ignore_eos=True, max_tokens=request.expected_output_len, + detokenize=not disable_detokenize, )) lora_requests.append(request.lora_request) @@ -282,6 +155,7 @@ def run_hf( n: int, max_batch_size: int, trust_remote_code: bool, + disable_detokenize: bool = False, ) -> float: llm = AutoModelForCausalLM.from_pretrained( model, torch_dtype=torch.float16, trust_remote_code=trust_remote_code) @@ -321,8 +195,9 @@ def run_hf( use_cache=True, max_new_tokens=max_output_len, ) - # Include the decoding time. - tokenizer.batch_decode(llm_outputs, skip_special_tokens=True) + if not disable_detokenize: + # Include the decoding time. + tokenizer.batch_decode(llm_outputs, skip_special_tokens=True) pbar.update(len(batch)) # Clear the batch. @@ -369,56 +244,50 @@ def save_to_pytorch_benchmark_format(args: argparse.Namespace, write_to_json(pt_file, pt_records) +def get_requests(args, tokenizer): + # Common parameters for all dataset types. 
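+ # dataset_path may be None, in which case the random dataset is used; random_seed keeps sampling reproducible across runs.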
+ common_kwargs = { + "dataset_path": args.dataset_path, + "random_seed": args.seed, + } + sample_kwargs = { + "tokenizer": tokenizer, + "lora_path": args.lora_path, + "max_loras": args.max_loras, + "num_requests": args.num_prompts, + "input_len": args.input_len, + "output_len": args.output_len, + } + if args.dataset_path is None or args.dataset_name == "random": + sample_kwargs["range_ratio"] = args.random_range_ratio + sample_kwargs["prefix_len"] = args.prefix_len + dataset_cls = RandomDataset + elif args.dataset_name == "sharegpt": + dataset_cls = ShareGPTDataset + elif args.dataset_name == "sonnet": + assert tokenizer.chat_template or tokenizer.default_chat_template, ( + "Tokenizer/model must have chat template for sonnet dataset.") + dataset_cls = SonnetDataset + sample_kwargs["prefix_len"] = args.prefix_len + sample_kwargs["return_prompt_formatted"] = True + elif args.dataset_name == "burstgpt": + dataset_cls = BurstGPTDataset + else: + raise ValueError(f"Unknown dataset name: {args.dataset_name}") + # Remove None values + sample_kwargs = {k: v for k, v in sample_kwargs.items() if v is not None} + return dataset_cls(**common_kwargs).sample(**sample_kwargs) + + def main(args: argparse.Namespace): + if args.seed is None: + args.seed = 0 print(args) random.seed(args.seed) - # Sample the requests. tokenizer = AutoTokenizer.from_pretrained( args.tokenizer, trust_remote_code=args.trust_remote_code) - if args.dataset is None: - vocab_size = tokenizer.vocab_size - requests = [] - for _ in range(args.num_prompts): - - request_tokenizer = tokenizer - lora_request: Optional[LoRARequest] = None - if args.enable_lora: - lora_request, lora_tokenizer = get_random_lora_request(args) - if lora_tokenizer: - request_tokenizer = lora_tokenizer - - # Synthesize a prompt with the given input length. - candidate_ids = [ - random.randint(0, vocab_size - 1) - for _ in range(args.input_len) - ] - # As tokenizer may add additional tokens like BOS, we need to try - # different lengths to get the desired input length. 
- for _ in range(5): # Max attempts to correct - candidate_prompt = request_tokenizer.decode(candidate_ids) - tokenized_len = len(request_tokenizer.encode(candidate_prompt)) - - if tokenized_len == args.input_len: - break - - # Adjust length based on difference - diff = args.input_len - tokenized_len - if diff > 0: - candidate_ids.extend([ - random.randint(100, vocab_size - 100) - for _ in range(diff) - ]) - else: - candidate_ids = candidate_ids[:diff] - requests.append( - SampleRequest(prompt=candidate_prompt, - prompt_len=args.input_len, - expected_output_len=args.output_len, - lora_request=lora_request)) - else: - requests = sample_requests(tokenizer, args) - + requests = get_requests(args, tokenizer) is_multi_modal = any(request.multi_modal_data is not None for request in requests) if args.backend == "vllm": @@ -429,14 +298,17 @@ def main(args: argparse.Namespace): args.n, AsyncEngineArgs.from_cli_args(args), args.disable_frontend_multiprocessing, + args.disable_detokenize, )) else: elapsed_time = run_vllm(requests, args.n, - EngineArgs.from_cli_args(args)) + EngineArgs.from_cli_args(args), + args.disable_detokenize) elif args.backend == "hf": assert args.tensor_parallel_size == 1 elapsed_time = run_hf(requests, args.model, tokenizer, args.n, - args.hf_max_batch_size, args.trust_remote_code) + args.hf_max_batch_size, args.trust_remote_code, + args.disable_detokenize) elif args.backend == "mii": elapsed_time = run_mii(requests, args.model, args.tensor_parallel_size, args.output_len) @@ -450,7 +322,7 @@ def main(args: argparse.Namespace): print("\033[91mWARNING\033[0m: Multi-modal request detected. The " "following metrics are not accurate because image tokens are not" " counted. See vllm-project/vllm/issues/9778 for details.") - # TODO(vllm-project/vllm/issues/9778): Count molti-modal token length. + # TODO(vllm-project/vllm/issues/9778): Count multi-modal token length. print(f"Throughput: {len(requests) / elapsed_time:.2f} requests/s, " f"{total_num_tokens / elapsed_time:.2f} total tokens/s, " f"{total_output_tokens / elapsed_time:.2f} output tokens/s") @@ -475,12 +347,23 @@ def main(args: argparse.Namespace): type=str, choices=["vllm", "hf", "mii"], default="vllm") - parser.add_argument("--dataset", + parser.add_argument("--dataset-name", + type=str, + choices=["sharegpt", "random", "sonnet", "burstgpt"], + help="Name of the dataset to benchmark on.", + default="sharegpt") + parser.add_argument( + "--dataset", + type=str, + default=None, + help="Path to the ShareGPT dataset, will be deprecated in\ + the next release. The dataset is expected to " + "be a json in form of list[dict[..., conversations: " + "list[dict[..., value: ]]]]") + parser.add_argument("--dataset-path", type=str, default=None, - help="Path to the dataset. The dataset is expected to " - "be a json in form of list[dict[..., conversations: " - "list[dict[..., value: ]]]]") + help="Path to the dataset") parser.add_argument("--input-len", type=int, default=None, @@ -515,6 +398,11 @@ def main(args: argparse.Namespace): action='store_true', default=False, help="Disable decoupled async engine frontend.") + parser.add_argument( + "--disable-detokenize", + action="store_true", + help=("Do not detokenize the response (i.e. do not include " + "detokenization time in the measurement)")) # LoRA parser.add_argument( "--lora-path", @@ -522,14 +410,35 @@ def main(args: argparse.Namespace): default=None, help="Path to the lora adapters to use. 
This can be an absolute path, " "a relative path, or a Hugging Face model identifier.") + parser.add_argument("--prefix-len", + type=int, + default=None, + help="Number of prefix tokens per request." + "This is for the RandomDataset and SonnetDataset") + # random dataset + parser.add_argument( + "--random-range-ratio", + type=float, + default=1.0, + help="Range of sampled ratio of input/output length, " + "used only for RandomDataSet.", + ) parser = AsyncEngineArgs.add_cli_args(parser) args = parser.parse_args() if args.tokenizer is None: args.tokenizer = args.model - if args.dataset is None: - assert args.input_len is not None - assert args.output_len is not None + if args.dataset is not None: + warnings.warn( + "The '--dataset' argument will be deprecated in the next " + "release. Please use '--dataset-name' and " + "'--dataset-path' in the future runs.", + stacklevel=2) + args.dataset_path = args.dataset + if args.dataset is None and args.dataset_path is None: + # for random dataset, the default sampling setting is in + # benchmark_dataset.RandomDataset + print("When dataset is not set, it will default to random dataset") else: assert args.input_len is None if args.enable_lora: diff --git a/benchmarks/kernels/benchmark_layernorm.py b/benchmarks/kernels/benchmark_layernorm.py index d265c91bfeff..e12d74c01e43 100644 --- a/benchmarks/kernels/benchmark_layernorm.py +++ b/benchmarks/kernels/benchmark_layernorm.py @@ -40,7 +40,7 @@ def run_cuda_benchmark(num_iters: int, profile: bool = False) -> float: end_time = time.perf_counter() if profile: - torch.cuda.cudart().cudaProfilerStart() + torch.cuda.cudart().cudaProfilerStop() return (end_time - start_time) / num_iters # Warmup. diff --git a/benchmarks/kernels/benchmark_lora.py b/benchmarks/kernels/benchmark_lora.py index 5eaeec017053..3c4d6a6aa464 100644 --- a/benchmarks/kernels/benchmark_lora.py +++ b/benchmarks/kernels/benchmark_lora.py @@ -153,7 +153,6 @@ def ref_group_gemm(ref_out: torch.Tensor, input: torch.Tensor, result = torch.nn.functional.linear(x, w) result *= scaling out_list.append(result) - torch.cat(out_list, dim=0) cat_result = torch.cat(out_list, dim=0) diff --git a/benchmarks/kernels/benchmark_machete.py b/benchmarks/kernels/benchmark_machete.py index 3fa57bd7b233..a661ea9d7e60 100644 --- a/benchmarks/kernels/benchmark_machete.py +++ b/benchmarks/kernels/benchmark_machete.py @@ -45,7 +45,6 @@ def terse_type_name(dt): torch.float16: "fp16", torch.int8: "int8", torch.float8_e4m3fn: "fp8", - torch.bfloat16: "bf16", torch.float: "float", torch.int: "int", }[dt] @@ -259,7 +258,7 @@ def machete_create_bench_fn(bt: BenchmarkTensors, return lambda: ops.machete_mm( a=bt.a, - b_q=bt.w_q, + b_q=w_q, b_type=bt.wtype, b_group_scales=bt.w_g_s, b_group_zeros=w_g_zp, diff --git a/benchmarks/kernels/benchmark_moe.py b/benchmarks/kernels/benchmark_moe.py index c862dec81fcc..9de8d5af6242 100644 --- a/benchmarks/kernels/benchmark_moe.py +++ b/benchmarks/kernels/benchmark_moe.py @@ -1,7 +1,9 @@ # SPDX-License-Identifier: Apache-2.0 import argparse +import json import time +from contextlib import nullcontext from datetime import datetime from itertools import product from typing import Any, TypedDict @@ -40,6 +42,7 @@ def benchmark_config( use_fp8_w8a8: bool, use_int8_w8a16: bool, num_iters: int = 100, + block_quant_shape: List[int] = None, ) -> float: init_dtype = torch.float16 if use_fp8_w8a8 else dtype x = torch.randn(num_tokens, hidden_size, dtype=dtype) @@ -81,8 +84,24 @@ def benchmark_config( dtype=torch.float32) w2_scale = 
torch.randn((hidden_size, num_experts), dtype=torch.float32) if use_fp8_w8a8: - w1_scale = torch.randn(num_experts, dtype=torch.float32) - w2_scale = torch.randn(num_experts, dtype=torch.float32) + if block_quant_shape: + block_n, block_k = block_quant_shape[0], block_quant_shape[1] + E = num_experts + N = shard_intermediate_size // 2 + K = hidden_size + factor_for_scale = 1e-2 + n_tiles_w1 = (2 * N + block_n - 1) // block_n + n_tiles_w2 = (K + block_n - 1) // block_n + k_tiles_w1 = (K + block_k - 1) // block_k + k_tiles_w2 = (N + block_k - 1) // block_k + w1_scale = torch.rand((E, n_tiles_w1, k_tiles_w1), + dtype=torch.float32) * factor_for_scale + w2_scale = torch.rand((E, n_tiles_w2, k_tiles_w2), + dtype=torch.float32) * factor_for_scale + else: + w1_scale = torch.randn(num_experts, dtype=torch.float32) + w2_scale = torch.randn(num_experts, dtype=torch.float32) + a1_scale = torch.randn(1, dtype=torch.float32) a2_scale = torch.randn(1, dtype=torch.float32) @@ -111,6 +130,7 @@ def run(): w2_scale=w2_scale, a1_scale=a1_scale, a2_scale=a2_scale, + block_shape=block_quant_shape, ) # JIT compilation & warmup @@ -175,7 +195,8 @@ def get_rocm_tuning_space(use_fp16): return param_ranges -def get_configs_compute_bound(use_fp16) -> list[dict[str, int]]: +def get_configs_compute_bound(use_fp16, + block_quant_shape) -> list[dict[str, int]]: configs: list[BenchmarkConfig] = [] if current_platform.is_rocm(): @@ -204,17 +225,27 @@ def get_configs_compute_bound(use_fp16) -> list[dict[str, int]]: for config_values in product(*values): config = dict(zip(keys, config_values)) configs.append(config) + + # Remove configs that are not compatible with fp8 block quantization + # BLOCK_SIZE_K must be a multiple of block_k + # BLOCK_SIZE_N must be a multiple of block_n + if block_quant_shape is not None and not use_fp16: + block_n, block_k = block_quant_shape[0], block_quant_shape[1] + for config in configs[:]: + if config["BLOCK_SIZE_K"] % block_k != 0 or config[ + "BLOCK_SIZE_N"] % block_n != 0: + configs.remove(config) return configs def prune_rocm_search_space(num_tokens, shard_intermediate_size, hidden_size, - search_space, is_fp16): + search_space, is_fp16, topk): N1, K1 = shard_intermediate_size, hidden_size N2, K2 = hidden_size, shard_intermediate_size // 2 - pruned_space_1 = prune_rocm_configs(num_tokens * 2, N1, K1, search_space, - is_fp16) - pruned_space_2 = prune_rocm_configs(num_tokens * 2, N2, K2, search_space, - is_fp16) + pruned_space_1 = prune_rocm_configs(num_tokens * topk, N1, K1, + search_space, is_fp16) + pruned_space_2 = prune_rocm_configs(num_tokens * topk, N2, K2, + search_space, is_fp16) search_space = merge_unique_dicts(pruned_space_1, pruned_space_2) return search_space @@ -372,6 +403,7 @@ def tune( use_fp8_w8a8: bool, use_int8_w8a16: bool, search_space: list[dict[str, int]], + block_quant_shape: list[int], ) -> dict[str, int]: best_config = None best_time = float("inf") @@ -380,21 +412,24 @@ def tune( search_space = prune_rocm_search_space(num_tokens, shard_intermediate_size, hidden_size, search_space, - is_fp16) + is_fp16, topk) - with torch.cuda.device(self.device_id): + with torch.cuda.device(self.device_id) if current_platform.is_rocm( + ) else nullcontext(): for config in tqdm(search_space): try: - kernel_time = benchmark_config(config, - num_tokens, - num_experts, - shard_intermediate_size, - hidden_size, - topk, - dtype, - use_fp8_w8a8, - use_int8_w8a16, - num_iters=20) + kernel_time = benchmark_config( + config, + num_tokens, + num_experts, + shard_intermediate_size, + 
hidden_size, + topk, + dtype, + use_fp8_w8a8, + use_int8_w8a16, + num_iters=20, + block_quant_shape=block_quant_shape) except triton.runtime.autotuner.OutOfResources: # Some configurations may be invalid and fail to compile. continue @@ -436,8 +471,8 @@ def sort_config(config: BenchmarkConfig) -> BenchmarkConfig: def save_configs(configs: dict[int, BenchmarkConfig], num_experts: int, shard_intermediate_size: int, hidden_size: int, topk: int, - dtype: torch.dtype, use_fp8_w8a8: bool, - use_int8_w8a16: bool) -> None: + dtype: torch.dtype, use_fp8_w8a8: bool, use_int8_w8a16: bool, + block_quant_shape: List[int]) -> None: dtype_str = get_config_dtype_str(dtype, use_int8_w8a16=use_int8_w8a16, use_fp8_w8a8=use_fp8_w8a8) @@ -445,7 +480,7 @@ def save_configs(configs: dict[int, BenchmarkConfig], num_experts: int, # NOTE(woosuk): The current naming convention uses w2.shape[2], which # is the intermediate size after silu_and_mul. filename = get_config_file_name(num_experts, shard_intermediate_size // 2, - dtype_str) + dtype_str, block_quant_shape) print(f"Writing best config to {filename}...") with open(filename, "w") as f: @@ -455,7 +490,7 @@ def save_configs(configs: dict[int, BenchmarkConfig], num_experts: int, def main(args: argparse.Namespace): print(args) - + block_quant_shape = None config = AutoConfig.from_pretrained( args.model, trust_remote_code=args.trust_remote_code) if config.architectures[0] == "DbrxForCausalLM": @@ -474,6 +509,12 @@ def main(args: argparse.Namespace): topk = config.num_experts_per_tok intermediate_size = config.moe_intermediate_size shard_intermediate_size = 2 * intermediate_size // args.tp_size + block_quant_shape = config.quantization_config['weight_block_size'] + elif config.architectures[0] == "Qwen2MoeForCausalLM": + E = config.num_experts + topk = config.num_experts_per_tok + intermediate_size = config.moe_intermediate_size + shard_intermediate_size = 2 * intermediate_size // args.tp_size else: # Default: Mixtral. 
E = config.num_local_experts @@ -511,27 +552,30 @@ def _distribute(method: str, inputs: list[Any]) -> list[Any]: if args.tune: is_fp16 = not (use_fp8_w8a8 or use_int8_w8a16) - search_space = get_configs_compute_bound(is_fp16) + search_space = get_configs_compute_bound(is_fp16, block_quant_shape) print(f"Start tuning over {len(search_space)} configurations...") start = time.time() configs = _distribute( - "tune", [(batch_size, E, shard_intermediate_size, hidden_size, - topk, dtype, use_fp8_w8a8, use_int8_w8a16, search_space) - for batch_size in batch_sizes]) + "tune", + [(batch_size, E, shard_intermediate_size, hidden_size, topk, dtype, + use_fp8_w8a8, use_int8_w8a16, search_space, block_quant_shape) + for batch_size in batch_sizes]) best_configs = { M: sort_config(config) for M, config in zip(batch_sizes, configs) } save_configs(best_configs, E, shard_intermediate_size, hidden_size, - topk, dtype, use_fp8_w8a8, use_int8_w8a16) + topk, dtype, use_fp8_w8a8, use_int8_w8a16, + block_quant_shape) end = time.time() print(f"Tuning took {end - start:.2f} seconds") else: outputs = _distribute( - "benchmark", [(batch_size, E, shard_intermediate_size, hidden_size, - topk, dtype, use_fp8_w8a8, use_int8_w8a16) - for batch_size in batch_sizes]) + "benchmark", + [(batch_size, E, shard_intermediate_size, hidden_size, topk, dtype, + use_fp8_w8a8, use_int8_w8a16, block_quant_shape) + for batch_size in batch_sizes]) for batch_size, (config, kernel_time) in zip(batch_sizes, outputs): print(f"Batch size: {batch_size}, config: {config}") diff --git a/benchmarks/kernels/benchmark_paged_attention.py b/benchmarks/kernels/benchmark_paged_attention.py index 5445f7baf1d9..e8e4db875b7d 100644 --- a/benchmarks/kernels/benchmark_paged_attention.py +++ b/benchmarks/kernels/benchmark_paged_attention.py @@ -177,7 +177,7 @@ def run_cuda_benchmark(num_iters: int, profile: bool = False) -> float: end_time = time.perf_counter() if profile: - torch.cuda.cudart().cudaProfilerStart() + torch.cuda.cudart().cudaProfilerStop() return (end_time - start_time) / num_iters # Warmup. diff --git a/benchmarks/kernels/benchmark_quant.py b/benchmarks/kernels/benchmark_quant.py index 0ddea9390d77..b643897a60ee 100644 --- a/benchmarks/kernels/benchmark_quant.py +++ b/benchmarks/kernels/benchmark_quant.py @@ -40,7 +40,7 @@ def run_cuda_benchmark(num_iters: int, profile: bool = False) -> float: end_time = time.perf_counter() if profile: - torch.cuda.cudart().cudaProfilerStart() + torch.cuda.cudart().cudaProfilerStop() return (end_time - start_time) / num_iters # Warmup. diff --git a/benchmarks/kernels/deepgemm/README.md b/benchmarks/kernels/deepgemm/README.md new file mode 100644 index 000000000000..917e814010f8 --- /dev/null +++ b/benchmarks/kernels/deepgemm/README.md @@ -0,0 +1,129 @@ +# DeepSeek DeepGEMM Kernels Benchmark + +This directory includes benchmarks between DeepSeek's DeepGEMM block fp8 kernels against vLLM's existing triton and CUTLASS-based kernels. + +Currently this just includes dense GEMMs and only works on Hopper GPUs. + +## Setup + +You need to install vLLM in your usual fashion, then install DeepGEMM from source in its own directory: + +``` +git clone --recursive https://github.com/deepseek-ai/DeepGEMM +cd DeepGEMM +python setup.py install +uv pip install -e . +``` + +## Usage + +``` +python benchmark_fp8_block_dense_gemm.py +INFO 02-26 21:55:13 [__init__.py:207] Automatically detected platform cuda. 
+===== STARTING FP8 GEMM BENCHMARK ===== +PyTorch version: 2.5.1+cu124 +CUDA version: 12.4 +Triton version: 3.1.0 +Using device: NVIDIA H100 80GB HBM3 +WARNING 02-26 21:55:15 [fp8_utils.py:458] Using default W8A8 Block FP8 kernel config. Performance might be sub-optimal! Config file not found at /home/mgoin/code/vllm/vllm/model_executor/layers/quantization/utils/configs/N=4096,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +INFO 02-26 21:55:15 [fp8_utils.py:449] Using configuration from /home/mgoin/code/vllm/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json for W8A8 Block FP8 kernel. +WARNING 02-26 21:55:16 [fp8_utils.py:458] Using default W8A8 Block FP8 kernel config. Performance might be sub-optimal! Config file not found at /home/mgoin/code/vllm/vllm/model_executor/layers/quantization/utils/configs/N=18432,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +WARNING 02-26 21:55:17 [fp8_utils.py:458] Using default W8A8 Block FP8 kernel config. Performance might be sub-optimal! Config file not found at /home/mgoin/code/vllm/vllm/model_executor/layers/quantization/utils/configs/N=24576,K=1536,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +INFO 02-26 21:55:17 [fp8_utils.py:449] Using configuration from /home/mgoin/code/vllm/vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json for W8A8 Block FP8 kernel. +INFO 02-26 21:55:17 [fp8_utils.py:449] Using configuration from /home/mgoin/code/vllm/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json for W8A8 Block FP8 kernel. 
+ +===== PERFORMANCE COMPARISON ===== + +DeepGEMM Implementation: ++------+-------+-------+-----------+--------+--------+ +| m | n | k | Time (ฮผs) | TFLOPS | GB/s | ++------+-------+-------+-----------+--------+--------+ +| 8 | 4096 | 7168 | 102.9 | 4.6 | 286.4 | +| 8 | 7168 | 18432 | 70.8 | 29.8 | 1868.8 | +| 8 | 18432 | 7168 | 69.3 | 30.5 | 1911.8 | +| 64 | 4096 | 7168 | 69.1 | 54.4 | 439.0 | +| 64 | 7168 | 18432 | 69.4 | 243.6 | 1933.6 | +| 64 | 18432 | 7168 | 70.4 | 240.3 | 1917.2 | +| 64 | 24576 | 1536 | 70.1 | 68.9 | 584.6 | +| 64 | 32768 | 512 | 68.4 | 31.4 | 307.1 | +| 64 | 7168 | 16384 | 69.5 | 216.3 | 1718.5 | +| 128 | 4096 | 7168 | 141.1 | 53.3 | 222.1 | +| 128 | 7168 | 18432 | 71.9 | 470.5 | 1896.1 | +| 128 | 18432 | 7168 | 69.3 | 488.2 | 1988.2 | +| 1024 | 4096 | 7168 | 89.7 | 670.1 | 502.5 | +| 1024 | 18432 | 7168 | 279.0 | 969.8 | 635.2 | +| 2048 | 4096 | 7168 | 175.1 | 687.0 | 347.4 | +| 4096 | 4096 | 7168 | 335.4 | 717.0 | 275.1 | ++------+-------+-------+-----------+--------+--------+ + +vLLM Triton Implementation: ++------+-------+-------+-----------+--------+--------+--------------+ +| m | n | k | Time (ฮผs) | TFLOPS | GB/s | vs DeepGEMM | ++------+-------+-------+-----------+--------+--------+--------------+ +| 8 | 4096 | 7168 | 74.0 | 6.3 | 398.2 | 1.39x faster | +| 8 | 7168 | 18432 | 89.6 | 23.6 | 1478.1 | 0.79x slower | +| 8 | 18432 | 7168 | 113.2 | 18.7 | 1170.4 | 0.61x slower | +| 64 | 4096 | 7168 | 79.4 | 47.3 | 382.2 | 0.87x slower | +| 64 | 7168 | 18432 | 98.5 | 171.7 | 1363.0 | 0.70x slower | +| 64 | 18432 | 7168 | 119.5 | 141.5 | 1129.4 | 0.59x slower | +| 64 | 24576 | 1536 | 37.6 | 128.4 | 1089.7 | 1.86x faster | +| 64 | 32768 | 512 | 38.7 | 55.5 | 542.6 | 1.77x faster | +| 64 | 7168 | 16384 | 86.1 | 174.5 | 1386.4 | 0.81x slower | +| 128 | 4096 | 7168 | 90.7 | 82.9 | 345.4 | 1.56x faster | +| 128 | 7168 | 18432 | 144.0 | 234.9 | 946.9 | 0.50x slower | +| 128 | 18432 | 7168 | 229.5 | 147.4 | 600.1 | 0.30x slower | +| 1024 | 4096 | 7168 | 242.3 | 248.2 | 186.1 | 0.37x slower | +| 1024 | 18432 | 7168 | 897.8 | 301.4 | 197.4 | 0.31x slower | +| 2048 | 4096 | 7168 | 463.0 | 259.7 | 131.4 | 0.38x slower | +| 4096 | 4096 | 7168 | 901.8 | 266.7 | 102.3 | 0.37x slower | ++------+-------+-------+-----------+--------+--------+--------------+ + +vLLM CUTLASS Implementation: ++------+-------+-------+-----------+--------+--------+--------------+--------------+ +| m | n | k | Time (ฮผs) | TFLOPS | GB/s | vs DeepGEMM | vs Triton | ++------+-------+-------+-----------+--------+--------+--------------+--------------+ +| 8 | 4096 | 7168 | 34.6 | 13.6 | 852.3 | 2.98x faster | 2.14x faster | +| 8 | 7168 | 18432 | 78.9 | 26.8 | 1677.3 | 0.90x slower | 1.13x faster | +| 8 | 18432 | 7168 | 81.2 | 26.0 | 1631.1 | 0.85x slower | 1.39x faster | +| 64 | 4096 | 7168 | 36.9 | 101.9 | 822.9 | 1.87x faster | 2.15x faster | +| 64 | 7168 | 18432 | 87.4 | 193.4 | 1535.2 | 0.79x slower | 1.13x faster | +| 64 | 18432 | 7168 | 85.0 | 199.0 | 1587.6 | 0.83x slower | 1.41x faster | +| 64 | 24576 | 1536 | 28.0 | 172.8 | 1465.8 | 2.51x faster | 1.35x faster | +| 64 | 32768 | 512 | 28.8 | 74.5 | 728.5 | 2.37x faster | 1.34x faster | +| 64 | 7168 | 16384 | 77.9 | 193.0 | 1532.8 | 0.89x slower | 1.11x faster | +| 128 | 4096 | 7168 | 39.1 | 192.4 | 802.0 | 3.61x faster | 2.32x faster | +| 128 | 7168 | 18432 | 93.7 | 360.8 | 1454.2 | 0.77x slower | 1.54x faster | +| 128 | 18432 | 7168 | 85.7 | 394.8 | 1608.0 | 0.81x slower | 2.68x faster | +| 1024 | 4096 | 7168 | 99.7 | 603.1 | 452.2 | 0.90x slower | 
2.43x faster | +| 1024 | 18432 | 7168 | 331.3 | 816.7 | 534.9 | 0.84x slower | 2.71x faster | +| 2048 | 4096 | 7168 | 198.3 | 606.6 | 306.7 | 0.88x slower | 2.34x faster | +| 4096 | 4096 | 7168 | 392.2 | 613.2 | 235.3 | 0.86x slower | 2.30x faster | ++------+-------+-------+-----------+--------+--------+--------------+--------------+ + +===== AVERAGE PERFORMANCE ===== ++----------------+------------+----------+---------------+ +| Implementation | Avg TFLOPS | Avg GB/s | Avg Time (ms) | ++----------------+------------+----------+---------------+ +| DeepGEMM | 310.98 | 1052.10 | 0.11 | +| vLLM Triton | 144.30 | 715.60 | 0.23 | +| vLLM CUTLASS | 286.78 | 1076.67 | 0.11 | ++----------------+------------+----------+---------------+ + +===== AVERAGE SPEEDUPS ===== ++-----------------------------+--------------+ +| Comparison | Speedup | ++-----------------------------+--------------+ +| DeepGEMM vs vLLM Triton | 1.71x faster | +| DeepGEMM vs vLLM CUTLASS | 0.94x slower | +| vLLM CUTLASS vs vLLM Triton | 1.84x faster | ++-----------------------------+--------------+ + +===== ACCURACY COMPARISON ===== ++----------------+-----------------------+ +| Implementation | Avg Diff vs Reference | ++----------------+-----------------------+ +| DeepGEMM | 0.000684 | +| vLLM Triton | 0.000684 | +| vLLM CUTLASS | 0.000684 | ++----------------+-----------------------+ +``` diff --git a/benchmarks/kernels/deepgemm/benchmark_fp8_block_dense_gemm.py b/benchmarks/kernels/deepgemm/benchmark_fp8_block_dense_gemm.py new file mode 100644 index 000000000000..7892f126e7d6 --- /dev/null +++ b/benchmarks/kernels/deepgemm/benchmark_fp8_block_dense_gemm.py @@ -0,0 +1,464 @@ +# SPDX-License-Identifier: Apache-2.0 +# fmt: off +# ruff: noqa: E501 +import time + +# Import DeepGEMM functions +import deep_gemm +import torch +import triton +from deep_gemm import calc_diff, ceil_div, get_col_major_tma_aligned_tensor + +# Import vLLM functions +from vllm import _custom_ops as ops +from vllm.model_executor.layers.quantization.utils.fp8_utils import ( + per_token_group_quant_fp8, w8a8_block_fp8_matmul) + + +# Copied from +# https://github.com/deepseek-ai/DeepGEMM/blob/78cacf70d41d15d688bd493ebc85845f7f2a3d5d/tests/test_core.py#L9 +def per_token_cast_to_fp8( + x: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]: + """Convert tensor to FP8 format with per-token scaling.""" + assert x.dim() == 2 and x.size(1) % 128 == 0 + m, n = x.shape + x_view = x.view(m, -1, 128) + x_amax = x_view.abs().float().amax(dim=2).view(m, -1).clamp(1e-4) + return (x_view * (448.0 / x_amax.unsqueeze(2))).to( + torch.float8_e4m3fn).view(m, n), (x_amax / 448.0).view(m, -1) + + +# Copied from +# https://github.com/deepseek-ai/DeepGEMM/blob/78cacf70d41d15d688bd493ebc85845f7f2a3d5d/tests/test_core.py#L17 +def per_block_cast_to_fp8( + x: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]: + """Convert tensor to FP8 format with per-block scaling.""" + assert x.dim() == 2 + m, n = x.shape + x_padded = torch.zeros((ceil_div(m, 128) * 128, ceil_div(n, 128) * 128), + dtype=x.dtype, + device=x.device) + x_padded[:m, :n] = x + x_view = x_padded.view(-1, 128, x_padded.size(1) // 128, 128) + x_amax = x_view.abs().float().amax(dim=(1, 3), keepdim=True).clamp(1e-4) + x_scaled = (x_view * (448.0 / x_amax)).to(torch.float8_e4m3fn) + return x_scaled.view_as(x_padded)[:m, :n].contiguous(), ( + x_amax / 448.0).view(x_view.size(0), x_view.size(2)) + + +def benchmark_shape(m: int, + n: int, + k: int, + warmup: int = 100, + repeat: int = 10000, + verbose: bool = 
False) -> dict: + """Benchmark all implementations for a specific (m, n, k) shape.""" + if verbose: + print(f"\n=== Benchmarking shape: m={m}, n={n}, k={k} ===") + + # Create test tensors + A = torch.randn((m, k), device='cuda', dtype=torch.bfloat16) + B = torch.randn((n, k), device='cuda', dtype=torch.bfloat16) + + # Reference result in BF16 + torch.cuda.synchronize() + C_ref = A @ B.t() + + # Pre-quantize B for all implementations + # (weights can be pre-quantized offline) + B_deepgemm, B_scale_deepgemm = per_block_cast_to_fp8(B) + B_vllm, B_scale_vllm = per_block_cast_to_fp8(B) + + # Block size configuration + block_size = [128, 128] + + # Pre-quantize A for all implementations + A_deepgemm, A_scale_deepgemm = per_token_cast_to_fp8(A) + A_scale_deepgemm = get_col_major_tma_aligned_tensor(A_scale_deepgemm) + C_deepgemm = torch.empty((m, n), device='cuda', dtype=torch.bfloat16) + A_vllm, A_scale_vllm = per_token_group_quant_fp8(A, block_size[1]) + A_vllm_cutlass, A_scale_vllm_cutlass = per_token_group_quant_fp8( + A, block_size[1], column_major_scales=True) + + # === DeepGEMM Implementation === + def deepgemm_gemm(): + # A quantization is inside the loop as it depends on activations + # A_deepgemm, A_scale_deepgemm = per_token_cast_to_fp8(A) + # A_deepgemm, A_scale_deepgemm = per_token_group_quant_fp8( + # A, block_size[1]) + # A_scale_aligned = get_col_major_tma_aligned_tensor(A_scale_deepgemm) + # C_deepgemm = torch.empty((m, n), device='cuda', dtype=torch.bfloat16) + deep_gemm.gemm_fp8_fp8_bf16_nt((A_deepgemm, A_scale_deepgemm), + (B_deepgemm, B_scale_deepgemm), + C_deepgemm) + return C_deepgemm + + # === vLLM Triton Implementation === + def vllm_triton_gemm(): + # A quantization is inside the loop as it depends on activations + # A_vllm, A_scale_vllm = per_token_group_quant_fp8(A, block_size[1]) + return w8a8_block_fp8_matmul(A_vllm, + B_vllm, + A_scale_vllm, + B_scale_vllm, + block_size, + output_dtype=torch.bfloat16) + + # === vLLM CUTLASS Implementation === + def vllm_cutlass_gemm(): + # A quantization is inside the loop as it depends on activations + # A_vllm_cutlass, A_scale_vllm_cutlass = per_token_group_quant_fp8( + # A, block_size[1], column_major_scales=True) + return ops.cutlass_scaled_mm(A_vllm_cutlass, + B_vllm.T, + scale_a=A_scale_vllm_cutlass, + scale_b=B_scale_vllm.T, + out_dtype=torch.bfloat16) + + # Run correctness check first + if verbose: + print("Running correctness check...") + C_deepgemm = deepgemm_gemm() + C_vllm_triton = vllm_triton_gemm() + C_vllm_cutlass = vllm_cutlass_gemm() + + deepgemm_diff = calc_diff(C_deepgemm, C_ref) + vllm_triton_diff = calc_diff(C_vllm_triton, C_ref) + vllm_cutlass_diff = calc_diff(C_vllm_cutlass, C_ref) + + if verbose: + print(f"DeepGEMM vs Reference difference: {deepgemm_diff:.6f}") + print(f"vLLM Triton vs Reference difference: {vllm_triton_diff:.6f}") + print(f"vLLM CUTLASS vs Reference difference: {vllm_cutlass_diff:.6f}") + print("vLLM Triton vs DeepGEMM difference: " + f"{calc_diff(C_vllm_triton, C_deepgemm):.6f}") + print("vLLM CUTLASS vs DeepGEMM difference: " + f"{calc_diff(C_vllm_cutlass, C_deepgemm):.6f}") + + # Benchmark implementations + implementations = { + "DeepGEMM": deepgemm_gemm, + "vLLM Triton": vllm_triton_gemm, + "vLLM CUTLASS": vllm_cutlass_gemm + } + + benchmark_results = { + "shape": { + "m": m, + "n": n, + "k": k + }, + "implementations": {} + } + + for name, func in implementations.items(): + # Warmup + for _ in range(warmup): + func() + torch.cuda.synchronize() + + # Timing loop + torch.cuda.synchronize() 
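+ # Launch the kernel `repeat` times back-to-back, then synchronize so the end timestamp accounts for all queued work.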
+ start = time.time() + for _ in range(repeat): + func() + torch.cuda.synchronize() + end = time.time() + + # Calculate timing and TFLOPS + avg_time_ms = (end - start) / repeat * 1000 + avg_time_us = avg_time_ms * 1000 + tflops = 2 * m * n * k / (avg_time_ms * 1e-3) / 1e12 + gb_s = (m * k + k * n + m * n * 2) / 1e9 / (avg_time_ms * 1e-3) + + benchmark_results["implementations"][name] = { + "time_ms": avg_time_ms, + "time_us": avg_time_us, + "tflops": tflops, + "gb_s": gb_s, + "diff": { + "DeepGEMM": + 0.0 if name == "DeepGEMM" else calc_diff(func(), C_deepgemm), + "Reference": + deepgemm_diff if name == "DeepGEMM" else + (vllm_triton_diff + if name == "vLLM Triton" else vllm_cutlass_diff) + } + } + + if verbose: + print( + f"{name}: {avg_time_ms:.3f} ms, {tflops:.2f} TFLOPS, {gb_s:.2f} GB/s" + ) + + # Calculate speedups + baseline = benchmark_results["implementations"]["DeepGEMM"]["time_ms"] + for name, data in benchmark_results["implementations"].items(): + if name != "DeepGEMM": + speedup = baseline / data["time_ms"] + benchmark_results["implementations"][name][ + "speedup_vs_deepgemm"] = speedup + if verbose: + print(f"DeepGEMM is {1/speedup:.2f}x " + f"{'faster' if 1/speedup > 1 else 'slower'} than {name}") + + vllm_triton_time = benchmark_results["implementations"]["vLLM Triton"][ + "time_ms"] + vllm_cutlass_time = benchmark_results["implementations"]["vLLM CUTLASS"][ + "time_ms"] + cutlass_vs_triton = vllm_triton_time / vllm_cutlass_time + benchmark_results["implementations"]["vLLM CUTLASS"][ + "speedup_vs_triton"] = cutlass_vs_triton + if verbose: + print( + f"vLLM CUTLASS is {cutlass_vs_triton:.2f}x " + f"{'faster' if cutlass_vs_triton > 1 else 'slower'} than vLLM Triton" + ) + + return benchmark_results + + +def format_table_row(values, widths): + """Format a row with specified column widths.""" + return "| " + " | ".join(f"{val:{w}}" + for val, w in zip(values, widths)) + " |" + + +def print_table(headers, rows, title=None): + """Print a table with headers and rows.""" + if title: + print(f"\n{title}") + + # Calculate column widths based on headers and data + widths = [ + max(len(str(h)), max(len(str(row[i])) for row in rows)) + for i, h in enumerate(headers) + ] + + # Create separator line + separator = "+-" + "-+-".join("-" * w for w in widths) + "-+" + + # Print table + print(separator) + print(format_table_row(headers, widths)) + print(separator) + for row in rows: + print(format_table_row(row, widths)) + print(separator) + + +def format_speedup(value): + """Format speedup value with indicator if it's faster or slower.""" + return f"{value:.2f}x {'faster' if value > 1.0 else 'slower'}" + + +def run_benchmarks(verbose: bool = False): + """Run benchmarks for a set of common shapes.""" + print("===== STARTING FP8 GEMM BENCHMARK =====") + + # Make sure we're using the GPU + if not torch.cuda.is_available(): + print("CUDA not available! 
Tests require GPU.") + return + + # Print system information + print(f"PyTorch version: {torch.__version__}") + print(f"CUDA version: {torch.version.cuda}") + print(f"Triton version: {triton.__version__}") + print(f"Using device: {torch.cuda.get_device_name()}") + + # Enable TF32 for better performance + torch.backends.cuda.matmul.allow_tf32 = True + torch.backends.cudnn.allow_tf32 = True + + # Set seeds for reproducibility + torch.manual_seed(42) + torch.cuda.manual_seed(42) + + # Define benchmark shapes (m, n, k) + shapes = [ + (8, 4096, 7168), + (8, 7168, 18432), + (8, 18432, 7168), + (64, 4096, 7168), + (64, 7168, 18432), + (64, 18432, 7168), + (64, 24576, 1536), + (64, 32768, 512), + (64, 7168, 16384), + (128, 4096, 7168), + (128, 7168, 18432), + (128, 18432, 7168), + (1024, 4096, 7168), + (1024, 18432, 7168), + (2048, 4096, 7168), + (4096, 4096, 7168), + ] + shapes = [ + # (64, 2112, 7168), + (64, 24576, 1536), + (64, 32768, 512), + (64, 7168, 16384), + (64, 4096, 7168), + (64, 7168, 2048), + # (128, 2112, 7168), + (128, 24576, 1536), + (128, 32768, 512), + (128, 7168, 16384), + (128, 4096, 7168), + (128, 7168, 2048), + # (4096, 2112, 7168), + (4096, 24576, 1536), + (4096, 32768, 512), + (4096, 7168, 16384), + (4096, 4096, 7168), + (4096, 7168, 2048), + ] + + all_results = [] + for m, n, k in shapes: + result = benchmark_shape(m, n, k, verbose=verbose) + all_results.append(result) + + # Print results in a nicely formatted table + print("\n===== PERFORMANCE COMPARISON =====") + + # Print DeepGEMM table + deepgemm_headers = ["m", "n", "k", "Time (ฮผs)", "TFLOPS", "GB/s"] + deepgemm_rows = [] + for result in all_results: + shape = result["shape"] + impl_data = result["implementations"]["DeepGEMM"] + deepgemm_rows.append([ + shape["m"], shape["n"], shape["k"], f"{impl_data['time_us']:.1f}", + f"{impl_data['tflops']:.1f}", f"{impl_data['gb_s']:.1f}" + ]) + + print_table(deepgemm_headers, + deepgemm_rows, + title="DeepGEMM Implementation:") + + # Print vLLM Triton table + triton_headers = [ + "m", "n", "k", "Time (ฮผs)", "TFLOPS", "GB/s", "vs DeepGEMM" + ] + triton_rows = [] + for result in all_results: + shape = result["shape"] + impl_data = result["implementations"]["vLLM Triton"] + speedup = impl_data.get("speedup_vs_deepgemm", 1.0) + triton_rows.append([ + shape["m"], shape["n"], shape["k"], f"{impl_data['time_us']:.1f}", + f"{impl_data['tflops']:.1f}", f"{impl_data['gb_s']:.1f}", + format_speedup(speedup) + ]) + + print_table(triton_headers, + triton_rows, + title="vLLM Triton Implementation:") + + # Print vLLM CUTLASS table + cutlass_headers = [ + "m", "n", "k", "Time (ฮผs)", "TFLOPS", "GB/s", "vs DeepGEMM", + "vs Triton" + ] + cutlass_rows = [] + for result in all_results: + shape = result["shape"] + impl_data = result["implementations"]["vLLM CUTLASS"] + vs_deepgemm = impl_data.get("speedup_vs_deepgemm", 1.0) + vs_triton = impl_data.get("speedup_vs_triton", 1.0) + cutlass_rows.append([ + shape["m"], shape["n"], shape["k"], f"{impl_data['time_us']:.1f}", + f"{impl_data['tflops']:.1f}", f"{impl_data['gb_s']:.1f}", + format_speedup(vs_deepgemm), + format_speedup(vs_triton) + ]) + + print_table(cutlass_headers, + cutlass_rows, + title="vLLM CUTLASS Implementation:") + + # Calculate and print averages + print("\n===== AVERAGE PERFORMANCE =====") + + implementations = ["DeepGEMM", "vLLM Triton", "vLLM CUTLASS"] + avg_metrics = { + impl: { + "tflops": 0, + "gb_s": 0, + "time_ms": 0 + } + for impl in implementations + } + + for result in all_results: + for impl in implementations: + 
impl_data = result["implementations"][impl] + avg_metrics[impl]["tflops"] += impl_data["tflops"] + avg_metrics[impl]["gb_s"] += impl_data["gb_s"] + avg_metrics[impl]["time_ms"] += impl_data["time_ms"] + + num_shapes = len(all_results) + avg_headers = ["Implementation", "Avg TFLOPS", "Avg GB/s", "Avg Time (ms)"] + avg_rows = [] + + for impl in implementations: + avg_tflops = avg_metrics[impl]["tflops"] / num_shapes + avg_mem_bw = avg_metrics[impl]["gb_s"] / num_shapes + avg_time = avg_metrics[impl]["time_ms"] / num_shapes + avg_rows.append([ + impl, f"{avg_tflops:.2f}", f"{avg_mem_bw:.2f}", f"{avg_time:.2f}" + ]) + + print_table(avg_headers, avg_rows) + + # Calculate average speedups + avg_speedups = { + "DeepGEMM vs vLLM Triton": 0, + "DeepGEMM vs vLLM CUTLASS": 0, + "vLLM CUTLASS vs vLLM Triton": 0 + } + + for result in all_results: + deepgemm_time = result["implementations"]["DeepGEMM"]["time_ms"] + vllm_triton_time = result["implementations"]["vLLM Triton"]["time_ms"] + vllm_cutlass_time = result["implementations"]["vLLM CUTLASS"][ + "time_ms"] + + avg_speedups[ + "DeepGEMM vs vLLM Triton"] += vllm_triton_time / deepgemm_time + avg_speedups[ + "DeepGEMM vs vLLM CUTLASS"] += vllm_cutlass_time / deepgemm_time + avg_speedups[ + "vLLM CUTLASS vs vLLM Triton"] += vllm_triton_time / vllm_cutlass_time + + print("\n===== AVERAGE SPEEDUPS =====") + speedup_headers = ["Comparison", "Speedup"] + speedup_rows = [] + for comparison, total in avg_speedups.items(): + avg_speedup = total / num_shapes + status = "faster" if avg_speedup > 1 else "slower" + speedup_rows.append([comparison, f"{avg_speedup:.2f}x {status}"]) + + print_table(speedup_headers, speedup_rows) + + # Average accuracy comparison + print("\n===== ACCURACY COMPARISON =====") + avg_diff = {impl: 0 for impl in implementations} + + for result in all_results: + for impl in implementations: + avg_diff[impl] += result["implementations"][impl]["diff"][ + "Reference"] + + diff_headers = ["Implementation", "Avg Diff vs Reference"] + diff_rows = [] + for impl in implementations: + diff_rows.append([impl, f"{avg_diff[impl] / num_shapes:.6f}"]) + + print_table(diff_headers, diff_rows) + + +if __name__ == "__main__": + run_benchmarks(verbose=False) diff --git a/benchmarks/run_structured_output_benchmark.sh b/benchmarks/run_structured_output_benchmark.sh new file mode 100755 index 000000000000..8a777320f735 --- /dev/null +++ b/benchmarks/run_structured_output_benchmark.sh @@ -0,0 +1,64 @@ +#!/bin/bash + +# Define the model to use +MODEL=${1:-"Qwen/Qwen2.5-7B-Instruct"} + +# Define the backend to use +BACKEND=${2:-"vllm"} + +# Define the dataset to use +DATASET=${3:-"xgrammar_bench"} + +# Define the guided decoding backend +GUIDED_BACKEND=${4:-"xgrammar"} + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +OUTPUT_DIR=${5:-"$SCRIPT_DIR/structured_output_benchmark_results"} + +GUIDED_RATIO=${6:-0.5} + +# Create output directory if it doesn't exist +mkdir -p "$OUTPUT_DIR" + +# Define QPS values to test +QPS_VALUES=(70 60 50 25 20 15 10) + +# Common parameters +COMMON_PARAMS="--backend $BACKEND \ + --model $MODEL \ + --dataset $DATASET \ + --structured-output-backend $GUIDED_BACKEND \ + --structured-output-ratio $GUIDED_RATIO \ + --save-results \ + --result-dir $OUTPUT_DIR" + +echo "Starting structured output benchmark with model: $MODEL" +echo "Backend: $BACKEND" +echo "Dataset: $DATASET" +echo "Structured output backend: $GUIDED_BACKEND" +echo "Results will be saved to: $OUTPUT_DIR" +echo "----------------------------------------" + +# Run 
benchmarks with different QPS values +for qps in "${QPS_VALUES[@]}"; do + echo "Running benchmark with QPS: $qps" + + # Get git hash and branch for the filename + GIT_HASH=$(git rev-parse --short HEAD 2>/dev/null || echo "unknown") + GIT_BRANCH=$(git rev-parse --abbrev-ref HEAD 2>/dev/null || echo "unknown") + + # Construct filename for this run + FILENAME="${GUIDED_BACKEND}_${BACKEND}_${qps}qps_$(basename $MODEL)_${DATASET}_${GIT_HASH}.json" + + # Run the benchmark + python "$SCRIPT_DIR/benchmark_serving_structured_output.py" $COMMON_PARAMS \ + --request-rate $qps \ + --result-filename "$FILENAME" \ + --port ${PORT:-8000} + + echo "Completed benchmark with QPS: $qps" + echo "----------------------------------------" +done + +echo "All benchmarks completed!" +echo "Results saved to: $OUTPUT_DIR" diff --git a/benchmarks/structured_schemas/structured_schema_1.json b/benchmarks/structured_schemas/structured_schema_1.json index 6003698469e8..1bd189c9e704 100644 --- a/benchmarks/structured_schemas/structured_schema_1.json +++ b/benchmarks/structured_schemas/structured_schema_1.json @@ -1,113 +1,25 @@ { - "$schema": - "https://json-schema.org/draft/2020-12/schema", - "title": - "User Profile", - "type": - "object", + "type": "array", + "items": { + "type": "object", "properties": { - "userId": { - "type": "string", - "description": "Unique identifier for the user." - }, - "personalInfo": { - "type": "object", - "properties": { - "firstName": { - "type": "string", - "description": "The user's first name." - }, - "lastName": { - "type": "string", - "description": "The user's last name." - }, - "age": { - "type": "integer", - "minimum": 0, - "description": "The user's age." - }, - "phoneNumbers": { - "type": - "array", - "items": { - "type": "object", - "properties": { - "type": { - "type": "string", - "enum": ["home", "work", "mobile"], - "description": "Type of phone number." - }, - "number": { - "type": "string", - "pattern": "^\\+?[1-9]\\d{1,14}$", - "description": "Phone number in E.164 format." - } - }, - "required": ["type", "number"] - }, - "description": - "List of phone numbers associated with the user." - } - }, - "required": ["firstName", "lastName"] - }, - "address": { - "type": "object", - "properties": { - "street": { - "type": "string", - "description": "Street address." - }, - "city": { - "type": "string", - "description": "City name." - }, - "state": { - "type": "string", - "description": "State or province." - }, - "postalCode": { - "type": "string", - "pattern": "^\\d{5}(-\\d{4})?$", - "description": "Postal code." - }, - "country": { - "type": "string", - "description": "Country name." - } - }, - "required": ["street", "city", "state", "postalCode", "country"] - }, - "preferences": { - "type": "object", - "properties": { - "newsletterSubscribed": { - "type": - "boolean", - "description": - "Indicates if the user is subscribed to the newsletter." - }, - "favoriteCategories": { - "type": "array", - "items": { - "type": "string" - }, - "description": "List of user's favorite categories." - } - }, - "required": ["newsletterSubscribed"] - }, - "accountStatus": { - "type": "string", - "enum": ["active", "inactive", "suspended"], - "description": "Current status of the user's account." - }, - "registrationDate": { - "type": "string", - "format": "date-time", - "description": "ISO 8601 formatted date-time of user registration." 
- } + "name": { "type": "string" }, + "race": { "type": "string" }, + "class": { "type": "string" }, + "level": { "type": "integer" }, + "background": { "type": "string" }, + "alignment": { "type": "string" }, + "backstory": { "type": "string" } }, - "required": - ["userId", "personalInfo", "address", "accountStatus", "registrationDate"] -} \ No newline at end of file + "required": [ + "name", + "race", + "class", + "level", + "background", + "alignment", + "backstory" + ] + } +} + diff --git a/cmake/cpu_extension.cmake b/cmake/cpu_extension.cmake index 714abca2a5ff..ca2ffb1bc3c8 100644 --- a/cmake/cpu_extension.cmake +++ b/cmake/cpu_extension.cmake @@ -81,6 +81,7 @@ else() find_isa(${CPUINFO} "POWER9" POWER9_FOUND) find_isa(${CPUINFO} "asimd" ASIMD_FOUND) # Check for ARM NEON support find_isa(${CPUINFO} "bf16" ARM_BF16_FOUND) # Check for ARM BF16 support + find_isa(${CPUINFO} "S390" S390_FOUND) endif() @@ -129,8 +130,16 @@ elseif (ASIMD_FOUND) elseif(APPLE_SILICON_FOUND) message(STATUS "Apple Silicon Detected") set(ENABLE_NUMA OFF) +elseif (S390_FOUND) + message(STATUS "S390 detected") + # Check for S390 VXE support + list(APPEND CXX_COMPILE_FLAGS + "-mvx" + "-mzvector" + "-march=native" + "-mtune=native") else() - message(FATAL_ERROR "vLLM CPU backend requires AVX512, AVX2, Power9+ ISA or ARMv8 support.") + message(FATAL_ERROR "vLLM CPU backend requires AVX512, AVX2, Power9+ ISA, S390X ISA or ARMv8 support.") endif() # diff --git a/cmake/external_projects/vllm_flash_attn.cmake b/cmake/external_projects/vllm_flash_attn.cmake index ef6261fa6d9b..f2d01099097a 100644 --- a/cmake/external_projects/vllm_flash_attn.cmake +++ b/cmake/external_projects/vllm_flash_attn.cmake @@ -38,7 +38,7 @@ else() FetchContent_Declare( vllm-flash-attn GIT_REPOSITORY https://github.com/vllm-project/flash-attention.git - GIT_TAG 720c94869cf2e0ff5a706e9c7f1dce0939686ade + GIT_TAG 9bfa9869829d8c593527eb34c5271d0090f7ccc9 GIT_PROGRESS TRUE # Don't share the vllm-flash-attn build between build types BINARY_DIR ${CMAKE_BINARY_DIR}/vllm-flash-attn @@ -64,4 +64,4 @@ install( DESTINATION vllm_flash_attn COMPONENT _vllm_fa3_C FILES_MATCHING PATTERN "*.py" -) \ No newline at end of file +) diff --git a/csrc/cpu/attention.cpp b/csrc/cpu/attention.cpp index b9764056e8a2..0257d8ff16ba 100644 --- a/csrc/cpu/attention.cpp +++ b/csrc/cpu/attention.cpp @@ -24,8 +24,8 @@ struct KernelVecType { template <> struct KernelVecType { -#ifdef __powerpc64__ - // Power architecture-specific vector types +#if defined(__powerpc64__) || defined(__s390x__) + // Power and s390x architecture-specific vector types using q_load_vec_type = vec_op::FP32Vec8; using k_load_vec_type = vec_op::FP32Vec16; using v_load_vec_type = vec_op::FP32Vec16; diff --git a/csrc/cpu/cpu_types.hpp b/csrc/cpu/cpu_types.hpp index a71815106133..17bbe04eef94 100644 --- a/csrc/cpu/cpu_types.hpp +++ b/csrc/cpu/cpu_types.hpp @@ -7,6 +7,9 @@ #elif defined(__POWER9_VECTOR__) // ppc implementation #include "cpu_types_vsx.hpp" +#elif defined(__s390x__) + // s390 implementation + #include "cpu_types_vxe.hpp" #elif defined(__aarch64__) // arm implementation #include "cpu_types_arm.hpp" diff --git a/csrc/cpu/cpu_types_vxe.hpp b/csrc/cpu/cpu_types_vxe.hpp new file mode 100644 index 000000000000..ab8cbbbf4ec4 --- /dev/null +++ b/csrc/cpu/cpu_types_vxe.hpp @@ -0,0 +1,480 @@ + +#ifndef CPU_TYPES_VXE_HPP +#define CPU_TYPES_VXE_HPP + +#include +#include +#include +namespace vec_op { + +#define vec_neg(a) (-(a)) +#define vec_add(a, b) ((a) + (b)) +#define vec_sub(a, b) ((a) - (b)) 
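+// The remaining arithmetic and shift helpers below wrap the compiler's element-wise vector operators in the same way.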
+#define vec_mul(a, b) ((a) * (b)) +#define vec_div(a, b) ((a) / (b)) +#define vec_sr(a, b) ((a) >> (b)) // Vector Shift Right Algebaic +#define vec_sl(a, b) ((a) << (b)) // Vector Shift Left + +// FIXME: FP16 is not fully supported in Torch-CPU +#define VLLM_DISPATCH_CASE_FLOATING_TYPES(...) \ + AT_DISPATCH_CASE(at::ScalarType::Float, __VA_ARGS__) \ + AT_DISPATCH_CASE(at::ScalarType::BFloat16, __VA_ARGS__) + +#define VLLM_DISPATCH_FLOATING_TYPES(TYPE, NAME, ...) \ + AT_DISPATCH_SWITCH(TYPE, NAME, VLLM_DISPATCH_CASE_FLOATING_TYPES(__VA_ARGS__)) + +#ifndef CPU_OP_GUARD + #define CPU_KERNEL_GUARD_IN(NAME) + #define CPU_KERNEL_GUARD_OUT(NAME) +#else + #define CPU_KERNEL_GUARD_IN(NAME) \ + std::cout << #NAME << " invoked." << std::endl; + #define CPU_KERNEL_GUARD_OUT(NAME) \ + std::cout << #NAME << " exit." << std::endl; +#endif + +#define FORCE_INLINE __attribute__((always_inline)) inline + +namespace { +template +constexpr void unroll_loop_item(std::integer_sequence, F&& f) { + (f(std::integral_constant{}), ...); +} +}; // namespace + +template >> +constexpr void unroll_loop(F&& f) { + unroll_loop_item(std::make_integer_sequence{}, std::forward(f)); +} + +template +struct Vec { + constexpr static int get_elem_num() { return T::VEC_ELEM_NUM; } +}; + +typedef struct ss16x8x2_t { + __vector signed short val[2]; +} ss16x8x2_t; + +typedef struct ss16x8x4_t { + __vector signed short val[4]; +} ss16x8x4_t; + +typedef struct f32x4x2_t { + __vector float val[2]; +} f32x4x2_t; + +typedef struct f32x4x4_t { + __vector float val[4]; +} f32x4x4_t; + +struct FP32Vec8; +struct FP32Vec16; + +struct BF16Vec8 : public Vec { + constexpr static int VEC_ELEM_NUM = 8; + + __vector signed short reg; + + explicit BF16Vec8(const void* ptr) : reg(*(__vector signed short*)ptr) {} + explicit BF16Vec8(const FP32Vec8&); + + void save(void* ptr) const { + *reinterpret_cast<__vector signed short*>(ptr) = reg; + } +}; + +struct BF16Vec16 : public Vec { + constexpr static int VEC_ELEM_NUM = 16; + + ss16x8x2_t reg; + + explicit BF16Vec16(const void* ptr) { + // Load 256 bits in two parts + reg.val[0] = (__vector signed short)vec_xl(0, (signed short*)ptr); + reg.val[1] = (__vector signed short)vec_xl(16, (signed short*)ptr); + } + + explicit BF16Vec16(const FP32Vec16&); + + void save(void* ptr) const { + // Save 256 bits in two parts + vec_xst(reg.val[0], 0, (signed short*)ptr); + vec_xst(reg.val[1], 16, (signed short*)ptr); + } +}; + +const static __vector signed short zero = vec_splats((signed short)0); + +struct BF16Vec32 : public Vec { + constexpr static int VEC_ELEM_NUM = 32; + + ss16x8x4_t reg; + explicit BF16Vec32(const void* ptr) + : reg(*reinterpret_cast(ptr)) {} + + explicit BF16Vec32(ss16x8x4_t data) : reg(data) {} + + explicit BF16Vec32(const BF16Vec8& vec8_data) + : reg({vec8_data.reg, vec8_data.reg, vec8_data.reg, vec8_data.reg}) {} + + void save(void* ptr) const { *reinterpret_cast(ptr) = reg; } +}; + +struct FP32Vec4 : public Vec { + constexpr static int VEC_ELEM_NUM = 4; + union AliasReg { + __vector float reg; + float values[VEC_ELEM_NUM]; + }; + + __vector float reg; + + explicit FP32Vec4(float v) : reg(vec_splats(v)) {} + + explicit FP32Vec4() : reg(vec_splats(0.0f)) {} + + explicit FP32Vec4(const float* ptr) : reg(vec_xl(0, ptr)) {} + + explicit FP32Vec4(__vector float data) : reg(data) {} + + explicit FP32Vec4(const FP32Vec4& data) : reg(data.reg) {} +}; + +struct FP32Vec8 : public Vec { + constexpr static int VEC_ELEM_NUM = 8; + union AliasReg { + f32x4x2_t reg; + float values[VEC_ELEM_NUM]; + }; + + 
f32x4x2_t reg; + + explicit FP32Vec8(float v) { + reg.val[0] = vec_splats(v); + reg.val[1] = vec_splats(v); + } + + explicit FP32Vec8() { + reg.val[0] = vec_splats(0.0f); + reg.val[1] = vec_splats(0.0f); + } + + explicit FP32Vec8(const float* ptr) { + reg.val[0] = vec_xl(0, ptr); + reg.val[1] = vec_xl(16, ptr); + } + + explicit FP32Vec8(f32x4x2_t data) : reg(data) {} + + explicit FP32Vec8(const FP32Vec8& data) { + reg.val[0] = data.reg.val[0]; + reg.val[1] = data.reg.val[1]; + } + + explicit FP32Vec8(const BF16Vec8& v) { + reg.val[0] = (__vector float)vec_mergeh(zero, v.reg); + reg.val[1] = (__vector float)vec_mergel(zero, v.reg); + } + + float reduce_sum() const { + AliasReg ar; + ar.reg = reg; + float result = 0; + unroll_loop( + [&result, &ar](int i) { result += ar.values[i]; }); + + return result; + } + + FP32Vec8 exp() const { + // TODO: Vectorize this + AliasReg ar; + ar.reg = reg; + f32x4x4_t ret; + ret.val[0][0] = std::exp(ar.values[0]); + ret.val[0][1] = std::exp(ar.values[1]); + ret.val[0][2] = std::exp(ar.values[2]); + ret.val[0][3] = std::exp(ar.values[3]); + ret.val[1][0] = std::exp(ar.values[4]); + ret.val[1][1] = std::exp(ar.values[5]); + ret.val[1][2] = std::exp(ar.values[6]); + ret.val[1][3] = std::exp(ar.values[7]); + return FP32Vec8(f32x4x2_t({ret.val[0], ret.val[1]})); + } + + FP32Vec8 tanh() const { + // TODO: Vectorize this + AliasReg ar; + ar.reg = reg; + f32x4x4_t ret; + ret.val[0][0] = std::tanh(ar.values[0]); + ret.val[0][1] = std::tanh(ar.values[1]); + ret.val[0][2] = std::tanh(ar.values[2]); + ret.val[0][3] = std::tanh(ar.values[3]); + ret.val[1][0] = std::tanh(ar.values[4]); + ret.val[1][1] = std::tanh(ar.values[5]); + ret.val[1][2] = std::tanh(ar.values[6]); + ret.val[1][3] = std::tanh(ar.values[7]); + return FP32Vec8(f32x4x2_t({ret.val[0], ret.val[1]})); + } + + FP32Vec8 er() const { + // TODO: Vectorize this + AliasReg ar; + ar.reg = reg; + f32x4x4_t ret; + ret.val[0][0] = std::erf(ar.values[0]); + ret.val[0][1] = std::erf(ar.values[1]); + ret.val[0][2] = std::erf(ar.values[2]); + ret.val[0][3] = std::erf(ar.values[3]); + ret.val[1][0] = std::erf(ar.values[4]); + ret.val[1][1] = std::erf(ar.values[5]); + ret.val[1][2] = std::erf(ar.values[6]); + ret.val[1][3] = std::erf(ar.values[7]); + return FP32Vec8(f32x4x2_t({ret.val[0], ret.val[1]})); + } + + FP32Vec8 operator*(const FP32Vec8& b) const { + return FP32Vec8( + {vec_mul(reg.val[0], b.reg.val[0]), vec_mul(reg.val[1], b.reg.val[1])}); + } + + FP32Vec8 operator+(const FP32Vec8& b) const { + return FP32Vec8( + {vec_add(reg.val[0], b.reg.val[0]), vec_add(reg.val[1], b.reg.val[1])}); + } + + FP32Vec8 operator-(const FP32Vec8& b) const { + return FP32Vec8( + {vec_sub(reg.val[0], b.reg.val[0]), vec_sub(reg.val[1], b.reg.val[1])}); + } + + FP32Vec8 operator/(const FP32Vec8& b) const { + return FP32Vec8( + {vec_div(reg.val[0], b.reg.val[0]), vec_div(reg.val[1], b.reg.val[1])}); + } + + void save(float* ptr) const { + vec_xst(reg.val[0], 0, ptr); + vec_xst(reg.val[1], 16, ptr); + } +}; + +struct FP32Vec16 : public Vec { + constexpr static int VEC_ELEM_NUM = 16; + union AliasReg { + f32x4x4_t reg; + float values[VEC_ELEM_NUM]; + }; + + f32x4x4_t reg; + + explicit FP32Vec16(float v) { + reg.val[0] = vec_splats(v); + reg.val[1] = vec_splats(v); + reg.val[2] = vec_splats(v); + reg.val[3] = vec_splats(v); + } + + explicit FP32Vec16() { + reg.val[0] = vec_splats(0.0f); + reg.val[1] = vec_splats(0.0f); + reg.val[2] = vec_splats(0.0f); + reg.val[3] = vec_splats(0.0f); + } + + explicit FP32Vec16(const float* ptr) { + 
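+    // Load 64 bytes (16 floats) as four 128-bit chunks at byte offsets
+    // 0, 16, 32 and 48.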
reg.val[0] = vec_xl(0, ptr); + reg.val[1] = vec_xl(16, ptr); + reg.val[2] = vec_xl(32, ptr); + reg.val[3] = vec_xl(48, ptr); + } + + explicit FP32Vec16(f32x4x4_t data) : reg(data) {} + + explicit FP32Vec16(const FP32Vec16& data) { + reg.val[0] = data.reg.val[0]; + reg.val[1] = data.reg.val[1]; + reg.val[2] = data.reg.val[2]; + reg.val[3] = data.reg.val[3]; + } + + explicit FP32Vec16(const FP32Vec4& data) { + reg.val[0] = data.reg; + reg.val[1] = data.reg; + reg.val[2] = data.reg; + reg.val[3] = data.reg; + } + + explicit FP32Vec16(const FP32Vec8& data) { + reg.val[0] = data.reg.val[0]; + reg.val[1] = data.reg.val[1]; + reg.val[2] = data.reg.val[0]; + reg.val[3] = data.reg.val[1]; + } + + explicit FP32Vec16(const BF16Vec16& v) { + reg.val[0] = (__vector float)vec_mergeh(zero, v.reg.val[0]); + reg.val[1] = (__vector float)vec_mergel(zero, v.reg.val[0]); + reg.val[2] = (__vector float)vec_mergeh(zero, v.reg.val[1]); + reg.val[3] = (__vector float)vec_mergel(zero, v.reg.val[1]); + } + + explicit FP32Vec16(const BF16Vec8& v) : FP32Vec16(FP32Vec8(v)) {} + + FP32Vec16 operator*(const FP32Vec16& b) const { + return FP32Vec16(f32x4x4_t({vec_mul(reg.val[0], b.reg.val[0]), + vec_mul(reg.val[1], b.reg.val[1]), + vec_mul(reg.val[2], b.reg.val[2]), + vec_mul(reg.val[3], b.reg.val[3])})); + } + + FP32Vec16 operator+(const FP32Vec16& b) const { + return FP32Vec16(f32x4x4_t({vec_add(reg.val[0], b.reg.val[0]), + vec_add(reg.val[1], b.reg.val[1]), + vec_add(reg.val[2], b.reg.val[2]), + vec_add(reg.val[3], b.reg.val[3])})); + } + + FP32Vec16 operator-(const FP32Vec16& b) const { + return FP32Vec16(f32x4x4_t({vec_sub(reg.val[0], b.reg.val[0]), + vec_sub(reg.val[1], b.reg.val[1]), + vec_sub(reg.val[2], b.reg.val[2]), + vec_sub(reg.val[3], b.reg.val[3])})); + } + + FP32Vec16 operator/(const FP32Vec16& b) const { + return FP32Vec16(f32x4x4_t({vec_div(reg.val[0], b.reg.val[0]), + vec_div(reg.val[1], b.reg.val[1]), + vec_div(reg.val[2], b.reg.val[2]), + vec_div(reg.val[3], b.reg.val[3])})); + } + + float reduce_sum() const { + AliasReg ar; + ar.reg = reg; + float result = 0; + unroll_loop( + [&result, &ar](int i) { result += ar.values[i]; }); + + return result; + } + + template + float reduce_sub_sum(int idx) { + static_assert(VEC_ELEM_NUM % group_size == 0); + + AliasReg ar; + ar.reg = reg; + float result = 0; + const int start = idx * group_size; + unroll_loop( + [&result, &start, ar](int i) { result += ar.values[start + i]; }); + + return result; + } + + void save(float* ptr) const { + vec_xst(reg.val[0], 0, ptr); + vec_xst(reg.val[1], 16, ptr); + vec_xst(reg.val[2], 32, ptr); + vec_xst(reg.val[3], 48, ptr); + } +}; + +template +struct VecType { + using vec_type = void; +}; + +template +using vec_t = typename VecType::vec_type; + +template <> +struct VecType { + using vec_type = FP32Vec8; +}; + +template <> +struct VecType { + using vec_type = BF16Vec8; +}; + +template +void storeFP32(float v, T* ptr) { + *ptr = v; +} + +inline void fma(FP32Vec16& acc, FP32Vec16& a, FP32Vec16& b) { + acc = acc + a * b; +} + +namespace c10 { +struct BFloat16 { + uint16_t value; // Assume BFloat16 is defined as a struct containing a 16-bit + // value. 
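+  // Only the raw 16-bit storage matters here: the storeFP32 specialization
+  // below copies one half of a float's bit pattern into it (bf16 truncation).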
+}; +} // namespace c10 + +template <> +inline void storeFP32(float v, c10::BFloat16* ptr) { + c10::BFloat16 __attribute__((__may_alias__))* v_ptr = + reinterpret_cast(&v); + *ptr = *(v_ptr + 1); +} + +#ifndef __VEC_CLASS_FP_NAN + #define __VEC_CLASS_FP_NAN (1 << 6) +#endif + +const static __vector unsigned char omask = {2, 3, 6, 7, 10, 11, 14, 15, + 18, 19, 22, 23, 26, 27, 30, 31}; +const static __vector unsigned int bias = {0x00007fff, 0x00007fff, 0x00007fff, + 0x00007fff}; +const static __vector unsigned int nan = {0x7fc00000, 0x7fc00000, 0x7fc00000, + 0x7fc00000}; +const static __vector unsigned int sh16 = {16, 16, 16, 16}; +const static __vector unsigned int one = {1, 1, 1, 1}; + +inline BF16Vec8::BF16Vec8(const FP32Vec8& v) { + __vector unsigned int inp0 = (__vector unsigned int)(v.reg.val[0]); + __vector unsigned int inp1 = (__vector unsigned int)(v.reg.val[1]); + int cc; + __vector __bool int sel0 = + vec_fp_test_data_class(v.reg.val[0], __VEC_CLASS_FP_NAN, &cc); + __vector __bool int sel1 = + vec_fp_test_data_class(v.reg.val[1], __VEC_CLASS_FP_NAN, &cc); + inp0 = vec_sel(inp0, nan, sel0) >> sh16; + inp1 = vec_sel(inp1, nan, sel1) >> sh16; + reg = (__vector signed short)vec_perm(inp0, inp1, omask); +} + +inline BF16Vec16::BF16Vec16(const FP32Vec16& v) { + __vector unsigned int inp0 = (__vector unsigned int)(v.reg.val[0]); + __vector unsigned int inp1 = (__vector unsigned int)(v.reg.val[1]); + __vector unsigned int inp2 = (__vector unsigned int)(v.reg.val[2]); + __vector unsigned int inp3 = (__vector unsigned int)(v.reg.val[3]); + int cc; + __vector __bool int sel0 = + vec_fp_test_data_class(v.reg.val[0], __VEC_CLASS_FP_NAN, &cc); + __vector __bool int sel1 = + vec_fp_test_data_class(v.reg.val[1], __VEC_CLASS_FP_NAN, &cc); + __vector __bool int sel2 = + vec_fp_test_data_class(v.reg.val[2], __VEC_CLASS_FP_NAN, &cc); + __vector __bool int sel3 = + vec_fp_test_data_class(v.reg.val[3], __VEC_CLASS_FP_NAN, &cc); + inp0 = vec_sel(inp0, nan, sel0) >> sh16; + inp1 = vec_sel(inp1, nan, sel1) >> sh16; + inp2 = vec_sel(inp2, nan, sel2) >> sh16; + inp3 = vec_sel(inp3, nan, sel3) >> sh16; + reg.val[0] = (__vector signed short)vec_perm(inp0, inp1, omask); + reg.val[1] = (__vector signed short)vec_perm(inp2, inp3, omask); +} + +inline void prefetch(const void* addr) { void __dcbt(const void* addr); } + +}; // namespace vec_op + +#endif \ No newline at end of file diff --git a/csrc/cpu/quant.cpp b/csrc/cpu/quant.cpp index 33b163783288..6751e7e55fc5 100644 --- a/csrc/cpu/quant.cpp +++ b/csrc/cpu/quant.cpp @@ -25,7 +25,7 @@ struct KernelVecType { template <> struct KernelVecType { -#ifdef __powerpc64__ +#if defined(__powerpc64__) || defined(__s390x__) // Power architecture-specific vector type using load_vec_type = vec_op::FP32Vec16; #else diff --git a/csrc/cutlass_extensions/epilogue/scaled_mm_epilogues_c3x.hpp b/csrc/cutlass_extensions/epilogue/scaled_mm_epilogues_c3x.hpp index 1a0cd45f4e20..0a812dc56a99 100644 --- a/csrc/cutlass_extensions/epilogue/scaled_mm_epilogues_c3x.hpp +++ b/csrc/cutlass_extensions/epilogue/scaled_mm_epilogues_c3x.hpp @@ -22,7 +22,7 @@ struct identity { T operator()(T lhs) const { return lhs; } }; -template +template struct TrivialEpilogue { private: using Accum = cutlass::epilogue::fusion::Sm90AccFetch; @@ -44,32 +44,30 @@ struct TrivialEpilogue { * This class provides the common load descriptors for the * ScaledEpilogue[...] 
classes */ -template +template struct ScaledEpilogueBase { protected: using Accum = cutlass::epilogue::fusion::Sm90AccFetch; template using ColOrScalarLoad = cutlass::epilogue::fusion::Sm90ColOrScalarBroadcast< - 0 /*Stages*/, typename EpilogueDescriptor::TileShape, T, - Stride, Int<0>, Int<0>>>; + 0 /*Stages*/, TileShape, T, Stride, Int<0>, Int<0>>>; template using RowOrScalarLoad = cutlass::epilogue::fusion::Sm90RowOrScalarBroadcast< - 0 /*Stages*/, typename EpilogueDescriptor::TileShape, T, - Stride, Int<1>, Int<0>>>; + 0 /*Stages*/, TileShape, T, Stride, Int<1>, Int<0>>>; // Don't want to support nullptr by default template using ColLoad = cutlass::epilogue::fusion::Sm90ColBroadcast< - 0 /*Stages*/, typename EpilogueDescriptor::TileShape, T, T, - Stride, Int<0>, Int<0>>, 128 / sizeof_bits_v, EnableNullPtr>; + 0 /*Stages*/, TileShape, T, T, Stride, Int<0>, Int<0>>, + 128 / sizeof_bits_v, EnableNullPtr>; // Don't want to support nullptr by default template using RowLoad = cutlass::epilogue::fusion::Sm90RowBroadcast< - 0 /*Stages*/, typename EpilogueDescriptor::TileShape, T, T, - Stride, Int<1>, Int<0>>, 128 / sizeof_bits_v, EnableNullPtr>; + 0 /*Stages*/, TileShape, T, T, Stride, Int<1>, Int<0>>, + 128 / sizeof_bits_v, EnableNullPtr>; // This utility function constructs the arguments for the load descriptors // from a tensor. It can handle both row and column, as well as row/column or @@ -116,11 +114,11 @@ struct ScaledEpilogueBase { the A and B operands respectively. These scales may be either per-tensor or per row or column. */ -template +template struct ScaledEpilogue - : private ScaledEpilogueBase { + : private ScaledEpilogueBase { private: - using SUPER = ScaledEpilogueBase; + using SUPER = ScaledEpilogueBase; using Accum = typename SUPER::Accum; using ScaleA = typename SUPER::template ColOrScalarLoad; using ScaleB = typename SUPER::template RowOrScalarLoad; @@ -160,11 +158,11 @@ struct ScaledEpilogue * The bias tensor must be per-output channel. * ScaleA and ScaleB can be per-tensor or per-token/per-channel. */ -template +template struct ScaledEpilogueBias - : private ScaledEpilogueBase { + : private ScaledEpilogueBase { private: - using SUPER = ScaledEpilogueBase; + using SUPER = ScaledEpilogueBase; using Accum = typename SUPER::Accum; using ScaleA = typename SUPER::template ColOrScalarLoad; using ScaleB = typename SUPER::template RowOrScalarLoad; @@ -203,11 +201,11 @@ struct ScaledEpilogueBias * bias is a column vector instead of a row vector. Useful e.g. if we are * computing a GEMM via C^T += B^T A^T. This happens in the 2:4 sparse kernels. */ -template +template struct ScaledEpilogueColumnBias - : private ScaledEpilogueBase { + : private ScaledEpilogueBase { private: - using SUPER = ScaledEpilogueBase; + using SUPER = ScaledEpilogueBase; using Accum = typename SUPER::Accum; using ScaleA = typename SUPER::template ColOrScalarLoad; using ScaleB = typename SUPER::template RowOrScalarLoad; @@ -249,11 +247,11 @@ struct ScaledEpilogueColumnBias * * This epilogue also supports bias, which remains per-channel. */ -template +template struct ScaledEpilogueBiasAzp - : private ScaledEpilogueBase { + : private ScaledEpilogueBase { private: - using SUPER = ScaledEpilogueBase; + using SUPER = ScaledEpilogueBase; using Accum = typename SUPER::Accum; using ScaleA = typename SUPER::template ColOrScalarLoad; using ScaleB = typename SUPER::template RowOrScalarLoad; @@ -314,11 +312,11 @@ struct ScaledEpilogueBiasAzp * * This epilogue also supports bias, which remains per-channel. 
*/ -template +template struct ScaledEpilogueBiasAzpToken - : private ScaledEpilogueBase { + : private ScaledEpilogueBase { private: - using SUPER = ScaledEpilogueBase; + using SUPER = ScaledEpilogueBase; using Accum = typename SUPER::Accum; using ScaleA = typename SUPER::template ColOrScalarLoad; using ScaleB = typename SUPER::template RowOrScalarLoad; diff --git a/csrc/cutlass_extensions/gemm/collective/sm90_mma_tma_gmma_ss_warpspecialized_fp8_blockwise_scaling.hpp b/csrc/cutlass_extensions/gemm/collective/sm90_mma_tma_gmma_ss_warpspecialized_fp8_blockwise_scaling.hpp index 928a9500cbb0..d922a3349e1e 100644 --- a/csrc/cutlass_extensions/gemm/collective/sm90_mma_tma_gmma_ss_warpspecialized_fp8_blockwise_scaling.hpp +++ b/csrc/cutlass_extensions/gemm/collective/sm90_mma_tma_gmma_ss_warpspecialized_fp8_blockwise_scaling.hpp @@ -402,7 +402,7 @@ struct CollectiveMma< // TODO: test `scale_copy_a` with `ScaleMsPerTile` < 128 TiledCopy scale_copy_a = make_tiled_copy(SmemBlockScalingCopyAtomA{}, - Layout>{}, Layout>{}); // (1,1,1) + Layout>{}, Layout>{}); // (1,1,1) TiledCopy scale_copy_b = make_tiled_copy(SmemBlockScalingCopyAtomB{}, Layout>{}, Layout>{}); // (1,1,1) ThrCopy thr_scale_copy_a = scale_copy_a.get_slice(threadIdx.x); diff --git a/csrc/quantization/cutlass_w8a8/c3x/cutlass_gemm_caller.cuh b/csrc/quantization/cutlass_w8a8/c3x/cutlass_gemm_caller.cuh index 69a3f64cb0b0..26de32ce2b16 100644 --- a/csrc/quantization/cutlass_w8a8/c3x/cutlass_gemm_caller.cuh +++ b/csrc/quantization/cutlass_w8a8/c3x/cutlass_gemm_caller.cuh @@ -16,6 +16,7 @@ #include "cutlass/gemm/kernel/gemm_universal.hpp" #include "cutlass/epilogue/collective/collective_builder.hpp" #include "cutlass/gemm/collective/collective_builder.hpp" +#include "cutlass/util/packed_stride.hpp" #include "core/math.hpp" #include "cutlass_extensions/common.hpp" @@ -64,22 +65,28 @@ void cutlass_gemm_caller(torch::Tensor& out, torch::Tensor const& a, torch::Tensor const& b, EpilogueArgs&&... 
epilogue_params) { using ElementAB = typename Gemm::ElementAB; + using ElementC = typename Gemm::ElementC; using ElementD = typename Gemm::ElementD; using GemmKernel = typename Gemm::GemmKernel; - int64_t lda = a.stride(0); - int64_t ldb = b.stride(1); - int64_t ldc = out.stride(0); - - using StrideA = cute::Stride, int64_t>; - using StrideB = cute::Stride, int64_t>; - using StrideC = typename Gemm::StrideC; - - StrideA a_stride{lda, cute::Int<1>{}, 0}; - StrideB b_stride{ldb, cute::Int<1>{}, 0}; - StrideC c_stride{ldc, cute::Int<1>{}, cute::Int<0>{}}; + using StrideA = typename Gemm::GemmKernel::StrideA; + using StrideB = typename Gemm::GemmKernel::StrideB; + using StrideC = typename Gemm::GemmKernel::StrideC; + using StrideD = StrideC; + using StrideAux = StrideC; typename GemmKernel::ProblemShape prob_shape = get_problem_shape(a, b); + auto [M, N, K, L] = prob_shape; + + StrideA a_stride = + cutlass::make_cute_packed_stride(StrideA{}, cute::make_shape(M, K, L)); + StrideB b_stride = + cutlass::make_cute_packed_stride(StrideB{}, cute::make_shape(N, K, L)); + StrideC c_stride = + cutlass::make_cute_packed_stride(StrideC{}, cute::make_shape(M, N, L)); + StrideD d_stride = + cutlass::make_cute_packed_stride(StrideD{}, cute::make_shape(M, N, L)); + StrideAux aux_stride = d_stride; auto a_ptr = static_cast(a.data_ptr()); auto b_ptr = static_cast(b.data_ptr()); @@ -87,10 +94,11 @@ void cutlass_gemm_caller(torch::Tensor& out, torch::Tensor const& a, b_stride}; auto c_ptr = static_cast(out.data_ptr()); + // auto d_ptr = static_cast(out.data_ptr()); typename GemmKernel::EpilogueArguments epilogue_args{ Gemm::Epilogue::prepare_args( std::forward(epilogue_params)...), - c_ptr, c_stride, c_ptr, c_stride}; + c_ptr, c_stride, c_ptr, d_stride}; cutlass_gemm_caller(a.device(), prob_shape, mainloop_args, epilogue_args); diff --git a/csrc/quantization/cutlass_w8a8/c3x/scaled_mm.cuh b/csrc/quantization/cutlass_w8a8/c3x/scaled_mm.cuh index d2f43e2b7a89..8f4df836bcc8 100644 --- a/csrc/quantization/cutlass_w8a8/c3x/scaled_mm.cuh +++ b/csrc/quantization/cutlass_w8a8/c3x/scaled_mm.cuh @@ -40,12 +40,7 @@ struct cutlass_3x_gemm { typename std::conditional, int32_t, float>::type; - using EpilogueDescriptor = - cutlass::epilogue::collective::detail::EpilogueDescriptor< - TileShape, cutlass::epilogue::collective::EpilogueTileAuto, ElementD, - ElementD, EpilogueSchedule>; - - using Epilogue = Epilogue_; + using Epilogue = Epilogue_; using StrideD = Stride, Int<0>>; using ElementC = void; @@ -88,4 +83,65 @@ struct cutlass_3x_gemm { struct GemmKernel : public KernelType {}; }; +template typename Epilogue_, + typename TileShape, typename ClusterShape, typename KernelSchedule, + typename EpilogueSchedule> +struct cutlass_3x_gemm_sm100 { + using ElementAB = ElementAB_; + using LayoutA = cutlass::layout::RowMajor; + static constexpr int AlignmentA = + 128 / cutlass::sizeof_bits::value; + + using LayoutB = cutlass::layout::ColumnMajor; + static constexpr int AlignmentB = + 128 / cutlass::sizeof_bits::value; + + using ElementC = void; + using LayoutC = cutlass::layout::RowMajor; + static constexpr int AlignmentC = + 128 / cutlass::sizeof_bits::value; + + using ElementD = ElementD_; + using LayoutD = cutlass::layout::RowMajor; + static constexpr int AlignmentD = AlignmentC; + + using ElementAcc = + typename std::conditional, int32_t, + float>::type; + using Epilogue = Epilogue_; + + // MMA type + using ElementAccumulator = float; + + // Epilogue types + using ElementBias = cutlass::half_t; + using ElementCompute = float; + 
using ElementAux = ElementD; + using LayoutAux = LayoutD; + using ElementAmax = float; + + using EVTCompute = typename Epilogue::EVTCompute; + + using CollectiveEpilogue = + typename cutlass::epilogue::collective::CollectiveBuilder< + cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp, TileShape, + ClusterShape, cutlass::epilogue::collective::EpilogueTileAuto, + ElementAccumulator, ElementCompute, ElementC, LayoutC, AlignmentC, + ElementD, LayoutD, AlignmentD, EpilogueSchedule, + EVTCompute>::CollectiveOp; + + using CollectiveMainloop = + typename cutlass::gemm::collective::CollectiveBuilder< + cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp, ElementAB, + LayoutA, AlignmentA, ElementAB, LayoutB, AlignmentB, + ElementAccumulator, TileShape, ClusterShape, + cutlass::gemm::collective::StageCountAutoCarveout( + sizeof(typename CollectiveEpilogue::SharedStorage))>, + KernelSchedule>::CollectiveOp; + + using GemmKernel = cutlass::gemm::kernel::GemmUniversal< + Shape, CollectiveMainloop, CollectiveEpilogue, void>; +}; + } // namespace vllm diff --git a/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_kernels.hpp b/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_kernels.hpp index 7ede9e067477..85272804774d 100644 --- a/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_kernels.hpp +++ b/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_kernels.hpp @@ -30,4 +30,10 @@ void cutlass_scaled_mm_blockwise_sm90_fp8(torch::Tensor& out, torch::Tensor const& a_scales, torch::Tensor const& b_scales); +void cutlass_scaled_mm_sm100_fp8(torch::Tensor& out, torch::Tensor const& a, + torch::Tensor const& b, + torch::Tensor const& a_scales, + torch::Tensor const& b_scales, + std::optional const& bias); + } // namespace vllm diff --git a/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm100_fp8.cu b/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm100_fp8.cu new file mode 100644 index 000000000000..cf2cccc913f6 --- /dev/null +++ b/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm100_fp8.cu @@ -0,0 +1,24 @@ +#include "scaled_mm_kernels.hpp" +#include "scaled_mm_sm100_fp8_dispatch.cuh" +#include "cutlass_extensions/epilogue/scaled_mm_epilogues_c3x.hpp" + +namespace vllm { + +void cutlass_scaled_mm_sm100_fp8(torch::Tensor& out, torch::Tensor const& a, + torch::Tensor const& b, + torch::Tensor const& a_scales, + torch::Tensor const& b_scales, + std::optional const& bias) { + TORCH_CHECK(a_scales.is_contiguous() && b_scales.is_contiguous()); + if (bias) { + TORCH_CHECK(bias->dtype() == out.dtype(), + "currently bias dtype must match output dtype ", out.dtype()); + return cutlass_scaled_mm_sm100_fp8_epilogue( + out, a, b, a_scales, b_scales, *bias); + } else { + return cutlass_scaled_mm_sm100_fp8_epilogue( + out, a, b, a_scales, b_scales); + } +} + +} // namespace vllm diff --git a/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm100_fp8_dispatch.cuh b/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm100_fp8_dispatch.cuh new file mode 100644 index 000000000000..468b77d9593b --- /dev/null +++ b/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm100_fp8_dispatch.cuh @@ -0,0 +1,67 @@ +#pragma once + +#include "scaled_mm.cuh" +#include "cutlass_gemm_caller.cuh" + +/** + * This file defines Gemm kernel configurations for SM100 (fp8) based on the + * Gemm shape. 
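+ * Only a single default tile/cluster configuration is defined here for now;
+ * the dispatch helper below selects it for every problem shape.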
+ */ + +namespace vllm { + +using c3x::cutlass_gemm_caller; + +template typename Epilogue> +struct sm100_fp8_config_default { + static_assert(std::is_same()); + using KernelSchedule = cutlass::gemm::collective::KernelScheduleAuto; + using EpilogueSchedule = cutlass::epilogue::collective::EpilogueScheduleAuto; + using TileShape = Shape<_256, _128, _64>; + using ClusterShape = Shape<_2, _2, _1>; + using Cutlass3xGemm = + cutlass_3x_gemm_sm100; +}; + +template typename Epilogue, + typename... EpilogueArgs> +inline void cutlass_gemm_sm100_fp8_dispatch(torch::Tensor& out, + torch::Tensor const& a, + torch::Tensor const& b, + EpilogueArgs&&... args) { + static_assert(std::is_same()); + TORCH_CHECK(a.dtype() == torch::kFloat8_e4m3fn); + TORCH_CHECK(b.dtype() == torch::kFloat8_e4m3fn); + + using Cutlass3xGemmDefault = + typename sm100_fp8_config_default::Cutlass3xGemm; + return cutlass_gemm_caller( + out, a, b, std::forward(args)...); +} + +template