[Hardware][Intel] Add CPU inference backend #3634
Merged
Commits (32, all by bigPYJ1151):
46cd747  Add CPU CMake extension.
4060d26  Integrate CPU module.
4de2017  Refactor attention
586295e  rebase
528ddfc  Add Doc
cc3265f  Refactor
1851c27  Fix
7a0506d  Fix comments
3da36b5  make ruff happy
26f416c  Fix doc
1261df4  Fix isort
0e2342d  Fix format
4d94ca3  Refactor sdpa
7f3dd1d  Fix doc
3c32c38  Refine doc.
1ca0210  Remove unsupported ops.
ddb04da  Update Dockerfile.
f025c54  Update doc
0b03d96  Fix log
010380e  warmup kv cache to reduce pagefault overhead.
9a0eaf1  Fix
8660b57  Update doc
153e239  Fix comments.
8a4dfd4  Fix doc
fe99d3a  Revert forwarding DeviceConfig to CacheEngine.
ef508a2  Fix comments.
ca97811  make cpu-kvcache-space as env.
a987411  minor fix
42bb988  Fix
32f9521  Add CI
44df554  Add CI
738bd74  Add CI
@@ -0,0 +1,14 @@ (new file: CI test script)

# This script builds the CPU docker image and runs the offline inference inside the container.
# It serves as a sanity check for compilation and basic model usage.
set -ex

# Try building the docker image
docker build -t cpu-test -f Dockerfile.cpu .

# Set up cleanup
remove_docker_container() { docker rm -f cpu-test || true; }
trap remove_docker_container EXIT
remove_docker_container

# Run the image and launch offline inference
docker run --network host --env VLLM_CPU_KVCACHE_SPACE=1 --name cpu-test cpu-test python3 examples/offline_inference.py
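The CI entry point runs examples/offline_inference.py inside the container. For orientation, a minimal offline-inference script of this shape (a sketch against vLLM's public LLM/SamplingParams API; the model name is illustrative and the actual example file in the repo may differ):

from vllm import LLM, SamplingParams

# Load a small model; with this PR, the CPU backend is selected when the
# package was built with VLLM_TARGET_DEVICE=cpu.
llm = LLM(model="facebook/opt-125m")
sampling_params = SamplingParams(temperature=0.8, top_p=0.95)

outputs = llm.generate(["Hello, my name is"], sampling_params)
for output in outputs:
    print(output.outputs[0].text)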
@@ -0,0 +1,20 @@ (new file: Dockerfile.cpu)

# This vLLM Dockerfile is used to construct an image that can build and run vLLM on the x86 CPU platform.

FROM ubuntu:22.04

RUN apt-get update -y \
    && apt-get install -y git wget vim numactl gcc-12 g++-12 python3 python3-pip \
    && update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 10 --slave /usr/bin/g++ g++ /usr/bin/g++-12

RUN pip install --upgrade pip \
    && pip install wheel packaging ninja "setuptools>=49.4.0" numpy

COPY ./ /workspace/vllm

WORKDIR /workspace/vllm

RUN pip install -v -r requirements-cpu.txt --extra-index-url https://download.pytorch.org/whl/cpu

RUN VLLM_TARGET_DEVICE=cpu python3 setup.py install

CMD ["/bin/bash"]
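Per the commit "make cpu-kvcache-space as env", the KV cache budget is supplied through the VLLM_CPU_KVCACHE_SPACE environment variable (the CI script above passes it via --env). When scripting against the installed package directly, it can also be set before vLLM spins up; a sketch, with the unit assumed to be GB (verify against the PR's docs):

import os

# Reserve space for the CPU KV cache before constructing the engine.
os.environ["VLLM_CPU_KVCACHE_SPACE"] = "4"  # assumed GB; check the PR docs

from vllm import LLM
llm = LLM(model="facebook/opt-125m")  # model choice is illustrative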
@@ -0,0 +1,90 @@ (new file: CPU extension CMake module)

set(CMAKE_EXPORT_COMPILE_COMMANDS ON)

#
# Define environment variables for special configurations
#
if(DEFINED ENV{VLLM_CPU_AVX512BF16})
    set(ENABLE_AVX512BF16 ON)
endif()

include_directories("${CMAKE_SOURCE_DIR}/csrc")

#
# Check the compile flags
#
list(APPEND CXX_COMPILE_FLAGS
    "-fopenmp"
    "-DVLLM_CPU_EXTENSION")

execute_process(COMMAND cat /proc/cpuinfo
                RESULT_VARIABLE CPUINFO_RET
                OUTPUT_VARIABLE CPUINFO)

if (NOT CPUINFO_RET EQUAL 0)
    message(FATAL_ERROR "Failed to check CPU features via /proc/cpuinfo")
endif()

function (find_isa CPUINFO TARGET OUT)
    string(FIND ${CPUINFO} ${TARGET} ISA_FOUND)
    if(NOT ISA_FOUND EQUAL -1)
        set(${OUT} ON PARENT_SCOPE)
    else()
        set(${OUT} OFF PARENT_SCOPE)
    endif()
endfunction()

find_isa(${CPUINFO} "avx512f" AVX512_FOUND)

if (AVX512_FOUND)
    list(APPEND CXX_COMPILE_FLAGS
        "-mavx512f"
        "-mavx512vl"
        "-mavx512bw"
        "-mavx512dq")

    find_isa(${CPUINFO} "avx512_bf16" AVX512BF16_FOUND)
    if (AVX512BF16_FOUND OR ENABLE_AVX512BF16)
        if (CMAKE_CXX_COMPILER_ID STREQUAL "GNU" AND
            CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 12.3)
            list(APPEND CXX_COMPILE_FLAGS "-mavx512bf16")
        else()
            message(WARNING "Disabling AVX512-BF16 ISA support; it requires gcc/g++ >= 12.3")
        endif()
    else()
        message(WARNING "Disabling AVX512-BF16 ISA support; no avx512_bf16 found in local CPU flags."
                        " If cross-compilation is required, please set env VLLM_CPU_AVX512BF16=1.")
    endif()
else()
    message(FATAL_ERROR "vLLM CPU backend requires AVX512 ISA support.")
endif()

message(STATUS "CPU extension compile flags: ${CXX_COMPILE_FLAGS}")

#
# Define extension targets
#

#
# _C extension
#
set(VLLM_EXT_SRC
    "csrc/cpu/activation.cpp"
    "csrc/cpu/attention.cpp"
    "csrc/cpu/cache.cpp"
    "csrc/cpu/layernorm.cpp"
    "csrc/cpu/pos_encoding.cpp"
    "csrc/cpu/pybind.cpp")

define_gpu_extension_target(
    _C
    DESTINATION vllm
    LANGUAGE CXX
    SOURCES ${VLLM_EXT_SRC}
    COMPILE_FLAGS ${CXX_COMPILE_FLAGS}
    WITH_SOABI
)

add_custom_target(default)
message(STATUS "Enabling C extension.")
add_dependencies(default _C)
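The find_isa helper above gates compilation on flags scraped from /proc/cpuinfo: AVX512F is mandatory, AVX512-BF16 is opportunistic. The same substring check can be reproduced from Python when debugging why the build disabled an ISA (a standalone sketch, not part of the PR):

def find_isa(target: str) -> bool:
    """Mimic the CMake find_isa(): substring search over /proc/cpuinfo."""
    with open("/proc/cpuinfo") as f:
        return target in f.read()

print("avx512f:    ", find_isa("avx512f"))
print("avx512_bf16:", find_isa("avx512_bf16"))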
@@ -0,0 +1,148 @@ (new file: CPU activation kernels, csrc/cpu/activation.cpp)

#include "cpu_types.hpp"

namespace {
template <typename scalar_t, vec_op::FP32Vec8 (*func)(const vec_op::FP32Vec8 &),
          bool is_gated>
void activation_kernel(int num_tokens, int d, scalar_t *__restrict__ input,
                       scalar_t *__restrict__ output) {
  using scalar_vec_t = vec_op::vec_t<scalar_t>;
  constexpr int VEC_ELEM_NUM = scalar_vec_t::get_elem_num();

  TORCH_CHECK(d % VEC_ELEM_NUM == 0);

#pragma omp parallel for
  for (int i = 0; i < num_tokens; ++i) {
    for (int j = 0; j < d; j += VEC_ELEM_NUM) {
      // Gated activations read an input row of width 2 * d: the activation is
      // applied to the first half and multiplied element-wise by the second.
      int start = i * d;
      if constexpr (is_gated) {
        start *= 2;
      }

      const scalar_vec_t x(input + start + j);
      const vec_op::FP32Vec8 f32_x(x);
      vec_op::FP32Vec8 f32_ans = func(f32_x);

      if constexpr (is_gated) {
        const scalar_vec_t y(input + start + d + j);
        const vec_op::FP32Vec8 f32_y(y);
        f32_ans = f32_y * f32_ans;
      }

      const scalar_vec_t result(f32_ans);
      result.save(output + i * d + j);
    }
  }
}
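For reference, the gated path of this kernel computes the usual fused "activation-and-mul" used in gated MLPs. A scalar PyTorch equivalent for the SiLU case (a sketch for illustration, not part of the diff):

import torch
import torch.nn.functional as F

def silu_and_mul_ref(x: torch.Tensor) -> torch.Tensor:
    """Reference for the gated kernel: input [..., 2*d] -> output [..., d]."""
    d = x.shape[-1] // 2
    return F.silu(x[..., :d]) * x[..., d:]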
FORCE_INLINE vec_op::FP32Vec8 silu_act(const vec_op::FP32Vec8 &x) {
  // silu(x) = x * sigmoid(x) = x / (1 + exp(-x))
  const vec_op::FP32Vec8 zeros(0.0);
  const vec_op::FP32Vec8 ones(1.0);
  return x / (ones + (zeros - x).exp());
}

FORCE_INLINE vec_op::FP32Vec8 gelu_new_act(const vec_op::FP32Vec8 &x) {
  const vec_op::FP32Vec8 ones(1.0);
  const vec_op::FP32Vec8 w1(0.79788456f); // sqrt(2 / pi)
  const vec_op::FP32Vec8 w2(0.044715f);
  const vec_op::FP32Vec8 w3(0.5);
  const vec_op::FP32Vec8 x3 = x * x * x;
  const vec_op::FP32Vec8 t = (w1 * (x + w2 * x3)).tanh();
  return w3 * x * (ones + t);
}

FORCE_INLINE vec_op::FP32Vec8 gelu_fast_act(const vec_op::FP32Vec8 &x) {
  const vec_op::FP32Vec8 ones(1.0);
  const vec_op::FP32Vec8 w1(0.79788456f); // sqrt(2 / pi)
  const vec_op::FP32Vec8 w2(0.044715f);
  const vec_op::FP32Vec8 w3(0.5);
  const vec_op::FP32Vec8 t = (x * w1 * (ones + x * w2 * x)).tanh();
  return w3 * x * (ones + t);
}

FORCE_INLINE vec_op::FP32Vec8 gelu_act(const vec_op::FP32Vec8 &x) {
  // Exact GELU: 0.5 * x * (1 + erf(x / sqrt(2))); er() is the erf helper
  // provided by cpu_types.hpp.
  const vec_op::FP32Vec8 ones(1.0);
  const vec_op::FP32Vec8 w1(M_SQRT1_2);
  const vec_op::FP32Vec8 w2(0.5);
  return x * w2 * (ones + (x * w1).er());
}

FORCE_INLINE vec_op::FP32Vec8 gelu_tanh_act(const vec_op::FP32Vec8 &x) {
  const vec_op::FP32Vec8 ones(1.0);
  const vec_op::FP32Vec8 w1(M_SQRT2 * M_2_SQRTPI * 0.5); // sqrt(2 / pi)
  const vec_op::FP32Vec8 w2(0.5);
  const vec_op::FP32Vec8 w3(0.044715);
  const vec_op::FP32Vec8 x_3 = x * x * x;
  const vec_op::FP32Vec8 inner = w1 * (x + x_3 * w3);
  return x * w2 * (ones + inner.tanh());
}
} // namespace
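Note that gelu_new_act, gelu_fast_act, and gelu_tanh_act all implement the tanh approximation of GELU, while gelu_act uses the exact erf form. Stock PyTorch exposes both variants, which gives a quick sanity check of how close they are (a sketch unrelated to the kernels above):

import torch
import torch.nn.functional as F

x = torch.linspace(-3.0, 3.0, steps=7)
exact = F.gelu(x)                        # erf-based GELU
approx = F.gelu(x, approximate="tanh")   # tanh approximation
print((exact - approx).abs().max())      # difference is small across this range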
void silu_and_mul(torch::Tensor &out,   // [..., d]
                  torch::Tensor &input) // [..., 2 * d]
{
  int num_tokens = input.numel() / input.size(-1);
  int d = input.size(-1) / 2;

  VLLM_DISPATCH_FLOATING_TYPES(
      input.scalar_type(), "silu_and_mul_impl", [&] {
        CPU_KERNEL_GUARD_IN(silu_and_mul_impl)
        activation_kernel<scalar_t, silu_act, true>(num_tokens, d,
                                                    input.data_ptr<scalar_t>(),
                                                    out.data_ptr<scalar_t>());
        CPU_KERNEL_GUARD_OUT(silu_and_mul_impl)
      });
}

void gelu_and_mul(torch::Tensor &out,   // [..., d]
                  torch::Tensor &input) // [..., 2 * d]
{
  int num_tokens = input.numel() / input.size(-1);
  int d = input.size(-1) / 2;

  VLLM_DISPATCH_FLOATING_TYPES(
      input.scalar_type(), "gelu_and_mul_impl", [&] {
        CPU_KERNEL_GUARD_IN(gelu_and_mul_impl)
        activation_kernel<scalar_t, gelu_act, true>(num_tokens, d,
                                                    input.data_ptr<scalar_t>(),
                                                    out.data_ptr<scalar_t>());
        CPU_KERNEL_GUARD_OUT(gelu_and_mul_impl)
      });
}

void gelu_tanh_and_mul(torch::Tensor &out,   // [..., d]
                       torch::Tensor &input) // [..., 2 * d]
{
  int num_tokens = input.numel() / input.size(-1);
  int d = input.size(-1) / 2;

  VLLM_DISPATCH_FLOATING_TYPES(
      input.scalar_type(), "gelu_tanh_and_mul_impl", [&] {
        CPU_KERNEL_GUARD_IN(gelu_tanh_and_mul_impl)
        activation_kernel<scalar_t, gelu_tanh_act, true>(
            num_tokens, d, input.data_ptr<scalar_t>(),
            out.data_ptr<scalar_t>());
        CPU_KERNEL_GUARD_OUT(gelu_tanh_and_mul_impl)
      });
}

void gelu_new(torch::Tensor &out, torch::Tensor &input) {
  int num_tokens = input.numel() / input.size(-1);
  int d = input.size(-1);

  VLLM_DISPATCH_FLOATING_TYPES(input.scalar_type(), "gelu_new_impl", [&] {
    CPU_KERNEL_GUARD_IN(gelu_new_impl)
    activation_kernel<scalar_t, gelu_new_act, false>(
        num_tokens, d, input.data_ptr<scalar_t>(), out.data_ptr<scalar_t>());
    CPU_KERNEL_GUARD_OUT(gelu_new_impl)
  });
}

void gelu_fast(torch::Tensor &out, torch::Tensor &input) {
  int num_tokens = input.numel() / input.size(-1);
  int d = input.size(-1);

  VLLM_DISPATCH_FLOATING_TYPES(input.scalar_type(), "gelu_fast_impl", [&] {
    CPU_KERNEL_GUARD_IN(gelu_fast_impl)
    activation_kernel<scalar_t, gelu_fast_act, false>(
        num_tokens, d, input.data_ptr<scalar_t>(), out.data_ptr<scalar_t>());
    CPU_KERNEL_GUARD_OUT(gelu_fast_impl)
  });
}