This repository was archived by the owner on Sep 23, 2024. It is now read-only.
54 commits
ae336d5  init library (luo-cheng2021, Jun 28, 2023)
2f8cd88  strict N tail handling (luo-cheng2021, Jun 28, 2023)
2cacaab  apply review comments (luo-cheng2021, Jun 28, 2023)
a544031  add mha support (luo-cheng2021, Jun 28, 2023)
3f18a48  move to gcc11; add mha_gpt bf16 support (luo-cheng2021, Jun 28, 2023)
d2ea598  restructure directory layout (luo-cheng2021, Jun 28, 2023)
3cfc8de  use add_subdir to simplify cmake (luo-cheng2021, Jun 28, 2023)
2a64a5a  add clang12 support & apply review comments (luo-cheng2021, Jun 28, 2023)
4aa2441  mha int8 support (luo-cheng2021, Jun 28, 2023)
7ed062a  rotary bf16 gpt initial support (luo-cheng2021, Jun 28, 2023)
28ba2d5  add chatglm support (luo-cheng2021, Jun 28, 2023)
0590c6d  rotary embedding supports contiguous pastkv (luo-cheng2021, Jun 28, 2023)
67aabdc  fc support int8 weight compression (luo-cheng2021, Jun 28, 2023)
f7723dc  support K<32 (pad zero to 32) (luo-cheng2021, Jun 28, 2023)
ae5fc0c  import && rename (luo-cheng2021, Jun 28, 2023)
9725e59  workaround MatmulVector K<=6*32 (luo-cheng2021, Jun 30, 2023)
aac4116  export cmake target (luo-cheng2021, Jun 30, 2023)
323fa5f  rotary use external cos/sin lookup table (luo-cheng2021, Jul 3, 2023)
6d5971c  qkv of emb changes to q, k, v (luo-cheng2021, Jul 3, 2023)
a741a8b  gelu tanh support (luo-cheng2021, Jul 4, 2023)
eb427a6  add git commit id to package (luo-cheng2021, Jul 6, 2023)
16ea565  add rotary avx2 kernel (luo-cheng2021, Jul 6, 2023)
73f335d  add bloom mha support (luo-cheng2021, Jul 6, 2023)
48e01f4  use target_compile_options to change cxx flags (luo-cheng2021, Jul 10, 2023)
f906cdd  mha uses tensor to support different strides (luo-cheng2021, Jul 11, 2023)
6205ab7  remove chatglm dependency (luo-cheng2021, Jul 12, 2023)
27f324a  remove warning (luo-cheng2021, Jul 12, 2023)
03b7b37  remove shared_ptr from interface (luo-cheng2021, Jul 13, 2023)
106402c  refactor: emb/mha use tensor as input parameter (luo-cheng2021, Jul 13, 2023)
0db2ce8  wa gcc9 could not find 'std::__throw_bad_array_new_length()' (luo-cheng2021, Jul 15, 2023)
ebecf77  wa gcc 7.5 does not like newer stringstream (luo-cheng2021, Jul 17, 2023)
7872bad  use custom allocator for map (luo-cheng2021, Jul 19, 2023)
11833d8  support external causal mask[opt] (luo-cheng2021, Jul 24, 2023)
7beabd6  fix coverity scan errors (luo-cheng2021, Jul 24, 2023)
2c0ae5b  remove ov namespace (luo-cheng2021, Jul 25, 2023)
e8455b7  remove c++ global vars (luo-cheng2021, Jul 25, 2023)
31faf59  fix simple_parallel_for type (luo-cheng2021, Jul 26, 2023)
2326d60  optimize mha_gpt_impl_amx::create (usstq, Jul 27, 2023)
04b8483  add security.md and fix warnings (luo-cheng2021, Jul 27, 2023)
77ac0bc  apply review comments (luo-cheng2021, Jul 31, 2023)
6de45ce  apply review comments (luo-cheng2021, Aug 1, 2023)
ca79cd9  fc weight support f32, add weight pack api (luo-cheng2021, Aug 3, 2023)
cbfe9ff  opt f32 weight pack (luo-cheng2021, Aug 4, 2023)
3f31227  remove writeable bufferC (luo-cheng2021, Aug 10, 2023)
76d5f0f  use numa to alloc mem (luo-cheng2021, Aug 11, 2023)
2dd181a  add falcon broadcast support before rotary (luo-cheng2021, Aug 16, 2023)
3a7432c  fix int8 compress (luo-cheng2021, Aug 17, 2023)
4a4e48c  cache temp mem when K<32 (luo-cheng2021, Aug 18, 2023)
c29681c  add numa as the first level task partition basis (luo-cheng2021, Aug 29, 2023)
d30b5b5  dlopen libnuma, remove compilation phase dependency (luo-cheng2021, Aug 30, 2023)
0d7dac4  avoid usage of vector (luo-cheng2021, Aug 30, 2023)
581a3cf  support perchannel u8 compress weight (luo-cheng2021, Sep 15, 2023)
4f3ba52  opt for multi query (luo-cheng2021, Sep 18, 2023)
aa2c57b  fc M dimension splitting (luo-cheng2021, Sep 22, 2023)
56 changes: 56 additions & 0 deletions .gitignore
@@ -0,0 +1,56 @@
# Compiled Object files
**/.DS_Store
*.slo
*.lo
*.o
*.obj

# Precompiled Headers
*.gch
*.pch

# Compiled Dynamic libraries
*.so
*.dylib
*.dll

# Fortran module files
*.mod
*.smod

# Compiled Static libraries
*.lai
*.la
*.a
*.lib

# Executables
*.exe
*.out
*.app

**/cmake-build-debug
**/CMakeCache.txt
**/cmake_install.cmake
**/install_manifest.txt
**/CMakeFiles/
**/CTestTestfile.cmake
**/Makefile
**/*.cbp
**/CMakeScripts
**/compile_commands.json


## Local

build/**/*
**/build/**/*
out/*
lib/*
bin/*
test/test_runner
.vs
.cache
__pycache__
dist
*.egg-info
59 changes: 59 additions & 0 deletions CMakeLists.txt
@@ -0,0 +1,59 @@
# Copyright (C) 2018-2023 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
#

cmake_minimum_required(VERSION 3.13)

project(root)

set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
option(CPU_EXTENSIONS_BUILD_TESTS "Build with tests" ON)
option(CPU_EXTENSIONS_ENABLE_LOG "Enable log" ON)

message(STATUS "--------------------------------")
message(STATUS "Build with tests: ${CPU_EXTENSIONS_BUILD_TESTS}")
message(STATUS "--------------------------------")

if(MSVC)
# TODO: validate
if(MSVC_VERSION VERSION_LESS 1928)
message(FATAL_ERROR "Insufficient msvc compiler version, current ${MSVC_VERSION}, minimum 1928.")
endif()
elseif(CMAKE_COMPILER_IS_GNUCC OR CMAKE_COMPILER_IS_GNUCXX)
if(CMAKE_CXX_COMPILER_VERSION VERSION_LESS "11.2")
message(FATAL_ERROR "Insufficient gcc compiler version, current ${CMAKE_CXX_COMPILER_VERSION}, minimum 11.2.")
endif()
set(EXTRA_CXX_FLAGS -march=sapphirerapids -flax-vector-conversions)
elseif(OV_COMPILER_IS_CLANG)
if(CMAKE_CXX_COMPILER_VERSION VERSION_LESS "12")
message(FATAL_ERROR "Insufficient clang compiler version, current ${CMAKE_CXX_COMPILER_VERSION}, minimum 12.")
endif()
set(EXTRA_CXX_FLAGS -march=sapphirerapids -flax-vector-conversions)
elseif(CMAKE_CXX_COMPILER_ID STREQUAL "IntelLLVM")
if(CMAKE_CXX_COMPILER_VERSION VERSION_LESS "2023.0")
message(FATAL_ERROR "Insufficient intel compiler version, current ${CMAKE_CXX_COMPILER_VERSION}, minimum 2023.0.")
endif()
set(EXTRA_CXX_FLAGS -march=sapphirerapids)
endif()

if (NOT CMAKE_RUNTIME_OUTPUT_DIRECTORY)
set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib)
set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib)
set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin)
endif()
add_subdirectory(src)
if (CPU_EXTENSIONS_BUILD_TESTS)
add_subdirectory(tests)
endif()

# Get the latest commit hash
execute_process(
COMMAND git rev-parse HEAD
WORKING_DIRECTORY ${CMAKE_CURRENT_LIST_DIR}
OUTPUT_VARIABLE GIT_HASH
OUTPUT_STRIP_TRAILING_WHITESPACE
)
file(WRITE ${CMAKE_BINARY_DIR}/git-state.txt ${GIT_HASH})
install(FILES
${CMAKE_BINARY_DIR}/git-state.txt
DESTINATION ${CMAKE_INSTALL_PREFIX})
9 changes: 9 additions & 0 deletions README.md
@@ -0,0 +1,9 @@
# About CPU_Extensions
CPU_Extensions is a compute library containing processor-optimized kernel code.

# Unit tests for CPU_Extensions
## Tests for kernels
Tests for kernels are written with gtest under tests/src; run them with ./cpu_extensions_tests.

## Tests for complex features
Some features involve many steps, and their references cannot easily be written with gtest. For these features, Python is used to generate the reference. The directory tests/script contains these tests; see [test in python](./tests/script/README.md).
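
As a rough illustration (not an actual test from this PR), a kernel test under tests/src would follow the usual gtest pattern; the data_type_t enumerator names below are assumptions, since llm_types.hpp is not part of this diff:

```cpp
// Hypothetical sketch of a kernel test; data_type_t enumerator names are assumed.
#include <gtest/gtest.h>
#include "llm_mm.hpp"

TEST(mm_kernel, create_and_destroy) {
    llmdnn::mm_create_param param{};
    param.dt_a = llmdnn::data_type_t::bf16;  // assumed enumerator name
    param.dt_b = llmdnn::data_type_t::bf16;  // assumed enumerator name
    param.b_is_gemv = false;
    param.b_is_trans = false;

    llmdnn::mm_kernel* kernel = nullptr;
    // Returned status is not checked here: status_t enumerators are not shown in this diff.
    llmdnn::mm_kernel_create(&kernel, &param);
    EXPECT_NE(kernel, nullptr);
    llmdnn::mm_kernel_destroy(kernel);
}
```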
12 changes: 12 additions & 0 deletions SECURITY.md
@@ -0,0 +1,12 @@
# Security Policy

## Report a Vulnerability

Please report security issues or vulnerabilities to the [Intel® Security Center].

For more information on how Intel® works to resolve security issues, see
[Vulnerability Handling Guidelines].

[Intel® Security Center]:https://www.intel.com/security

[Vulnerability Handling Guidelines]:https://www.intel.com/content/www/us/en/security-center/vulnerability-handling-guidelines.html
31 changes: 31 additions & 0 deletions include/llm_emb_gpt.hpp
@@ -0,0 +1,31 @@
// Copyright (C) 2018-2023 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#pragma once

#include <memory>
#include <string>
#include <vector>
#include "llm_types.hpp"
#include "llm_tensor.hpp"

namespace llmdnn {

status_t emb_gpt(const tensor& q_src, // q shape: [batch, query_seq_len, head_num, head_size] or
// [batch, query_seq_len, num_kv_heads, head_num/num_kv_heads, head_size]
const tensor& k_src, // k shape: [batch, query_seq_len, head_num, head_size] or
// [batch, query_seq_len, num_kv_heads, 1, head_size]
const tensor& v_src, // v shape: [batch, query_seq_len, head_num, head_size] or
// [batch, query_seq_len, num_kv_heads, 1, head_size]
const tensor& k_past, // k_past shape: [batch, num_heads, past_seq_len, head_size]
const tensor& v_past, // v_past shape: [batch, num_heads, past_seq_len, head_size]
const tensor& q_dst, // q_dst, shape: [batch, num_heads, query_seq_len, head_size]
const tensor& k_dst, // k_dst shape: [batch, num_heads, query_seq_len+past_seq_len, head_size]
// if k_past != k_dst, k_past is copied into k_dst
const tensor& v_dst, // v_dst shape: [batch, num_heads, query_seq_len+past_seq_len, head_size]
const tensor& cos, // cos lookup table, shape: [1, 1, max_seq_len, rotary_dims]
const tensor& sin, // sin lookup table, shape: [1, 1, max_seq_len, rotary_dims]
const tensor& position2d_ids); // shape: [batch, 2, query_seq_len]

} // namespace llmdnn
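
For orientation only, the cos/sin lookup tables above imply a per-position rotary step roughly like the scalar reference below. This is not code from the library; the pairing convention (half-split here rather than interleaved) is an assumption that the header itself does not pin down.

```cpp
// Naive scalar reference of one rotary step for a single head vector.
// Assumes the half-split pairing convention (x[i] paired with x[i + rotary_dims/2]);
// the actual kernel may use a different pairing.
#include <cstddef>

void rotary_reference(float* x,             // one q or k head vector, length >= rotary_dims
                      const float* cos_row, // cos table row for this position, length rotary_dims
                      const float* sin_row, // sin table row for this position, length rotary_dims
                      size_t rotary_dims) {
    const size_t half = rotary_dims / 2;
    for (size_t i = 0; i < half; ++i) {
        const float x0 = x[i];
        const float x1 = x[i + half];
        x[i]        = x0 * cos_row[i]        - x1 * sin_row[i];
        x[i + half] = x1 * cos_row[i + half] + x0 * sin_row[i + half];
    }
}
```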
98 changes: 98 additions & 0 deletions include/llm_fc.hpp
@@ -0,0 +1,98 @@
// Copyright (C) 2018-2023 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#pragma once

#include "llm_types.hpp"
#include "llm_tensor.hpp"

namespace llmdnn {

typedef enum {
NONE = 0,
DEQUANT = 1 << 0,
BIAS = 1 << 1,
GELU_ERF = 1 << 2,
GELU_TANH = 1 << 3,
QUANT = 1 << 4,
GELU = GELU_ERF, // default is ERF

BIAS_GELU = BIAS | GELU,
DEQUANT_BIAS_GELU = DEQUANT | BIAS_GELU,
DEQUANT_BIAS_GELU_QUANT = DEQUANT_BIAS_GELU | QUANT,
DEQUANT_BIAS_QUANT = DEQUANT | BIAS | QUANT,
DEQUANT_GELU_QUANT = DEQUANT | GELU | QUANT,
DEQUANT_QUANT = DEQUANT | QUANT,

DEQUANT_GELU = DEQUANT | GELU,
DEQUANT_BIAS = DEQUANT | BIAS,

BIAS_GELU_TANH = BIAS | GELU_TANH,
DEQUANT_BIAS_GELU_TANH = DEQUANT | BIAS_GELU_TANH,
DEQUANT_BIAS_GELU_TANH_QUANT = DEQUANT_BIAS_GELU_TANH | QUANT,
DEQUANT_GELU_TANH_QUANT = DEQUANT | GELU_TANH | QUANT,

DEQUANT_GELU_TANH = DEQUANT | GELU_TANH,
} postops_types;

struct fc_create_param {
data_type_t dt_a;
data_type_t dt_b;
data_type_t dt_c;
bool b_is_trans;
postops_types postops_type;
// for weight compression
float* scale;
float* zp;
int scale_zp_size;
};

struct fc_kernel;

/// Generates an fc kernel based on param
///
/// @param mm Output kernel
/// @param param kernel parameters, supported:
/// fc: (s8,s8,s8),dq,[bias],[gelu],q
/// fc: (s8,s8,bf16),dq,[bias],[gelu]
/// fc: (s8,s8,f32),dq,[bias],[gelu]
/// fc: (bf16,bf16,bf16),[bias],[gelu]
/// fc: (bf16,bf16,f32),[bias],[gelu]
/// fc: (bf16,u8,f32),dq,[bias],[gelu]
/// fc: (bf16,u8,bf16),dq,[bias],[gelu]
///
status_t fc_kernel_create(fc_kernel** mm, const fc_create_param* param);
void fc_kernel_destroy(fc_kernel* mm);
// when fc_create_param.dt_b==bf16, the dt_b passed here may be bf16 or f32
// when fc_create_param.dt_b==u8, the dt_b passed here may be bf16 or f32
void fc_kernel_pack_weight(fc_kernel* mm, void* ptr_b, data_type_t dt_b, size_t N, size_t K, size_t stride_b, size_t n_start, size_t n_end);
void fc_kernel_pack_weight_to_dst(fc_kernel* mm, void* src_b, void* dst_b, data_type_t dt_b, size_t N, size_t K, size_t stride_b, size_t n_start, size_t n_end);
// ptr_b may be null if using fc_kernel_pack_weight to pack into internal buffer
// if ptr_b is not null, its layout is [N/32, 32*rndup(K,32|64)]
void fc_kernel_execute(fc_kernel* mm,
void* ptr_a, void* ptr_b, void* ptr_c, size_t stride_a, size_t stride_c,
size_t M, size_t N, size_t K, size_t n_start, size_t n_end,
float* dq=nullptr, float* q=nullptr, float* bias=nullptr);

/// Generates a fc based on param
class fc {
public:
fc();
~fc();

bool init(const fc_create_param& param);
void pack_weight(const tensor& w);
status_t exec(const tensor& input, const tensor& output, const tensor& dq, const tensor& q, const tensor& bias);

struct impl {
virtual ~impl() {}
virtual bool init(const fc_create_param& param) = 0;
virtual void pack_weight(const tensor& w) = 0;
virtual status_t exec(const tensor& input, const tensor& output, const tensor& dq, const tensor& q, const tensor& bias) = 0;
};
protected:
impl* _impl;
};

} // namespace llmdnn
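
A minimal usage sketch of the C-style fc_kernel API follows, for a bf16 in, bf16 weight, bf16 out configuration with BIAS_GELU post-ops. The data_type_t enumerator names and the byte-based stride convention are assumptions, since llm_types.hpp and the implementation are not part of this diff.

```cpp
// Hypothetical usage sketch of the fc_kernel API; not taken from the library's tests.
#include <cstdint>
#include "llm_fc.hpp"

using namespace llmdnn;

void run_fc_bf16(void* src, void* weight_bf16, void* dst,
                 size_t M, size_t N, size_t K, float* bias) {
    fc_create_param param{};
    param.dt_a = data_type_t::bf16;  // assumed enumerator name
    param.dt_b = data_type_t::bf16;  // assumed enumerator name
    param.dt_c = data_type_t::bf16;  // assumed enumerator name
    param.b_is_trans = false;
    param.postops_type = BIAS_GELU;  // bias followed by erf-based GELU

    fc_kernel* kernel = nullptr;
    // Returned status is not checked here: status_t enumerators are not shown in this diff.
    fc_kernel_create(&kernel, &param);
    if (!kernel) return;

    // Pack the weight into the kernel's internal buffer; an f32 weight could be
    // passed instead with dt_b = f32, per the comment on fc_kernel_pack_weight.
    fc_kernel_pack_weight(kernel, weight_bf16, data_type_t::bf16,
                          N, K, /*stride_b=*/N * sizeof(uint16_t), /*n_start=*/0, /*n_end=*/N);

    // ptr_b may be null because the weight was packed into the internal buffer.
    // Strides are assumed to be in bytes.
    fc_kernel_execute(kernel, src, /*ptr_b=*/nullptr, dst,
                      /*stride_a=*/K * sizeof(uint16_t), /*stride_c=*/N * sizeof(uint16_t),
                      M, N, K, /*n_start=*/0, /*n_end=*/N,
                      /*dq=*/nullptr, /*q=*/nullptr, bias);

    fc_kernel_destroy(kernel);
}
```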
53 changes: 53 additions & 0 deletions include/llm_mha_gpt.hpp
@@ -0,0 +1,53 @@
// Copyright (C) 2018-2023 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#pragma once

#include <memory>
#include <string>
#include <vector>
#include "llm_types.hpp"
#include "llm_tensor.hpp"

namespace llmdnn {

class mha_gpt {
public:
mha_gpt();
~mha_gpt();

status_t exec(const tensor& q, // q shape: [batch, num_heads, query_seq_len, head_size]
const tensor& k, // k shape: [batch, num_heads, key_seq_len, head_size]
const tensor& v, // v shape: [batch, num_heads, value_seq_len, head_size]
const tensor& output, // output, compact, shape: [batch, query_seq_len, num_heads * head_size]
const tensor& attn_mask, // attention mask[opt], shape:
// [batch, 1, 1, key_seq_len],
// [batch, 1, query_seq_len, key_seq_len]
const tensor& alibi, // alibi[opt] shape: [batch, num_heads, 1, key_seq_len]
const tensor& causal_mask, // causal mask[opt], u8; when provided, use_causal_mask must be false. shape:
// [1, 1, query_seq_len, key_seq_len]
// [batch, 1, query_seq_len, key_seq_len]
bool select_nfltmax_at_0, // used when causal_mask is not null. true: positions with causal_mask[i]==0 get -FLT_MAX
// false: positions with causal_mask[i]==1 get -FLT_MAX
float normal_factor,
bool use_causal_mask = false);// add causal mask

struct impl {
virtual ~impl() {}
virtual status_t exec(const tensor& q,
const tensor& k,
const tensor& v,
const tensor& output,
const tensor& attn_mask,
const tensor& alibi,
const tensor& causal_mask,
bool select_nfltmax_at_0,
float normal_factor,
bool use_causal_mask = false) = 0;
};
protected:
impl* _impl;
};

} // namespace llmdnn
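
To make the parameters above concrete, here is a naive single-batch, single-head reference of what exec computes: softmax(normal_factor * q * k^T + attn_mask [+ alibi]) * v, with an optional causal restriction. This is an illustration written for this description, not library code; it omits batch/head loops and the causal_mask/select_nfltmax_at_0 path.

```cpp
// Naive reference of one attention head; alibi would be added to the scores the
// same way as attn_mask. Not taken from the library.
#include <algorithm>
#include <cfloat>
#include <cmath>
#include <cstddef>
#include <vector>

void mha_reference(const float* q,         // [q_len, head_size]
                   const float* k,         // [k_len, head_size]
                   const float* v,         // [k_len, head_size]
                   const float* attn_mask, // [k_len] additive mask, may be null
                   float* out,             // [q_len, head_size]
                   size_t q_len, size_t k_len, size_t head_size,
                   float normal_factor, bool use_causal_mask) {
    std::vector<float> score(k_len);
    for (size_t i = 0; i < q_len; ++i) {
        // scaled dot-product scores with optional additive mask / causal clipping
        float max_score = -FLT_MAX;
        for (size_t j = 0; j < k_len; ++j) {
            float s = 0.f;
            for (size_t d = 0; d < head_size; ++d)
                s += q[i * head_size + d] * k[j * head_size + d];
            s *= normal_factor;
            if (attn_mask) s += attn_mask[j];
            if (use_causal_mask && j > i + (k_len - q_len)) s = -FLT_MAX;
            score[j] = s;
            max_score = std::max(max_score, s);
        }
        // softmax over the key dimension
        float sum = 0.f;
        for (size_t j = 0; j < k_len; ++j) {
            score[j] = std::exp(score[j] - max_score);
            sum += score[j];
        }
        // weighted sum of values
        for (size_t d = 0; d < head_size; ++d) {
            float acc = 0.f;
            for (size_t j = 0; j < k_len; ++j)
                acc += (score[j] / sum) * v[j * head_size + d];
            out[i * head_size + d] = acc;
        }
    }
}
```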
35 changes: 35 additions & 0 deletions include/llm_mm.hpp
@@ -0,0 +1,35 @@
// Copyright (C) 2018-2023 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#pragma once

#include "llm_types.hpp"

namespace llmdnn {

struct mm_create_param {
data_type_t dt_a;
data_type_t dt_b;
bool b_is_gemv; // true if matrix b is vector. Shape: a[M,K], b[K,1], c[M,1]
bool b_is_trans;
};

struct mm_kernel;

/// Generates a mm kernel based on param
///
/// @param mm Output kernel
/// @param param kernel parameters, supported:
/// matmul: (u8/s8,s8,f32)
/// gemv: (s8,s8,f32)
/// matmul: (bf16,bf16,f32)
/// gemv: (bf16,bf16,f32)
///
status_t mm_kernel_create(mm_kernel** mm, const mm_create_param* param);
void mm_kernel_destroy(const mm_kernel* mm);

status_t mm_kernel_execute(const mm_kernel* mm, void* ptr_a, void* ptr_b, void* ptr_c, size_t lda, size_t ldb, size_t ldc,
size_t M, size_t N, size_t K);

} // namespace llmdnn
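
A minimal usage sketch of this API, for a (bf16, bf16) -> f32 matmul, might look like the following. The data_type_t enumerator names and the assumption that lda/ldb/ldc are byte strides are mine, not the header's.

```cpp
// Hypothetical usage sketch of the mm_kernel API; not taken from the library's tests.
#include <cstdint>
#include "llm_mm.hpp"

using namespace llmdnn;

void run_matmul_bf16(void* a, void* b, float* c, size_t M, size_t N, size_t K) {
    mm_create_param param{};
    param.dt_a = data_type_t::bf16;  // assumed enumerator name
    param.dt_b = data_type_t::bf16;  // assumed enumerator name
    param.b_is_gemv = false;         // full matmul, not the gemv path
    param.b_is_trans = false;

    mm_kernel* kernel = nullptr;
    // Returned status is not checked here: status_t enumerators are not shown in this diff.
    mm_kernel_create(&kernel, &param);
    if (!kernel) return;

    // Leading dimensions assumed to be in bytes.
    mm_kernel_execute(kernel, a, b, c,
                      /*lda=*/K * sizeof(uint16_t),
                      /*ldb=*/N * sizeof(uint16_t),
                      /*ldc=*/N * sizeof(float),
                      M, N, K);

    mm_kernel_destroy(kernel);
}
```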