This repository was archived by the owner on Sep 23, 2024. It is now read-only.
56 changes: 56 additions & 0 deletions .gitignore
@@ -0,0 +1,56 @@
# Compiled Object files
**/.DS_Store
*.slo
*.lo
*.o
*.obj

# Precompiled Headers
*.gch
*.pch

# Compiled Dynamic libraries
*.so
*.dylib
*.dll

# Fortran module files
*.mod
*.smod

# Compiled Static libraries
*.lai
*.la
*.a
*.lib

# Executables
*.exe
*.out
*.app

**/cmake-build-debug
**/CMakeCache.txt
**/cmake_install.cmake
**/install_manifest.txt
**/CMakeFiles/
**/CTestTestfile.cmake
**/Makefile
**/*.cbp
**/CMakeScripts
**/compile_commands.json


## Local

build/**/*
**/build/**/*
out/*
lib/*
bin/*
test/test_runner
.vs
.cache
__pycache__
dist
*.egg-info
65 changes: 65 additions & 0 deletions CMakeLists.txt
@@ -0,0 +1,65 @@
# Copyright (C) 2018-2023 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
#

cmake_minimum_required(VERSION 3.13)

project(root)

set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
option(CPU_EXTENSIONS_BUILD_TESTS "Build with tests" ON)

message(STATUS "--------------------------------")
message(STATUS "Build with tests: ${CPU_EXTENSIONS_BUILD_TESTS}")
message(STATUS "--------------------------------")

set(CMAKE_CXX_STANDARD 17)
if(MSVC)
# TODO: validate
if(MSVC_VERSION VERSION_LESS 1928)
message(FATAL_ERROR "Insufficient msvc compiler version, current ${MSVC_VERSION}, minimum 1928.")
endif()
# Force to always compile with W4
if(CMAKE_CXX_FLAGS MATCHES "/W[0-4]")
string(REGEX REPLACE "/W[0-4]" "/W4" CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}")
else()
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /W4")
endif()
elseif(CMAKE_COMPILER_IS_GNUCC OR CMAKE_COMPILER_IS_GNUCXX)
if(CMAKE_CXX_COMPILER_VERSION VERSION_LESS "11.2")
message(FATAL_ERROR "Insufficient gcc compiler version, current ${CMAKE_CXX_COMPILER_VERSION}, minimum 11.2.")
endif()
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=sapphirerapids -flax-vector-conversions")
elseif(OV_COMPILER_IS_CLANG)
if(CMAKE_CXX_COMPILER_VERSION VERSION_LESS "12")
message(FATAL_ERROR "Insufficient clang compiler version, current ${CMAKE_CXX_COMPILER_VERSION}, minimum 12.")
endif()
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=sapphirerapids -flax-vector-conversions")
elseif(CMAKE_CXX_COMPILER_ID STREQUAL "IntelLLVM")
if(CMAKE_CXX_COMPILER_VERSION VERSION_LESS "2023.0")
message(FATAL_ERROR "Insufficient intel compiler version, current ${CMAKE_CXX_COMPILER_VERSION}, minimum 2023.0.")
endif()
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=sapphirerapids")
endif()

if (NOT CMAKE_RUNTIME_OUTPUT_DIRECTORY)
set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib)
set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib)
set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin)
endif()
add_subdirectory(src)
if (CPU_EXTENSIONS_BUILD_TESTS)
add_subdirectory(tests)
endif()

# Get the latest commit hash
execute_process(
COMMAND git rev-parse HEAD
WORKING_DIRECTORY ${CMAKE_CURRENT_LIST_DIR}
OUTPUT_VARIABLE GIT_HASH
OUTPUT_STRIP_TRAILING_WHITESPACE
)
file(WRITE ${CMAKE_BINARY_DIR}/git-state.txt ${GIT_HASH})
install(FILES
${CMAKE_BINARY_DIR}/git-state.txt
DESTINATION ${CMAKE_INSTALL_PREFIX})
9 changes: 9 additions & 0 deletions README.md
@@ -0,0 +1,9 @@
# About CPU_Extensions
CPU_Extensions is a compute library containing processor-optimized kernel code.

# Unit tests for CPU_Extensions
## Tests for kernels
Tests for kernels are written with gtest under tests/src; use ./cpu_extensions_tests to run them.
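
Below is a minimal sketch of what such a kernel test could look like, assuming gtest and the llm_mm.hpp interface added later in this change; it is illustrative only, and the precision fields are left at their defaults because the data_type_t value names live in llm_types.hpp, which is not shown here.

```cpp
// Hypothetical kernel-test sketch in the style of tests/src; not an actual test.
#include <gtest/gtest.h>
#include "llm_mm.hpp"

TEST(MMKernelTest, CreateAndDestroy) {
    llmdnn::mm_create_param param{};  // dt_a/dt_b would be set from llm_types.hpp values
    llmdnn::mm_kernel* kernel = nullptr;
    // Creation is expected to fail for unsupported precision combinations.
    if (llmdnn::mm_kernel_create(&kernel, &param)) {
        llmdnn::mm_kernel_destroy(kernel);
    }
}
```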

## Tests for complex features
Some features involve many steps, and their references cannot easily be written with gtest. For these features, Python is used to generate the reference. The directory tests/script contains these tests; please refer to [test in python](./tests/script/README.md).
59 changes: 59 additions & 0 deletions include/llm_emb_gpt.hpp
@@ -0,0 +1,59 @@
// Copyright (C) 2018-2023 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#pragma once

#include <memory>
#include <string>
#include <vector>
#include "llm_types.hpp"

namespace llmdnn {

class emb_gpt {
public:
struct create_param {
size_t num_heads;
size_t head_size;
size_t head_size_aligned; // better aligned to 64 bytes for best performance; applies to q/k/v
// supported (qkv, dst): (bf16, bf16)
data_type_t qkv_precision;
data_type_t dst_precision;
size_t rotary_dims;
bool use_position2d; // true for chatglm, false for others
};
struct exec_param {
size_t batch;
size_t query_seq_len;
size_t past_seq_len;
uint8_t* q; // shape: [batch, query_seq_len, hidden size], inner stride is ldq
uint8_t* k; // shape: [batch, query_seq_len, hidden size], inner stride is ldk
uint8_t* v; // shape: [batch, query_seq_len, hidden size], inner stride is ldv
size_t ldq; // inner stride of q
size_t ldk; // inner stride of k
size_t ldv; // inner stride of v
uint8_t* query_dst; // rotary embedding dst
uint8_t** layer_past_key_src; // past key src
uint8_t** layer_past_value_src; // past value src
uint8_t** layer_past_key_dst; // past key dst; if layer_past_key_src != layer_past_key_dst, layer_past_key_src is copied to layer_past_key_dst
uint8_t** layer_past_value_dst; // past value dst; if layer_past_value_src != layer_past_value_dst, layer_past_value_src is copied to layer_past_value_dst
float* cos; // cos lookup table, shape: [max_seq_len, rotary_dims]
float* sin; // sin lookup table, shape: [max_seq_len, rotary_dims]
int* position2d_ids; // shape: [batch, 2, query_seq_len]
size_t head_stride_in_kv; // kv stride to the next head; kv may be preallocated as one big buffer
};

emb_gpt();
bool create(const create_param& param);
void exec(const exec_param& param);

struct impl {
virtual bool create(const create_param& param) = 0;
virtual void exec(const exec_param& param) = 0;
};
protected:
std::shared_ptr<impl> _impl;
};

}
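
As a rough illustration of how this interface is meant to be driven, here is a usage sketch; the numeric sizes are arbitrary, and the qkv/dst precision fields are left unset because their data_type_t value names come from llm_types.hpp, which is not shown here.

```cpp
// Hypothetical usage sketch for emb_gpt; buffer allocation and the cos/sin
// lookup tables are assumed to be prepared by the caller.
#include "llm_emb_gpt.hpp"

bool run_rotary_embedding(llmdnn::emb_gpt& emb,
                          const llmdnn::emb_gpt::exec_param& exec) {
    llmdnn::emb_gpt::create_param param{};
    param.num_heads = 16;            // arbitrary example sizes
    param.head_size = 64;
    param.head_size_aligned = 64;    // aligned for best performance
    param.rotary_dims = 64;
    param.use_position2d = false;    // true only for chatglm-style 2D position ids
    // param.qkv_precision / param.dst_precision must be set to the bf16
    // values defined in llm_types.hpp (value names not shown here).

    if (!emb.create(param))
        return false;                // unsupported configuration
    emb.exec(exec);                  // caller fills q/k/v pointers, strides and tables
    return true;
}
```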
74 changes: 74 additions & 0 deletions include/llm_fc.hpp
@@ -0,0 +1,74 @@
// Copyright (C) 2018-2023 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#pragma once

#include "llm_types.hpp"

namespace llmdnn {

typedef enum {
NONE = 0,
DEQUANT = 1 << 0,
BIAS = 1 << 1,
GELU_ERF = 1 << 2,
GELU_TANH = 1 << 3,
QUANT = 1 << 4,
GELU = GELU_ERF, // default is ERF

BIAS_GELU = BIAS | GELU,
DEQUANT_BIAS_GELU = DEQUANT | BIAS_GELU,
DEQUANT_BIAS_GELU_QUANT = DEQUANT_BIAS_GELU | QUANT,
DEQUANT_BIAS_QUANT = DEQUANT | BIAS | QUANT,
DEQUANT_GELU_QUANT = DEQUANT | GELU | QUANT,
DEQUANT_QUANT = DEQUANT | QUANT,

DEQUANT_GELU = DEQUANT | GELU,
DEQUANT_BIAS = DEQUANT | BIAS,

BIAS_GELU_TANH = BIAS | GELU_TANH,
DEQUANT_BIAS_GELU_TANH = DEQUANT | BIAS_GELU_TANH,
DEQUANT_BIAS_GELU_TANH_QUANT = DEQUANT_BIAS_GELU_TANH | QUANT,
DEQUANT_GELU_TANH_QUANT = DEQUANT | GELU_TANH | QUANT,

DEQUANT_GELU_TANH = DEQUANT | GELU_TANH,
} postops_types;

struct fc_create_param {
data_type_t dt_a;
data_type_t dt_b;
data_type_t dt_c;
bool b_is_trans;
postops_types postops_type;
// for weight compression
float q;
float dq;
};

struct fc_kernel;

/// Generates an fc kernel based on param
///
/// @param mm Output kernel
/// @param param kernel parameters, supported:
/// fc: (s8,s8,s8),dq,[bias],[gelu],q
/// fc: (s8,s8,bf16),dq,[bias],[gelu]
/// fc: (s8,s8,f32),dq,[bias],[gelu]
/// fc: (bf16,bf16,bf16),[bias],[gelu]
/// fc: (bf16,bf16,f32),[bias],[gelu]
/// fc: (bf16,s8,f32),dq,[bias],[gelu]
/// fc: (bf16,s8,bf16),dq,[bias],[gelu]
///
bool fc_kernel_create(fc_kernel** mm, const fc_create_param* param);
void fc_kernel_destroy(const fc_kernel* mm);
void fc_kernel_execute(const fc_kernel* mm,
void* ptr_a, void* ptr_b, void* ptr_c, size_t lda, size_t ldb, size_t ldc,
size_t M, size_t N, size_t K, size_t n_start, size_t n_end,
float* dq=nullptr, float* q=nullptr, float* bias=nullptr);

/// weight compression
/// compute weight min/max once, set q, dq for each fc_kernel instance
void fc_kernel_bf16w8_get_q_dq(size_t K, size_t N, size_t stride, void* ptr, float* q, float* dq);

}
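
A hedged usage sketch for the fc kernel API above, showing the create/execute/destroy flow with a fused bias+gelu post-op; buffers, strides and precisions are assumed to be supplied by the caller, and the dt_* assignments are omitted because the data_type_t value names are defined in llm_types.hpp (not shown here).

```cpp
// Hypothetical fc kernel usage sketch; precision fields are left to the caller.
#include <cstddef>
#include "llm_fc.hpp"

bool run_fc(void* a, void* b, void* c, float* bias,
            size_t M, size_t N, size_t K,
            size_t lda, size_t ldb, size_t ldc) {
    llmdnn::fc_create_param param{};
    // param.dt_a / dt_b / dt_c must be set to values from llm_types.hpp,
    // e.g. a (bf16,bf16,bf16) combination from the supported list above.
    param.b_is_trans = false;
    param.postops_type = llmdnn::BIAS_GELU;   // fused bias + erf-gelu

    llmdnn::fc_kernel* kernel = nullptr;
    if (!llmdnn::fc_kernel_create(&kernel, &param))
        return false;                         // unsupported (dt_a, dt_b, dt_c) combination
    // One call covers the whole N range; n_start/n_end presumably allow
    // splitting the N range across threads.
    llmdnn::fc_kernel_execute(kernel, a, b, c, lda, ldb, ldc,
                              M, N, K, /*n_start=*/0, /*n_end=*/N,
                              /*dq=*/nullptr, /*q=*/nullptr, bias);
    llmdnn::fc_kernel_destroy(kernel);
    return true;
}
```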
90 changes: 90 additions & 0 deletions include/llm_mha_gpt.hpp
@@ -0,0 +1,90 @@
// Copyright (C) 2018-2023 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#pragma once

#include <memory>
#include <string>
#include <vector>
#include "llm_types.hpp"

namespace llmdnn {

// pattern is:
// query:[batch, num_heads, query_seq_len, head_size] key:[batch, num_heads, key_seq_len, head_size]
// \ |
// \ Transpose0: [batch, num_heads, head_size, key_seq_len]
// \ /
// \ /
// \ /
// MatMul0: [batch, num_heads, query_seq_len, key_seq_len]
// |
// | norm_factor(const): [1]
// | /
// Multiply: [batch, num_heads, query_seq_len, key_seq_len]
// |
// | causal_mask: [1, 1, query_seq_len, key_seq_len]
// | /
// Select(only for 1x300): [batch, num_heads, query_seq_len, key_seq_len]
// |
// | attention_mask:[batch, 1, 1, key_seq_len]
// | /
// Add: [batch, num_heads, query_seq_len, key_seq_len]
// |
// SoftMax: [batch, num_heads, query_seq_len, key_seq_len]
// |
// \ value:[batch, num_heads, key_seq_len, head_size]
// \ /
// MatMul1: [batch, num_heads, query_seq_len, head_size]
// |
// Transpose1(only for 1x300): [batch, query_seq_len, num_heads * head_size]
class mha_gpt {
public:
struct create_param {
size_t num_heads;
size_t head_size;
size_t head_size_aligned; // better aligned to 64 bytes for best performance; applies to q/k/v
size_t max_seq_len; // max seq length for computing the size of matmul tmp result
float normal_factor;
// supported (qkv, dst): (bf16, bf16), (s8, s8)
data_type_t qkv_precision;
data_type_t dst_precision;
};
struct exec_param {
size_t batch;
size_t query_seq_len;
size_t key_seq_len;
bool is_causal_in_attention; // the causal mask is fused into the attention mask; chatglm uses this.
uint8_t* q; // q buffer, compact, shape: [batch, num_heads, query_seq_len, head_size]
uint8_t** k; // k buffer; k[i] points to a different batch, which may be non-contiguous in memory
// k[0] shape: [batch, num_heads, key_seq_len, head_size]
uint8_t** v; // v buffer; v[i] points to a different batch, which may be non-contiguous in memory
// v[0] shape: [batch, num_heads, value_seq_len, head_size]
float* attention_mask; // attention mask, attention_mask[0] shape:
// [batch, 1, 1, key_seq_len], when is_causal_in_attention is false
// [batch, 1, query_seq_len, key_seq_len], when is_causal_in_attention is true
uint8_t* attn_output; // output, compact, shape: [batch, query_seq_len, num_heads * head_size]
size_t head_stride_in_kv; // kv stride to the next head; kv may be preallocated as one big buffer
// expected quantization scheme:
// q,k,v use per tensor quant, attn_output may use per tensor/channel quant
float q_dequant;
float k_dequant;
float v_dequant;
float qk_quant;
std::vector<float> qkv_quant; // size==1 per tensor, size==head_size per channel
};

mha_gpt();
bool create(const create_param& param);
void exec(const exec_param& param);

struct impl {
virtual bool create(const create_param& param) = 0;
virtual void exec(const exec_param& param) = 0;
};
protected:
std::shared_ptr<impl> _impl;
};

}
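
A rough usage sketch for mha_gpt following the pattern above; the buffers in exec_param are assumed to be prepared by the caller, the 1/sqrt(head_size) scaling is the usual attention norm factor (an assumption here), and the precision fields are left unset because their data_type_t value names come from llm_types.hpp (not shown here).

```cpp
// Hypothetical mha_gpt usage sketch; all buffers are assumed to be prepared by the caller.
#include <cmath>
#include "llm_mha_gpt.hpp"

bool run_attention(llmdnn::mha_gpt& mha,
                   const llmdnn::mha_gpt::exec_param& exec,
                   size_t num_heads, size_t head_size, size_t max_seq_len) {
    llmdnn::mha_gpt::create_param param{};
    param.num_heads = num_heads;
    param.head_size = head_size;
    param.head_size_aligned = head_size;   // ideally aligned to 64 bytes
    param.max_seq_len = max_seq_len;       // sizes the temporary matmul result
    param.normal_factor = 1.0f / std::sqrt(static_cast<float>(head_size));
    // param.qkv_precision / param.dst_precision must be set to (bf16, bf16) or
    // (s8, s8) values from llm_types.hpp (value names not shown here).

    if (!mha.create(param))
        return false;                      // unsupported configuration
    mha.exec(exec);   // caller fills q, k[], v[], attention_mask and attn_output
    return true;
}
```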
35 changes: 35 additions & 0 deletions include/llm_mm.hpp
@@ -0,0 +1,35 @@
// Copyright (C) 2018-2023 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#pragma once

#include "llm_types.hpp"

namespace llmdnn {

struct mm_create_param {
data_type_t dt_a;
data_type_t dt_b;
bool b_is_gemv; // true if matrix b is a vector; shapes: a[M,K], b[K,1], c[M,1]
bool b_is_trans;
};

struct mm_kernel;

/// Generates a mm kernel based on param
///
/// @param mm Output kernel
/// @param param kernel parameters, supported:
/// matmul: (u8/s8,s8,f32)
/// gemv: (s8,s8,f32)
/// matmul: (bf16,bf16,f32)
/// gemv: (bf16,bf16,f32)
///
bool mm_kernel_create(mm_kernel** mm, const mm_create_param* param);
void mm_kernel_destroy(const mm_kernel* mm);

void mm_kernel_execute(const mm_kernel* mm, void* ptr_a, void* ptr_b, void* ptr_c, size_t lda, size_t ldb, size_t ldc,
size_t M, size_t N, size_t K);

}
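
Finally, a brief sketch of the mm kernel flow declared above (create, execute, destroy); as with the other sketches, the dt_a/dt_b assignments are omitted because the data_type_t value names live in llm_types.hpp, which is not shown here.

```cpp
// Hypothetical mm kernel usage sketch; precision fields are left to the caller.
#include <cstddef>
#include "llm_mm.hpp"

bool run_matmul(void* a, void* b, void* c,
                size_t M, size_t N, size_t K,
                size_t lda, size_t ldb, size_t ldc) {
    llmdnn::mm_create_param param{};
    // param.dt_a / dt_b must be set to a supported pair, e.g. (bf16, bf16).
    param.b_is_gemv = false;   // plain matmul rather than the a[M,K], b[K,1] gemv case
    param.b_is_trans = false;

    llmdnn::mm_kernel* kernel = nullptr;
    if (!llmdnn::mm_kernel_create(&kernel, &param))
        return false;          // unsupported precision combination
    llmdnn::mm_kernel_execute(kernel, a, b, c, lda, ldb, ldc, M, N, K);
    llmdnn::mm_kernel_destroy(kernel);
    return true;
}
```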