
Commit d9cd982

xctang authored and ggerganov committed
ggml-cpu : split arch-specific implementations (ggml-org#13892)
* move ggml-cpu-aarch64 to repack
* split quantize_row_q8_0/1
* split helper functions
* split ggml_vec_dot_q4_0_q8_0
* split ggml_vec_dot_q4_1_q8_1
* split ggml_vec_dot_q5_0_q8_0
* split ggml_vec_dot_q5_1_q8_1
* split ggml_vec_dot_q8_0_q8_0
* split ggml_vec_dot_tq1_0_q8_K
* split ggml_vec_dot_tq2_0_q8_K
* split ggml_vec_dot_q2_K_q8_K
* split ggml_vec_dot_q3_K_q8_K
* split ggml_vec_dot_q4_K_q8_K
* split ggml_vec_dot_q5_K_q8_K
* split ggml_vec_dot_q6_K_q8_K
* split ggml_vec_dot_iq2_xxs_q8_K
* split ggml_vec_dot_iq2_xs_q8_K
* split ggml_vec_dot_iq2_s_q8_K
* split ggml_vec_dot_iq3_xxs_q8_K
* split ggml_vec_dot_iq3_s_q8_K
* split ggml_vec_dot_iq1_s_q8_K
* split ggml_vec_dot_iq1_m_q8_K
* split ggml_vec_dot_iq4_nl_q8_0
* split ggml_vec_dot_iq4_xs_q8_K
* fix typos
* fix missing prototypes
* rename ggml-cpu-quants.c
* rename ggml-cpu-traits
* rename arm folder
* move cpu-feats-x86.cpp
* rename ggml-cpu-hbm
* update arm detection macro in quants.c
* move iq quant tables
* split ggml_quantize_mat_q8_0/K
* split ggml_gemv_*
* split ggml_gemm_*
* rename namespace aarch64 to repack
* use weak aliases to replace test macros
* rename GGML_CPU_AARCH64 to GGML_CPU_REPACK
* rename more aarch64 to repack
* clean up rebase leftover
* fix compilation errors
* remove trailing spaces
* try to fix clang compilation errors
* try to fix clang compilation errors again
* try to fix clang compilation errors, 3rd attempt
* try to fix clang compilation errors, 4th attempt
* try to fix clang compilation errors, 5th attempt
* try to fix clang compilation errors, 6th attempt
* try to fix clang compilation errors, 7th attempt
* try to fix clang compilation errors, 8th attempt
* try to fix clang compilation errors, 9th attempt
* more cleanup
* fix compilation errors
* fix apple targets
* fix a typo in arm version of ggml_vec_dot_q4_K_q8_K

Co-authored-by: Georgi Gerganov <[email protected]>

---------

Co-authored-by: Georgi Gerganov <[email protected]>
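The "use weak aliases to replace test macros" step above refers to keeping a generic C fallback for each kernel and letting an arch-specific source override it at link time, instead of remapping symbol names with preprocessor macros. Below is a minimal, self-contained sketch of that pattern; the function names are hypothetical and the attribute syntax assumes GCC/Clang on an ELF target, so treat it as an illustration of the mechanism rather than the exact macros used in ggml.

```c
// weak_alias_sketch.c -- illustrative only; names are hypothetical.
// Build: cc -O2 weak_alias_sketch.c -o weak_alias_sketch
#include <stdio.h>

// Generic scalar fallback, compiled into every CPU-backend variant.
void vec_dot_generic(int n, const float * x, const float * y, float * s) {
    float sum = 0.0f;
    for (int i = 0; i < n; ++i) {
        sum += x[i] * y[i];
    }
    *s = sum;
}

// Publish the generic implementation under the public name as a weak alias.
// An arch-specific translation unit (built with e.g. -march=armv8.2-a+dotprod)
// can provide a strong definition of vec_dot and silently win at link time.
void vec_dot(int n, const float * x, const float * y, float * s)
    __attribute__((weak, alias("vec_dot_generic")));

int main(void) {
    const float x[4] = {1, 2, 3, 4};
    const float y[4] = {1, 1, 1, 1};
    float s = 0.0f;
    vec_dot(4, x, y, &s);       // resolves to vec_dot_generic here
    printf("dot = %.1f\n", s);  // prints 10.0
    return 0;
}
```

Presumably the same idea carries over to the split gemv/gemm and vec_dot kernels: each arch folder only needs to define the symbols it actually accelerates, and everything else falls back to the generic code.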
1 parent 1404858 · commit d9cd982

17 files changed (+1669, -1662 lines)


ggml/src/ggml-cpu/CMakeLists.txt

Lines changed: 14 additions & 94 deletions
@@ -1,17 +1,3 @@
-function(ggml_add_cpu_backend_features cpu_name arch)
-    # The feature detection code is compiled as a separate target so that
-    # it can be built without the architecture flags
-    # Since multiple variants of the CPU backend may be included in the same
-    # build, using set_source_files_properties() to set the arch flags is not possible
-    set(GGML_CPU_FEATS_NAME ${cpu_name}-feats)
-    add_library(${GGML_CPU_FEATS_NAME} OBJECT ggml-cpu/arch/${arch}/cpu-feats.cpp)
-    target_include_directories(${GGML_CPU_FEATS_NAME} PRIVATE . ../include)
-    target_compile_definitions(${GGML_CPU_FEATS_NAME} PRIVATE ${ARGN})
-    target_compile_definitions(${GGML_CPU_FEATS_NAME} PRIVATE GGML_BACKEND_DL GGML_BACKEND_BUILD GGML_BACKEND_SHARED)
-    set_target_properties(${GGML_CPU_FEATS_NAME} PROPERTIES POSITION_INDEPENDENT_CODE ON)
-    target_link_libraries(${cpu_name} PRIVATE ${GGML_CPU_FEATS_NAME})
-endfunction()
-
 function(ggml_add_cpu_backend_variant_impl tag_name)
     if (tag_name)
         set(GGML_CPU_NAME ggml-cpu-${tag_name})
@@ -157,49 +143,6 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
         else()
             if (GGML_CPU_ARM_ARCH)
                 list(APPEND ARCH_FLAGS -march=${GGML_CPU_ARM_ARCH})
-            elseif(GGML_CPU_ALL_VARIANTS)
-                # Begin with the lowest baseline
-                set(ARM_MCPU "armv8-a")
-                set(ARCH_TAGS "")
-                set(ARCH_DEFINITIONS "")
-
-                # When a feature is selected, bump the MCPU to the first
-                # version that supported it
-                if (GGML_INTERNAL_DOTPROD)
-                    set(ARM_MCPU "armv8.2-a")
-                    set(ARCH_TAGS "${ARCH_TAGS}+dotprod")
-                    list(APPEND ARCH_DEFINITIONS GGML_USE_DOTPROD)
-                endif()
-                if (GGML_INTERNAL_FP16_VECTOR_ARITHMETIC)
-                    set(ARM_MCPU "armv8.2-a")
-                    set(ARCH_TAGS "${ARCH_TAGS}+fp16")
-                    list(APPEND ARCH_DEFINITIONS GGML_USE_FP16_VECTOR_ARITHMETIC)
-                endif()
-                if (GGML_INTERNAL_SVE)
-                    set(ARM_MCPU "armv8.2-a")
-                    set(ARCH_TAGS "${ARCH_TAGS}+sve")
-                    list(APPEND ARCH_DEFINITIONS GGML_USE_SVE)
-                endif()
-                if (GGML_INTERNAL_MATMUL_INT8)
-                    set(ARM_MCPU "armv8.6-a")
-                    set(ARCH_TAGS "${ARCH_TAGS}+i8mm")
-                    list(APPEND ARCH_DEFINITIONS GGML_USE_MATMUL_INT8)
-                endif()
-                if (GGML_INTERNAL_SVE2)
-                    set(ARM_MCPU "armv8.6-a")
-                    set(ARCH_TAGS "${ARCH_TAGS}+sve2")
-                    list(APPEND ARCH_DEFINITIONS GGML_USE_SVE2)
-                endif()
-                if (GGML_INTERNAL_NOSVE)
-                    set(ARCH_TAGS "${ARCH_TAGS}+nosve")
-                endif()
-                if (GGML_INTERNAL_SME)
-                    set(ARM_MCPU "armv9.2-a")
-                    set(ARCH_TAGS "${ARCH_TAGS}+sme")
-                    list(APPEND ARCH_DEFINITIONS GGML_USE_SME)
-                endif()
-                list(APPEND ARCH_FLAGS "-march=${ARM_MCPU}${ARCH_TAGS}")
-                ggml_add_cpu_backend_features(${GGML_CPU_NAME} arm ${ARCH_DEFINITIONS})
             endif()
         endif()

@@ -363,7 +306,18 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
                 # the feature check relies on ARCH_DEFINITIONS, but it is not set with GGML_NATIVE
                 message(FATAL_ERROR "GGML_NATIVE is not compatible with GGML_BACKEND_DL, consider using GGML_CPU_ALL_VARIANTS")
             endif()
-            ggml_add_cpu_backend_features(${GGML_CPU_NAME} x86 ${ARCH_DEFINITIONS})
+
+            # The feature detection code is compiled as a separate target so that
+            # it can be built without the architecture flags
+            # Since multiple variants of the CPU backend may be included in the same
+            # build, using set_source_files_properties() to set the arch flags is not possible
+            set(GGML_CPU_FEATS_NAME ${GGML_CPU_NAME}-feats)
+            add_library(${GGML_CPU_FEATS_NAME} OBJECT ggml-cpu/arch/x86/cpu-feats.cpp)
+            target_include_directories(${GGML_CPU_FEATS_NAME} PRIVATE . .. ../include)
+            target_compile_definitions(${GGML_CPU_FEATS_NAME} PRIVATE ${ARCH_DEFINITIONS})
+            target_compile_definitions(${GGML_CPU_FEATS_NAME} PRIVATE GGML_BACKEND_DL GGML_BACKEND_BUILD GGML_BACKEND_SHARED)
+            set_target_properties(${GGML_CPU_FEATS_NAME} PROPERTIES POSITION_INDEPENDENT_CODE ON)
+            target_link_libraries(${GGML_CPU_NAME} PRIVATE ${GGML_CPU_FEATS_NAME})
         endif()
     elseif (GGML_SYSTEM_ARCH STREQUAL "PowerPC")
         message(STATUS "PowerPC detected")
@@ -388,27 +342,6 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
             else()
                 list(APPEND ARCH_FLAGS -mcpu=native -mtune=native -mpowerpc64)
             endif()
-        elseif(GGML_CPU_ALL_VARIANTS)
-            # Begin with the lowest baseline
-            set(ARCH_DEFINITIONS "")
-
-            # When a feature is selected, bump the MCPU to the first
-            # version that supported it
-            foreach(PVER RANGE 7 11)
-                if(DEFINED GGML_INTERNAL_POWER${PVER})
-                    set(POWERPC_MCPU "power${PVER}")
-                    list(APPEND ARCH_DEFINITIONS GGML_USE_POWER${PVER})
-                endif()
-            endforeach()
-            if (GGML_INTERNAL_VSX)
-                list(APPEND ARCH_DEFINITIONS GGML_USE_VSX)
-                list(APPEND ARCH_FLAGS -mvsx)
-            endif()
-
-            if (DEFINED POWERPC_MCPU)
-                list(APPEND ARCH_FLAGS -mcpu=${POWERPC_MCPU})
-            endif()
-            ggml_add_cpu_backend_features(${GGML_CPU_NAME} powerpc ${ARCH_DEFINITIONS})
         else()
             if (GGML_CPU_POWERPC_CPUTYPE)
                 list(APPEND ARCH_FLAGS -mcpu=${GGML_CPU_POWERPC_CPUTYPE})
@@ -448,7 +381,6 @@ function(ggml_add_cpu_backend_variant_impl tag_name)

         # TODO: Separation to determine activation of VX/VXE/VXE2
         if (${S390X_M} MATCHES "8561|8562")
-            set(GGML_NNPA OFF)
             message(STATUS "z15 target")
             list(APPEND ARCH_FLAGS -march=z15)
         elseif (${S390X_M} MATCHES "3931")
@@ -465,14 +397,7 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
         endif()

         if (GGML_VXE)
-            message(STATUS "VX/VXE/VXE2 enabled")
             list(APPEND ARCH_FLAGS -mvx -mzvector)
-            list(APPEND ARCH_DEFINITIONS GGML_VXE)
-        endif()
-
-        if (GGML_NNPA)
-            message(STATUS "NNPA enabled")
-            list(APPEND ARCH_DEFINITIONS GGML_NNPA)
         endif()
     elseif (CMAKE_SYSTEM_PROCESSOR MATCHES "wasm")
         message(STATUS "Wasm detected")
@@ -494,9 +419,9 @@ function(ggml_add_cpu_backend_variant_impl tag_name)

         # Fetch KleidiAI sources:
         include(FetchContent)
-        set(KLEIDIAI_COMMIT_TAG "v1.9.0")
+        set(KLEIDIAI_COMMIT_TAG "v1.6.0")
         set(KLEIDIAI_DOWNLOAD_URL "https://github.com/ARM-software/kleidiai/archive/refs/tags/${KLEIDIAI_COMMIT_TAG}.tar.gz")
-        set(KLEIDIAI_ARCHIVE_MD5 "2a8e1bb55d201557553545536489a017")
+        set(KLEIDIAI_ARCHIVE_MD5 "75b4ad68f25ab673dcc01065e5a0b05f")

         if (POLICY CMP0135)
             cmake_policy(SET CMP0135 NEW)
@@ -589,9 +514,4 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
     if (EMSCRIPTEN)
         set_target_properties(${GGML_CPU_NAME} PROPERTIES COMPILE_FLAGS "-msimd128")
     endif()
-
-    if (CMAKE_CXX_COMPILER_ID STREQUAL "IntelLLVM")
-        # The compiler automatically enables "-ffast-math" which can cause NaNs in tests due to "-fassociative-math"
-        target_compile_options(${GGML_CPU_NAME} PRIVATE "-fno-associative-math")
-    endif()
 endfunction()

ggml/src/ggml-cpu/amx/mmq.cpp

Lines changed: 9 additions & 10 deletions
@@ -8,7 +8,6 @@
 #include "mmq.h"
 #include "ggml-impl.h"
 #include "ggml-cpu-impl.h"
-#include "simd-mappings.h"
 #include "quants.h"
 #include "ggml-quants.h"
 #include <algorithm>
@@ -454,7 +453,7 @@ void quantize_row_q8_K_vnni(const float * RESTRICT x, void * RESTRICT vy, int64_

         // Quantize these floats
         const float iscale = 127.f / amax;
-        y[i].d = GGML_CPU_FP32_TO_FP16(1 / iscale);
+        y[i].d = GGML_FP32_TO_FP16(1 / iscale);
         const float id = ( amax != 0.0f ) ? iscale : 0.f;
         const __m512 vscale = _mm512_set1_ps(id);

@@ -1091,7 +1090,7 @@ struct acc_C<block_q8_0, block_q4_0, is_acc> {
         const __m512 vd0 = _mm512_cvtph_ps(_mm256_loadu_si256((const __m256i *)((const char *)packed_B + offset)));

         for (int m = 0; m < nr; ++m) {
-            const __m512 vd1 = _mm512_set1_ps(GGML_CPU_FP16_TO_FP32(A[m * lda].d));
+            const __m512 vd1 = _mm512_set1_ps(GGML_FP16_TO_FP32(A[m * lda].d));
             const __m512 vtile = _mm512_cvtepi32_ps(_mm512_loadu_si512(tile + m * TILE_N));

             __m512 vsum;
@@ -1114,8 +1113,8 @@ struct acc_C<block_q8_1, block_q4_1, is_acc> {
         const __m512 vm0 = _mm512_cvtph_ps(_mm256_loadu_si256((const __m256i *)((const char *)packed_B + offset + TILE_N * sizeof(ggml_half))));

         for (int m = 0; m < nr; ++m) {
-            const __m512 vd1 = _mm512_set1_ps(GGML_CPU_FP16_TO_FP32(A[m * lda].d));
-            const __m512 vs1 = _mm512_set1_ps(GGML_CPU_FP16_TO_FP32(A[m * lda].s));
+            const __m512 vd1 = _mm512_set1_ps(GGML_FP16_TO_FP32(A[m * lda].d));
+            const __m512 vs1 = _mm512_set1_ps(GGML_FP16_TO_FP32(A[m * lda].s));
             const __m512 vtile = _mm512_cvtepi32_ps(_mm512_loadu_si512(tile + m * TILE_N));

             __m512 vsum;
@@ -1138,7 +1137,7 @@ struct acc_C<block_q8_0, block_q8_0, is_acc> {
         const __m512 vd0 = _mm512_cvtph_ps(_mm256_loadu_si256((const __m256i *)((const char *)packed_B + offset)));

         for (int m = 0; m < nr; ++m) {
-            const __m512 vd1 = _mm512_set1_ps(GGML_CPU_FP16_TO_FP32(A[m * lda].d));
+            const __m512 vd1 = _mm512_set1_ps(GGML_FP16_TO_FP32(A[m * lda].d));
             const __m512 vtile = _mm512_cvtepi32_ps(_mm512_loadu_si512(tile + m * TILE_N));

             __m512 vsum;
@@ -1438,7 +1437,7 @@ struct tinygemm_kernel_vnni<block_q8_0, block_q4_0, float, BLOCK_M, BLOCK_N, BLO
                 va[k] = _mm512_set1_epi32(a_ptr[k]);
                 vcomp = _mm512_dpbusd_epi32(vcomp, off, va[k]);
             }
-            vd1 = _mm512_set1_ps(GGML_CPU_FP16_TO_FP32(A[0 * KB + i].d));
+            vd1 = _mm512_set1_ps(GGML_FP16_TO_FP32(A[0 * KB + i].d));
         }

         // load b
@@ -1499,8 +1498,8 @@ struct tinygemm_kernel_vnni<block_q8_1, block_q4_1, float, 1, BLOCK_N, BLOCK_K>
             for (int k = 0; k < 8; ++k) {
                 va[k] = _mm512_set1_epi32(a_ptr[k]);
             }
-            vd1 = _mm512_set1_ps(GGML_CPU_FP16_TO_FP32(A[0 * KB + i].d));
-            vs1 = _mm512_set1_ps(GGML_CPU_FP16_TO_FP32(A[0 * KB + i].s));
+            vd1 = _mm512_set1_ps(GGML_FP16_TO_FP32(A[0 * KB + i].d));
+            vs1 = _mm512_set1_ps(GGML_FP16_TO_FP32(A[0 * KB + i].s));
         }

         // load b
@@ -1572,7 +1571,7 @@ struct tinygemm_kernel_vnni<block_q8_0, block_q8_0, float, BLOCK_M, BLOCK_N, BLO
                 va[k] = _mm512_set1_epi32(a_ptr[k]);
                 va[k] = _mm512_add_epi8(va[k], off);
             }
-            vd1 = _mm512_set1_ps(GGML_CPU_FP16_TO_FP32(A[0 * KB + i].d));
+            vd1 = _mm512_set1_ps(GGML_FP16_TO_FP32(A[0 * KB + i].d));
         }

         // load b
0 commit comments

Comments
 (0)