
Commit 6bd02a0

xctan authored and ggerganov committed
ggml-cpu : split arch-specific implementations (ggml-org#13892)
* move ggml-cpu-aarch64 to repack
* split quantize_row_q8_0/1
* split helper functions
* split ggml_vec_dot_q4_0_q8_0
* split ggml_vec_dot_q4_1_q8_1
* split ggml_vec_dot_q5_0_q8_0
* split ggml_vec_dot_q5_1_q8_1
* split ggml_vec_dot_q8_0_q8_0
* split ggml_vec_dot_tq1_0_q8_K
* split ggml_vec_dot_tq2_0_q8_K
* split ggml_vec_dot_q2_K_q8_K
* split ggml_vec_dot_q3_K_q8_K
* split ggml_vec_dot_q4_K_q8_K
* split ggml_vec_dot_q5_K_q8_K
* split ggml_vec_dot_q6_K_q8_K
* split ggml_vec_dot_iq2_xxs_q8_K
* split ggml_vec_dot_iq2_xs_q8_K
* split ggml_vec_dot_iq2_s_q8_K
* split ggml_vec_dot_iq3_xxs_q8_K
* split ggml_vec_dot_iq3_s_q8_K
* split ggml_vec_dot_iq1_s_q8_K
* split ggml_vec_dot_iq1_m_q8_K
* split ggml_vec_dot_iq4_nl_q8_0
* split ggml_vec_dot_iq4_xs_q8_K
* fix typos
* fix missing prototypes
* rename ggml-cpu-quants.c
* rename ggml-cpu-traits
* rename arm folder
* move cpu-feats-x86.cpp
* rename ggml-cpu-hbm
* update arm detection macro in quants.c
* move iq quant tables
* split ggml_quantize_mat_q8_0/K
* split ggml_gemv_*
* split ggml_gemm_*
* rename namespace aarch64 to repack
* use weak aliases to replace test macros
* rename GGML_CPU_AARCH64 to GGML_CPU_REPACK
* rename more aarch64 to repack
* clean up rebase leftover
* fix compilation errors
* remove trailing spaces
* try to fix clang compilation errors
* try to fix clang compilation errors again
* try to fix clang compilation errors, 3rd attempt
* try to fix clang compilation errors, 4th attempt
* try to fix clang compilation errors, 5th attempt
* try to fix clang compilation errors, 6th attempt
* try to fix clang compilation errors, 7th attempt
* try to fix clang compilation errors, 8th attempt
* try to fix clang compilation errors, 9th attempt
* more cleanup
* fix compilation errors
* fix apple targets
* fix a typo in arm version of ggml_vec_dot_q4_K_q8_K

Co-authored-by: Georgi Gerganov <[email protected]>

---------

Co-authored-by: Georgi Gerganov <[email protected]>
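One bullet above that may need context is "use weak aliases to replace test macros". Below is a minimal sketch of the weak-alias fallback pattern; the symbol names (vec_dot, vec_dot_generic) are hypothetical stand-ins rather than ggml's actual functions or macros, and the GCC/Clang alias attribute used here applies to ELF targets. The idea is that every backend variant compiles the generic C implementation, and an architecture-specific object file can override it simply by providing a strong definition of the same public symbol.

    /* weak_alias_sketch.c -- illustrative only; names are hypothetical, not ggml's.
       Portable fallback that an arch-specific translation unit can override by
       defining a strong symbol named vec_dot. GCC/Clang, ELF targets. */
    #include <stddef.h>

    /* generic scalar reference implementation */
    float vec_dot_generic(const signed char * x, const signed char * y, size_t n) {
        int acc = 0;
        for (size_t i = 0; i < n; ++i) {
            acc += x[i] * y[i];
        }
        return (float) acc;
    }

    /* public entry point: resolves to the generic version unless a strong
       definition with the same name is linked in */
    float vec_dot(const signed char * x, const signed char * y, size_t n)
        __attribute__((weak, alias("vec_dot_generic")));

Compared with selecting implementations through preprocessor test macros, this keeps the per-architecture files free of boilerplate: each one defines only the kernels it actually specializes.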
1 parent 6c596e9 commit 6bd02a0

File tree

18 files changed: +2379 / -1564 lines changed

Makefile

Lines changed: 738 additions & 18 deletions
Large diffs are not rendered by default.

ggml/src/ggml-cpu/CMakeLists.txt

Lines changed: 14 additions & 6 deletions
@@ -363,7 +363,18 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
                 # the feature check relies on ARCH_DEFINITIONS, but it is not set with GGML_NATIVE
                 message(FATAL_ERROR "GGML_NATIVE is not compatible with GGML_BACKEND_DL, consider using GGML_CPU_ALL_VARIANTS")
             endif()
-            ggml_add_cpu_backend_features(${GGML_CPU_NAME} x86 ${ARCH_DEFINITIONS})
+
+            # The feature detection code is compiled as a separate target so that
+            # it can be built without the architecture flags
+            # Since multiple variants of the CPU backend may be included in the same
+            # build, using set_source_files_properties() to set the arch flags is not possible
+            set(GGML_CPU_FEATS_NAME ${GGML_CPU_NAME}-feats)
+            add_library(${GGML_CPU_FEATS_NAME} OBJECT ggml-cpu/arch/x86/cpu-feats.cpp)
+            target_include_directories(${GGML_CPU_FEATS_NAME} PRIVATE . .. ../include)
+            target_compile_definitions(${GGML_CPU_FEATS_NAME} PRIVATE ${ARCH_DEFINITIONS})
+            target_compile_definitions(${GGML_CPU_FEATS_NAME} PRIVATE GGML_BACKEND_DL GGML_BACKEND_BUILD GGML_BACKEND_SHARED)
+            set_target_properties(${GGML_CPU_FEATS_NAME} PROPERTIES POSITION_INDEPENDENT_CODE ON)
+            target_link_libraries(${GGML_CPU_NAME} PRIVATE ${GGML_CPU_FEATS_NAME})
         endif()
     elseif (GGML_SYSTEM_ARCH STREQUAL "PowerPC")
         message(STATUS "PowerPC detected")
@@ -469,17 +480,14 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
             list(APPEND ARCH_FLAGS -mvx -mzvector)
             list(APPEND ARCH_DEFINITIONS GGML_VXE)
         endif()
-
-        if (GGML_NNPA)
-            message(STATUS "NNPA enabled")
-            list(APPEND ARCH_DEFINITIONS GGML_NNPA)
-        endif()
     elseif (CMAKE_SYSTEM_PROCESSOR MATCHES "wasm")
         message(STATUS "Wasm detected")
         list (APPEND GGML_CPU_SOURCES ggml-cpu/arch/wasm/quants.c)
     else()
         message(WARNING "Unknown CPU architecture. Falling back to generic implementations.")
         list(APPEND ARCH_FLAGS -DGGML_CPU_GENERIC)
+        message(WARNING "Unknown CPU architecture. Falling back to generic implementations.")
+        list(APPEND ARCH_FLAGS -DGGML_CPU_GENERIC)
     endif()
 
     if (GGML_CPU_REPACK)
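The block added above compiles the x86 feature-detection source as its own OBJECT target precisely so that it is built without the per-variant architecture flags. As a rough, hedged illustration of why that matters (this is not the actual ggml-cpu/arch/x86/cpu-feats.cpp, and the scoring scheme is invented), a detector compiled without -mavx2/-mavx512f can safely run on any x86 CPU and report which backend variant the loader should prefer:

    /* cpu_feats_sketch.c -- hypothetical example, not ggml's cpu-feats.cpp.
       Must be compiled WITHOUT -mavx2/-mavx512f so it runs on any x86 CPU.
       __builtin_cpu_supports() is a GCC/Clang builtin for x86. */
    #include <stdio.h>

    int cpu_variant_score(void) {
        int score = 0;
        if (__builtin_cpu_supports("avx"))     score += 1;
        if (__builtin_cpu_supports("avx2"))    score += 2;
        if (__builtin_cpu_supports("avx512f")) score += 4;
        return score;
    }

    int main(void) {
        printf("cpu variant score: %d\n", cpu_variant_score());
        return 0;
    }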

ggml/src/ggml-cpu/amx/mmq.cpp

Lines changed: 0 additions & 1 deletion
@@ -8,7 +8,6 @@
 #include "mmq.h"
 #include "ggml-impl.h"
 #include "ggml-cpu-impl.h"
-#include "simd-mappings.h"
 #include "quants.h"
 #include "ggml-quants.h"
 #include <algorithm>

ggml/src/ggml-cpu/arch/arm/quants.c

Lines changed: 108 additions & 109 deletions
Large diffs are not rendered by default.

ggml/src/ggml-cpu/arch/arm/repack.cpp

Lines changed: 1038 additions & 1027 deletions
Large diffs are not rendered by default.
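Most of this commit's churn lives in this file: the former aarch64-specific weight "repacking" code now sits under arch/arm/repack.cpp, matching the rename of the namespace and of GGML_CPU_AARCH64 to GGML_CPU_REPACK listed in the commit message. As a rough sketch of what repacking means here (hypothetical layout and block size, not ggml's actual interleaved block formats), the idea is to interleave the same block index from several consecutive rows so a multi-row GEMM/GEMV microkernel can load its operands from one contiguous region:

    /* repack_sketch.c -- illustrative only; BLOCK_BYTES and the x4 grouping are
       assumptions, not ggml's interleaved block layout. Requires nrows % 4 == 0. */
    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    #define BLOCK_BYTES 18  /* e.g. a q4_0-style block: 2-byte scale + 16 bytes of nibbles */

    /* src: nrows x nblocks blocks, row-major; dst: same data grouped 4 rows at a time */
    static void repack_rows_x4(const uint8_t * src, uint8_t * dst, int nrows, int nblocks) {
        for (int r0 = 0; r0 < nrows; r0 += 4) {        /* one group of 4 rows        */
            for (int b = 0; b < nblocks; ++b) {        /* for each block column      */
                for (int r = 0; r < 4; ++r) {          /* emit the block of each row */
                    const uint8_t * s = src + ((size_t)(r0 + r) * nblocks + b) * BLOCK_BYTES;
                    memcpy(dst, s, BLOCK_BYTES);
                    dst += BLOCK_BYTES;
                }
            }
        }
    }

    int main(void) {
        enum { NROWS = 4, NBLOCKS = 2 };
        uint8_t src[NROWS * NBLOCKS * BLOCK_BYTES], dst[sizeof(src)];
        for (size_t i = 0; i < sizeof(src); ++i) src[i] = (uint8_t) i;
        repack_rows_x4(src, dst, NROWS, NBLOCKS);
        printf("row 1, block 0 now starts at byte offset %d\n", BLOCK_BYTES);
        return 0;
    }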

ggml/src/ggml-cpu/arch/loongarch/quants.c

Lines changed: 52 additions & 53 deletions
Large diffs are not rendered by default.

ggml/src/ggml-cpu/arch/powerpc/quants.c

Lines changed: 55 additions & 56 deletions
Large diffs are not rendered by default.

ggml/src/ggml-cpu/arch/riscv/quants.c

Lines changed: 41 additions & 42 deletions
@@ -3,7 +3,6 @@
 #include "ggml-quants.h"
 #include "ggml-impl.h"
 #include "ggml-cpu.h"
-#include "simd-mappings.h"
 
 #include "../../quants.h"
 #include "../../ggml-cpu-impl.h"
@@ -46,7 +45,7 @@ void quantize_row_q8_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, i
         const float d = amax / ((1 << 7) - 1);
         const float id = d ? 1.0f/d : 0.0f;
 
-        y[i].d = GGML_CPU_FP32_TO_FP16(d);
+        y[i].d = GGML_FP32_TO_FP16(d);
 
         vfloat32m8_t x0 = __riscv_vfmul_vf_f32m8(v_x, id, vl);
 
@@ -86,7 +85,7 @@ void quantize_row_q8_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, i
         const float d = amax / ((1 << 7) - 1);
         const float id = d ? 1.0f/d : 0.0f;
 
-        y[i].d = GGML_CPU_FP32_TO_FP16(d);
+        y[i].d = GGML_FP32_TO_FP16(d);
 
         vfloat32m8_t x0 = __riscv_vfmul_vf_f32m8(v_x, id, vl);
 
@@ -103,7 +102,7 @@ void quantize_row_q8_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, i
 
         // set y[i].s
         int sum = __riscv_vmv_x_s_i16m1_i16(vwrs);
-        y[i].s = GGML_CPU_FP32_TO_FP16(sum*d);
+        y[i].s = GGML_FP32_TO_FP16(sum*d);
     }
 
 #else
@@ -161,7 +160,7 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
 
         int sumi = __riscv_vmv_x_s_i32m1_i32(vs2);
 
-        sumf += sumi*GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d);
+        sumf += sumi*GGML_FP16_TO_FP32(x[ib].d)*GGML_FP16_TO_FP32(y[ib].d);
     }
 
 #endif
@@ -178,7 +177,7 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
         }
 
         int sumi = sumi0 + sumi1;
-        sumf += sumi*GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d);
+        sumf += sumi*GGML_FP16_TO_FP32(x[ib].d)*GGML_FP16_TO_FP32(y[ib].d);
     }
 
     *s = sumf;
@@ -226,7 +225,7 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi
 
         int sumi = __riscv_vmv_x_s_i32m1_i32(vs2);
 
-        sumf += (GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d))*sumi + GGML_CPU_FP16_TO_FP32(x[ib].m)*GGML_CPU_FP16_TO_FP32(y[ib].s);
+        sumf += (GGML_FP16_TO_FP32(x[ib].d)*GGML_FP16_TO_FP32(y[ib].d))*sumi + GGML_FP16_TO_FP32(x[ib].m)*GGML_FP16_TO_FP32(y[ib].s);
     }
 
 #endif
@@ -243,7 +242,7 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi
         }
 
         int sumi = sumi0 + sumi1;
-        sumf += (GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d))*sumi + GGML_CPU_FP16_TO_FP32(x[ib].m)*GGML_CPU_FP16_TO_FP32(y[ib].s);
+        sumf += (GGML_FP16_TO_FP32(x[ib].d)*GGML_FP16_TO_FP32(y[ib].d))*sumi + GGML_FP16_TO_FP32(x[ib].m)*GGML_FP16_TO_FP32(y[ib].s);
     }
 
     *s = sumf;
@@ -294,7 +293,7 @@ void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
         vint32m1_t sum = __riscv_vwredsum_vs_i16m4_i32m1(mul, zero, vl);
         int32_t sumi = __riscv_vmv_x_s_i32m1_i32(sum);
 
-        sumf += (GGML_CPU_FP16_TO_FP32(x[ib].d) * GGML_CPU_FP16_TO_FP32(y[ib].d)) * sumi;
+        sumf += (GGML_FP16_TO_FP32(x[ib].d) * GGML_FP16_TO_FP32(y[ib].d)) * sumi;
     }
 
 #endif
@@ -317,7 +316,7 @@ void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
         }
 
         int sumi = sumi0 + sumi1;
-        sumf += (GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d)) * sumi;
+        sumf += (GGML_FP16_TO_FP32(x[ib].d)*GGML_FP16_TO_FP32(y[ib].d)) * sumi;
     }
 
     *s = sumf;
@@ -367,7 +366,7 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi
         vint32m1_t sum = __riscv_vwredsum_vs_i16m4_i32m1(mul, zero, vl);
         int32_t sumi = __riscv_vmv_x_s_i32m1_i32(sum);
 
-        sumf += (GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d))*sumi + GGML_CPU_FP16_TO_FP32(x[ib].m)*GGML_CPU_FP16_TO_FP32(y[ib].s);
+        sumf += (GGML_FP16_TO_FP32(x[ib].d)*GGML_FP16_TO_FP32(y[ib].d))*sumi + GGML_FP16_TO_FP32(x[ib].m)*GGML_FP16_TO_FP32(y[ib].s);
     }
 
 #endif
@@ -390,7 +389,7 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi
         }
 
         int sumi = sumi0 + sumi1;
-        sumf += (GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d))*sumi + GGML_CPU_FP16_TO_FP32(x[ib].m)*GGML_CPU_FP16_TO_FP32(y[ib].s);
+        sumf += (GGML_FP16_TO_FP32(x[ib].d)*GGML_FP16_TO_FP32(y[ib].d))*sumi + GGML_FP16_TO_FP32(x[ib].m)*GGML_FP16_TO_FP32(y[ib].s);
     }
 
     *s = sumf;
@@ -428,7 +427,7 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
 
         int sumi = __riscv_vmv_x_s_i32m1_i32(v_sum);
 
-        sumf += sumi*(GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d));
+        sumf += sumi*(GGML_FP16_TO_FP32(x[ib].d)*GGML_FP16_TO_FP32(y[ib].d));
     }
 
 #endif
@@ -439,7 +438,7 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
             sumi += x[ib].qs[j]*y[ib].qs[j];
         }
 
-        sumf += sumi*(GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d));
+        sumf += sumi*(GGML_FP16_TO_FP32(x[ib].d)*GGML_FP16_TO_FP32(y[ib].d));
     }
 
     *s = sumf;
@@ -466,8 +465,8 @@ void ggml_vec_dot_q2_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
         const uint8_t * q2 = x[i].qs;
         const int8_t * q8 = y[i].qs;
         const uint8_t * sc = x[i].scales;
-        const float dall = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d);
-        const float dmin = -y[i].d * GGML_CPU_FP16_TO_FP32(x[i].dmin);
+        const float dall = y[i].d * GGML_FP16_TO_FP32(x[i].d);
+        const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin);
         uint8_t *patmp = atmp;
         int vsums;
         int tmp;
@@ -570,8 +569,8 @@ void ggml_vec_dot_q2_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
         const int8_t * q8 = y[i].qs;
         const uint8_t * sc = x[i].scales;
 
-        const float dall = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d);
-        const float dmin = -y[i].d * GGML_CPU_FP16_TO_FP32(x[i].dmin);
+        const float dall = y[i].d * GGML_FP16_TO_FP32(x[i].d);
+        const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin);
 
         size_t vl = 16;
 
@@ -645,8 +644,8 @@ void ggml_vec_dot_q2_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
         const uint8_t * q2 = x[i].qs;
         const int8_t * q8 = y[i].qs;
         const uint8_t * sc = x[i].scales;
-        const float dall = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d);
-        const float dmin = -y[i].d * GGML_CPU_FP16_TO_FP32(x[i].dmin);
+        const float dall = y[i].d * GGML_FP16_TO_FP32(x[i].d);
+        const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin);
         uint8_t *patmp = atmp;
         int vsums;
         int tmp;
@@ -751,8 +750,8 @@ void ggml_vec_dot_q2_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
            summs += y[i].bsums[j] * (sc[j] >> 4);
         }
 
-        const float dall = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d);
-        const float dmin = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].dmin);
+        const float dall = y[i].d * GGML_FP16_TO_FP32(x[i].d);
+        const float dmin = y[i].d * GGML_FP16_TO_FP32(x[i].dmin);
 
         int isum = 0;
         int is = 0;
@@ -917,7 +916,7 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
            q3 += 32; q8 += 128; scale += 8;
         }
 
-        const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
+        const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
         sumf += d * isum;
     }
 
@@ -1018,7 +1017,7 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
 
         }
 
-        const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
+        const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
 
         sumf += d*sum_t;
 
@@ -1135,7 +1134,7 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
            q3 += 32; q8 += 128; scale += 8;
         }
 
-        const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
+        const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
         sumf += d * isum;
     }
     break;
@@ -1203,7 +1202,7 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
            for (int l = 0; l < 8; ++l) aux32[l] += (scales[j] - 32) * aux16[l];
            q8 += 8; a += 8;
         }
-        const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
+        const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
         for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
     }
     for (int l = 0; l < 8; ++l) sumf += sums[l];
@@ -1240,8 +1239,8 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
    float sumf = 0;
 
    for (int i = 0; i < nb; ++i) {
-       const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d);
-       const float dmin = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].dmin);
+       const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
+       const float dmin = y[i].d * GGML_FP16_TO_FP32(x[i].dmin);
 
        int tmp, tmp2, sumi;
        __asm__ __volatile__(
@@ -1362,8 +1361,8 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
 
        size_t vl = 8;
 
-       const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d);
-       const float dmin = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].dmin);
+       const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
+       const float dmin = y[i].d * GGML_FP16_TO_FP32(x[i].dmin);
 
        vint16mf2_t q8sums_0 = __riscv_vlse16_v_i16mf2(y[i].bsums, 4, vl);
        vint16mf2_t q8sums_1 = __riscv_vlse16_v_i16mf2(y[i].bsums+1, 4, vl);
@@ -1423,8 +1422,8 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
        break;
    case 128:
        for (int i = 0; i < nb; ++i) {
-           const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d);
-           const float dmin = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].dmin);
+           const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
+           const float dmin = y[i].d * GGML_FP16_TO_FP32(x[i].dmin);
 
            int tmp, tmp2, sumi;
            __asm__ __volatile__(
@@ -1581,9 +1580,9 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
            for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
            q8 += 8; a += 8;
        }
-       const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
+       const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
        for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
-       const float dmin = GGML_CPU_FP16_TO_FP32(x[i].dmin) * y[i].d;
+       const float dmin = GGML_FP16_TO_FP32(x[i].dmin) * y[i].d;
        sumf -= dmin * sumi;
    }
    for (int l = 0; l < 8; ++l) sumf += sums[l];
@@ -1628,8 +1627,8 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
        const uint8_t * GGML_RESTRICT hm = x[i].qh;
        const int8_t * GGML_RESTRICT q8 = y[i].qs;
 
-       const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
-       const float dmin = GGML_CPU_FP16_TO_FP32(x[i].dmin) * y[i].d;
+       const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
+       const float dmin = GGML_FP16_TO_FP32(x[i].dmin) * y[i].d;
 
        vint16m1_t q8sums_0 = __riscv_vlse16_v_i16m1(y[i].bsums, 4, vl);
        vint16m1_t q8sums_1 = __riscv_vlse16_v_i16m1(y[i].bsums+1, 4, vl);
@@ -1750,9 +1749,9 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
           for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
           q8 += 8; a += 8;
        }
-       const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
+       const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
        for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
-       const float dmin = GGML_CPU_FP16_TO_FP32(x[i].dmin) * y[i].d;
+       const float dmin = GGML_FP16_TO_FP32(x[i].dmin) * y[i].d;
        sumf -= dmin * sumi;
    }
    for (int l = 0; l < 8; ++l) sumf += sums[l];
@@ -1779,7 +1778,7 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
 
    for (int i = 0; i < nb; ++i) {
 
-       const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
+       const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
 
        const uint8_t * restrict q6 = x[i].ql;
        const uint8_t * restrict qh = x[i].qh;
@@ -1863,7 +1862,7 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
    case 256:
        for (int i = 0; i < nb; ++i) {
 
-           const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
+           const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
 
            const uint8_t * GGML_RESTRICT q6 = x[i].ql;
            const uint8_t * GGML_RESTRICT qh = x[i].qh;
@@ -1944,7 +1943,7 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
    case 128:
        for (int i = 0; i < nb; ++i) {
 
-           const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
+           const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
 
            const uint8_t * restrict q6 = x[i].ql;
            const uint8_t * restrict qh = x[i].qh;
@@ -2059,7 +2058,7 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
           for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
           q8 += 8; a += 8;
        }
-       const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
+       const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
        for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
    }
    for (int l = 0; l < 8; ++l) sumf += sums[l];
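For orientation, the scalar fallbacks touched above follow the Q8_0 scheme visible in the quantize_row_q8_0 hunks: each block of 32 floats stores one scale d = amax / 127 (held as fp16, hence the FP32/FP16 conversion macros being renamed in this diff) plus 32 signed 8-bit values, and the vec_dot fallbacks later multiply the integer dot product by the two block scales. Below is a minimal, self-contained sketch of that quantization step with simplified types; block_q8_0_sketch is a stand-in, not ggml's block_q8_0, and the scale is kept as a plain float.

    /* q8_0_sketch.c -- simplified illustration of one-block Q8_0 quantization. */
    #include <math.h>
    #include <stdint.h>
    #include <stdio.h>

    #define QK8_0 32

    typedef struct {
        float  d;          /* ggml stores this as fp16; plain float here for simplicity */
        int8_t qs[QK8_0];
    } block_q8_0_sketch;

    static void quantize_block_q8_0(const float * x, block_q8_0_sketch * y) {
        float amax = 0.0f;                        /* absolute max of the block */
        for (int j = 0; j < QK8_0; ++j) {
            const float ax = fabsf(x[j]);
            if (ax > amax) amax = ax;
        }
        const float d  = amax / ((1 << 7) - 1);   /* map amax to 127 */
        const float id = d ? 1.0f/d : 0.0f;       /* guard all-zero blocks */
        y->d = d;
        for (int j = 0; j < QK8_0; ++j) {
            y->qs[j] = (int8_t) roundf(x[j] * id);
        }
    }

    int main(void) {
        float x[QK8_0];
        for (int j = 0; j < QK8_0; ++j) x[j] = (float) j - 16.0f;
        block_q8_0_sketch y;
        quantize_block_q8_0(x, &y);
        printf("d = %f  qs[0] = %d  qs[31] = %d\n", y.d, y.qs[0], y.qs[31]);
        return 0;
    }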
