1414#define GGML_SYCL_VECDOTQ_HPP
1515
1616#include " dpct/helper.hpp"
17+ #include " syclcompat/math.hpp"
1718
1819typedef float (*vec_dot_q_sycl_t )(const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs);
1920
@@ -89,14 +90,14 @@ static __dpct_inline__ float vec_dot_q2_K_q8_1_impl_mmvq(
8990 const int vi = (v >> (2 *i)) & 0x03030303 ;
9091
9192 sumf_d +=
92- d8[i] * (dpct ::dp4a (vi, u[i], 0 ) * (sc & 0xF )); // SIMD dot product
93+ d8[i] * (syclcompat ::dp4a (vi, u[i], 0 ) * (sc & 0xF )); // SIMD dot product
9394
9495 // fill int with 4x m
9596 int m = sc >> 4 ;
9697 m |= m << 8 ;
9798 m |= m << 16 ;
9899 sumf_m += d8[i] *
99- dpct ::dp4a (
100+ syclcompat ::dp4a (
100101 m, u[i],
101102 0 ); // multiply constant q2_K part with sum of q8_1 values
102103 }
@@ -139,7 +140,7 @@ static __dpct_inline__ float vec_dot_q3_K_q8_1_impl_mmvq(
139140 const int vi =
140141 dpct::vectorized_binary<sycl::char4>(vil, vih, dpct::sub_sat ());
141142
142- sumf += d8[i] * (dpct ::dp4a (vi, u[i], 0 ) * sc); // SIMD dot product
143+ sumf += d8[i] * (syclcompat ::dp4a (vi, u[i], 0 ) * sc); // SIMD dot product
143144 }
144145
145146 return d3 * sumf;
@@ -162,11 +163,11 @@ static __dpct_inline__ float vec_dot_q4_K_q8_1_impl_vmmq(
162163 const int v1i = (v[1 ] >> (4 *i)) & 0x0F0F0F0F ;
163164
164165 const int dot1 =
165- dpct ::dp4a (v1i, u[2 * i + 1 ],
166- dpct ::dp4a (v0i, u[2 * i + 0 ], 0 )); // SIMD dot product
166+ syclcompat ::dp4a (v1i, u[2 * i + 1 ],
167+ syclcompat ::dp4a (v0i, u[2 * i + 0 ], 0 )); // SIMD dot product
167168 const int dot2 =
168- dpct ::dp4a (0x01010101 , u[2 * i + 1 ],
169- dpct ::dp4a (0x01010101 , u[2 * i + 0 ], 0 )); // sum of u
169+ syclcompat ::dp4a (0x01010101 , u[2 * i + 1 ],
170+ syclcompat ::dp4a (0x01010101 , u[2 * i + 0 ], 0 )); // sum of u
170171
171172 sumf_d += d8[i] * (dot1 * sc[i]);
172173 sumf_m += d8[i] * (dot2 * m[i]); // multiply constant part of q4_K with sum of q8_1 values
@@ -203,11 +204,11 @@ static __dpct_inline__ float vec_dot_q5_K_q8_1_impl_vmmq(
203204 const int v1i = vl1i | vh1i;
204205
205206 const int dot1 =
206- dpct ::dp4a (v0i, u[2 * i + 0 ],
207- dpct ::dp4a (v1i, u[2 * i + 1 ], 0 )); // SIMD dot product
207+ syclcompat ::dp4a (v0i, u[2 * i + 0 ],
208+ syclcompat ::dp4a (v1i, u[2 * i + 1 ], 0 )); // SIMD dot product
208209 const int dot2 =
209- dpct ::dp4a (0x01010101 , u[2 * i + 0 ],
210- dpct ::dp4a (0x01010101 , u[2 * i + 1 ], 0 )); // sum of u
210+ syclcompat ::dp4a (0x01010101 , u[2 * i + 0 ],
211+ syclcompat ::dp4a (0x01010101 , u[2 * i + 1 ], 0 )); // sum of u
211212
212213 sumf_d += d8[i] * (dot1 * sc[i]);
213214 sumf_m += d8[i] * (dot2 * m[i]);
@@ -243,7 +244,7 @@ vec_dot_q6_K_q8_1_impl_mmvq(const int &vl, const int &vh,
243244 const int vi = dpct::vectorized_binary<sycl::char4>(
244245 (vil | vih), 0x20202020 , dpct::sub_sat ()); // vi = (vil | vih) - 32
245246
246- sumf += d8[i] * (dpct ::dp4a (vi, u[i], 0 ) * sc); // SIMD dot product
247+ sumf += d8[i] * (syclcompat ::dp4a (vi, u[i], 0 ) * sc); // SIMD dot product
247248 }
248249
249250 return d*sumf;
@@ -266,8 +267,8 @@ static __dpct_inline__ float vec_dot_q4_0_q8_1_impl(const int *v, const int *u,
266267 const int vi1 = (v[i] >> 4 ) & 0x0F0F0F0F ;
267268
268269 // SIMD dot product of quantized values
269- sumi = dpct ::dp4a (vi0, u[2 * i + 0 ], sumi);
270- sumi = dpct ::dp4a (vi1, u[2 * i + 1 ], sumi);
270+ sumi = syclcompat ::dp4a (vi0, u[2 * i + 0 ], sumi);
271+ sumi = syclcompat ::dp4a (vi1, u[2 * i + 1 ], sumi);
271272 }
272273
273274 const sycl::float2 ds8f =
@@ -293,8 +294,8 @@ static __dpct_inline__ float vec_dot_q4_1_q8_1_impl(const int *v, const int *u,
293294 const int vi1 = (v[i] >> 4 ) & 0x0F0F0F0F ;
294295
295296 // SIMD dot product of quantized values
296- sumi = dpct ::dp4a (vi0, u[2 * i + 0 ], sumi);
297- sumi = dpct ::dp4a (vi1, u[2 * i + 1 ], sumi);
297+ sumi = syclcompat ::dp4a (vi0, u[2 * i + 0 ], sumi);
298+ sumi = syclcompat ::dp4a (vi1, u[2 * i + 1 ], sumi);
298299 }
299300
300301#ifdef GGML_SYCL_F16
@@ -331,15 +332,15 @@ vec_dot_q5_0_q8_1_impl(const int *vl, const int *vh, const int *u,
331332 vi0 |= (vh[i] << 11 ) & 0x00001000 ; // 1 -> 12
332333 vi0 |= (vh[i] << 18 ) & 0x00100000 ; // 2 -> 20
333334 vi0 |= (vh[i] << 25 ) & 0x10000000 ; // 3 -> 28
334- sumi = dpct ::dp4a (vi0, u[2 * i + 0 ],
335+ sumi = syclcompat ::dp4a (vi0, u[2 * i + 0 ],
335336 sumi); // SIMD dot product of quantized values
336337
337338 int vi1 = (vl[i] >> 4 ) & 0x0F0F0F0F ; // upper 4 qs bits, still need qh as 5th bits
338339 vi1 |= (vh[i] >> 12 ) & 0x00000010 ; // 16 -> 4
339340 vi1 |= (vh[i] >> 5 ) & 0x00001000 ; // 17 -> 12
340341 vi1 |= (vh[i] << 2 ) & 0x00100000 ; // 18 -> 20
341342 vi1 |= (vh[i] << 9 ) & 0x10000000 ; // 19 -> 28
342- sumi = dpct ::dp4a (vi1, u[2 * i + 1 ],
343+ sumi = syclcompat ::dp4a (vi1, u[2 * i + 1 ],
343344 sumi); // SIMD dot product of quantized values
344345 }
345346
@@ -367,15 +368,15 @@ vec_dot_q5_1_q8_1_impl(const int *vl, const int *vh, const int *u,
367368 vi0 |= (vh[i] << 11 ) & 0x00001000 ; // 1 -> 12
368369 vi0 |= (vh[i] << 18 ) & 0x00100000 ; // 2 -> 20
369370 vi0 |= (vh[i] << 25 ) & 0x10000000 ; // 3 -> 28
370- sumi = dpct ::dp4a (vi0, u[2 * i + 0 ],
371+ sumi = syclcompat ::dp4a (vi0, u[2 * i + 0 ],
371372 sumi); // SIMD dot product of quantized values
372373
373374 int vi1 = (vl[i] >> 4 ) & 0x0F0F0F0F ; // upper 4 qs bits, still need qh as 5th bits
374375 vi1 |= (vh[i] >> 12 ) & 0x00000010 ; // 16 -> 4
375376 vi1 |= (vh[i] >> 5 ) & 0x00001000 ; // 17 -> 12
376377 vi1 |= (vh[i] << 2 ) & 0x00100000 ; // 18 -> 20
377378 vi1 |= (vh[i] << 9 ) & 0x10000000 ; // 19 -> 28
378- sumi = dpct ::dp4a (vi1, u[2 * i + 1 ],
379+ sumi = syclcompat ::dp4a (vi1, u[2 * i + 1 ],
379380 sumi); // SIMD dot product of quantized values
380381 }
381382
@@ -412,7 +413,7 @@ static __dpct_inline__ float vec_dot_q8_0_q8_1_impl(const int *v, const int *u,
412413#pragma unroll
413414 for (int i = 0 ; i < vdr; ++i) {
414415 // SIMD dot product of quantized values
415- sumi = dpct ::dp4a (v[i], u[i], sumi);
416+ sumi = syclcompat ::dp4a (v[i], u[i], sumi);
416417 }
417418
418419 return d8_0*d8_1 * sumi;
@@ -428,7 +429,7 @@ static __dpct_inline__ float vec_dot_q8_1_q8_1_impl(const int *v, const int *u,
428429#pragma unroll
429430 for (int i = 0 ; i < vdr; ++i) {
430431 // SIMD dot product of quantized values
431- sumi = dpct ::dp4a (v[i], u[i], sumi);
432+ sumi = syclcompat ::dp4a (v[i], u[i], sumi);
432433 }
433434
434435#ifdef GGML_SYCL_F16
@@ -677,10 +678,10 @@ vec_dot_q4_K_q8_1(const void *__restrict__ vbq,
677678 const int v1 = q4[0 ];
678679 const int v2 = q4[4 ];
679680
680- const int dot1 = dpct ::dp4a (ui2, v2 & 0x0f0f0f0f , dpct ::dp4a (ui1, v1 & 0x0f0f0f0f , 0 ));
681- const int dot2 = dpct ::dp4a (ui4, (v2 >> 4 ) & 0x0f0f0f0f , dpct ::dp4a (ui3, (v1 >> 4 ) & 0x0f0f0f0f , 0 ));
682- const int dot3 = dpct ::dp4a (0x01010101 , ui2, dpct ::dp4a (0x01010101 , ui1, 0 ));
683- const int dot4 = dpct ::dp4a (0x01010101 , ui4, dpct ::dp4a (0x01010101 , ui3, 0 ));
681+ const int dot1 = syclcompat ::dp4a (ui2, v2 & 0x0f0f0f0f , syclcompat ::dp4a (ui1, v1 & 0x0f0f0f0f , 0 ));
682+ const int dot2 = syclcompat ::dp4a (ui4, (v2 >> 4 ) & 0x0f0f0f0f , syclcompat ::dp4a (ui3, (v1 >> 4 ) & 0x0f0f0f0f , 0 ));
683+ const int dot3 = syclcompat ::dp4a (0x01010101 , ui2, syclcompat ::dp4a (0x01010101 , ui1, 0 ));
684+ const int dot4 = syclcompat ::dp4a (0x01010101 , ui4, syclcompat ::dp4a (0x01010101 , ui3, 0 ));
684685
685686 sumf_d += d8_1 * (dot1 * s[0 ]) + d8_2 * (dot2 * s[1 ]);
686687 sumf_m += d8_1 * (dot3 * s[2 ]) + d8_2 * (dot4 * s[3 ]);
@@ -772,8 +773,8 @@ vec_dot_q5_K_q8_1(const void *__restrict__ vbq,
772773 const int v3 = (((vh >> 0 ) & 0x10101010 ) ^ 0x10101010 ) | ((vl1 >> 4 ) & 0x0f0f0f0f );
773774 const int v4 = (((vh >> 2 ) & 0x10101010 ) ^ 0x10101010 ) | ((vl2 >> 4 ) & 0x0f0f0f0f );
774775
775- const float sumf_d = d8_1 * (dpct ::dp4a (ui1, v1, 0 ) * s[0 ] + dpct ::dp4a (ui2, v2, 0 ) * s[1 ])
776- + d8_2 * (dpct ::dp4a (ui3, v3, 0 ) * s[2 ] + dpct ::dp4a (ui4, v4, 0 ) * s[3 ]);
776+ const float sumf_d = d8_1 * (syclcompat ::dp4a (ui1, v1, 0 ) * s[0 ] + syclcompat ::dp4a (ui2, v2, 0 ) * s[1 ])
777+ + d8_2 * (syclcompat ::dp4a (ui3, v3, 0 ) * s[2 ] + syclcompat ::dp4a (ui4, v4, 0 ) * s[3 ]);
777778
778779 return d * sumf_d;
779780
@@ -865,8 +866,8 @@ vec_dot_iq2_xs_q8_1(const void *__restrict__ vbq,
865866 grid[0 ] ^ signs[0 ], signs[0 ], std::minus<>());
866867 const int grid_h = dpct::vectorized_binary<sycl::uchar4>(
867868 grid[1 ] ^ signs[1 ], signs[1 ], std::minus<>());
868- sumi1 = dpct ::dp4a (grid_l, *((const int *)q8 + 0 ), sumi1);
869- sumi1 = dpct ::dp4a (grid_h, *((const int *)q8 + 1 ), sumi1);
869+ sumi1 = syclcompat ::dp4a (grid_l, *((const int *)q8 + 0 ), sumi1);
870+ sumi1 = syclcompat ::dp4a (grid_h, *((const int *)q8 + 1 ), sumi1);
870871 q8 += 8 ;
871872 }
872873 int sumi2 = 0 ;
@@ -877,8 +878,8 @@ vec_dot_iq2_xs_q8_1(const void *__restrict__ vbq,
877878 grid[0 ] ^ signs[0 ], signs[0 ], std::minus<>());
878879 const int grid_h = dpct::vectorized_binary<sycl::uchar4>(
879880 grid[1 ] ^ signs[1 ], signs[1 ], std::minus<>());
880- sumi2 = dpct ::dp4a (grid_l, *((const int *)q8 + 0 ), sumi2);
881- sumi2 = dpct ::dp4a (grid_h, *((const int *)q8 + 1 ), sumi2);
881+ sumi2 = syclcompat ::dp4a (grid_l, *((const int *)q8 + 0 ), sumi2);
882+ sumi2 = syclcompat ::dp4a (grid_h, *((const int *)q8 + 1 ), sumi2);
882883 q8 += 8 ;
883884 }
884885 const float d = (float )bq2->d * bq8_1[ib32].ds [0 ] * 0 .25f ;
@@ -917,8 +918,8 @@ vec_dot_iq2_s_q8_1(const void *__restrict__ vbq,
917918 grid[0 ] ^ signs0, signs0, std::minus<>());
918919 const int grid_h = dpct::vectorized_binary<sycl::uchar4>(
919920 grid[1 ] ^ signs1, signs1, std::minus<>());
920- sumi1 = dpct ::dp4a (grid_l, *((const int *)q8 + 0 ), sumi1);
921- sumi1 = dpct ::dp4a (grid_h, *((const int *)q8 + 1 ), sumi1);
921+ sumi1 = syclcompat ::dp4a (grid_l, *((const int *)q8 + 0 ), sumi1);
922+ sumi1 = syclcompat ::dp4a (grid_h, *((const int *)q8 + 1 ), sumi1);
922923 q8 += 8 ;
923924 }
924925 int sumi2 = 0 ;
@@ -934,8 +935,8 @@ vec_dot_iq2_s_q8_1(const void *__restrict__ vbq,
934935 grid[0 ] ^ signs0, signs0, std::minus<>());
935936 const int grid_h = dpct::vectorized_binary<sycl::uchar4>(
936937 grid[1 ] ^ signs1, signs1, std::minus<>());
937- sumi2 = dpct ::dp4a (grid_l, *((const int *)q8 + 0 ), sumi2);
938- sumi2 = dpct ::dp4a (grid_h, *((const int *)q8 + 1 ), sumi2);
938+ sumi2 = syclcompat ::dp4a (grid_l, *((const int *)q8 + 0 ), sumi2);
939+ sumi2 = syclcompat ::dp4a (grid_h, *((const int *)q8 + 1 ), sumi2);
939940 q8 += 8 ;
940941 }
941942 const float d = (float )bq2->d * bq8_1[ib32].ds [0 ] * 0 .25f ;
@@ -968,8 +969,8 @@ vec_dot_iq3_xxs_q8_1(const void *__restrict__ vbq,
968969 grid1[0 ] ^ signs[0 ], signs[0 ], std::minus<>());
969970 const int grid_h = dpct::vectorized_binary<sycl::uchar4>(
970971 grid2[0 ] ^ signs[1 ], signs[1 ], std::minus<>());
971- sumi = dpct ::dp4a (grid_l, *((int *)q8 + 0 ), sumi);
972- sumi = dpct ::dp4a (grid_h, *((int *)q8 + 1 ), sumi);
972+ sumi = syclcompat ::dp4a (grid_l, *((const int *)q8 + 0 ), sumi);
973+ sumi = syclcompat ::dp4a (grid_h, *((const int *)q8 + 1 ), sumi);
973974 q8 += 8 ;
974975 aux32 >>= 7 ;
975976 }
@@ -1009,8 +1010,8 @@ vec_dot_iq3_s_q8_1(const void *__restrict__ vbq,
10091010 grid1[0 ] ^ signs0, signs0, std::minus<>());
10101011 const int grid_h = dpct::vectorized_binary<sycl::uchar4>(
10111012 grid2[0 ] ^ signs1, signs1, std::minus<>());
1012- sumi = dpct ::dp4a (grid_l, *((int *)q8 + 0 ), sumi);
1013- sumi = dpct ::dp4a (grid_h, *((int *)q8 + 1 ), sumi);
1013+ sumi = syclcompat ::dp4a (grid_l, *((const int *)q8 + 0 ), sumi);
1014+ sumi = syclcompat ::dp4a (grid_h, *((const int *)q8 + 1 ), sumi);
10141015 q8 += 8 ;
10151016 }
10161017 const float d =
@@ -1037,8 +1038,8 @@ vec_dot_iq1_s_q8_1(const void *__restrict__ vbq,
10371038 const int * grid = (const int *)(iq1s_grid_gpu + (bq1->qs [4 *ib32+l] | (((bq1->qh [ib32] >> 3 *l) & 7 ) << 8 )));
10381039 int grid0 = grid[0 ] & 0x0f0f0f0f ;
10391040 int grid1 = (grid[0 ] >> 4 ) & 0x0f0f0f0f ;
1040- sumi = dpct ::dp4a (q8[2 * l + 1 ], grid1,
1041- dpct ::dp4a (q8[2 * l + 0 ], grid0, sumi));
1041+ sumi = syclcompat ::dp4a (q8[2 * l + 1 ], grid1,
1042+ syclcompat ::dp4a (q8[2 * l + 0 ], grid0, sumi));
10421043 }
10431044
10441045 const float delta = bq1->qh [ib32] & 0x8000 ? -1 -IQ1S_DELTA : -1 +IQ1S_DELTA;
@@ -1066,11 +1067,11 @@ vec_dot_iq1_m_q8_1(const void *__restrict__ vbq,
10661067 const int * grid = (const int *)(iq1s_grid_gpu + (bq1->qs [4 *ib32+l] | (((bq1->qh [2 *ib32+l/2 ] >> 4 *(l%2 )) & 7 ) << 8 )));
10671068 int grid0 = grid[0 ] & 0x0f0f0f0f ;
10681069 int grid1 = (grid[0 ] >> 4 ) & 0x0f0f0f0f ;
1069- sumi[l / 2 ] = dpct ::dp4a (q8[2 * l + 1 ], grid1,
1070- dpct ::dp4a (q8[2 * l + 0 ], grid0, sumi[l / 2 ]));
1070+ sumi[l / 2 ] = syclcompat ::dp4a (q8[2 * l + 1 ], grid1,
1071+ syclcompat ::dp4a (q8[2 * l + 0 ], grid0, sumi[l / 2 ]));
10711072 const float delta = (bq1->qh [2 *ib32+l/2 ] >> 4 *(l%2 )) & 0x08 ? -1 -IQ1M_DELTA : -1 +IQ1M_DELTA;
1072- const int sumy = dpct ::dp4a (q8[2 * l + 1 ], 0x01010101 ,
1073- dpct ::dp4a (q8[2 * l + 0 ], 0x01010101 , 0 ));
1073+ const int sumy = syclcompat ::dp4a (q8[2 * l + 1 ], 0x01010101 ,
1074+ syclcompat ::dp4a (q8[2 * l + 0 ], 0x01010101 , 0 ));
10741075 sumf[l/2 ] += delta*sumy;
10751076 }
10761077
@@ -1101,8 +1102,8 @@ vec_dot_iq4_nl_q8_1(const void *__restrict__ vbq,
11011102 for (int l = 0 ; l < VDR_Q4_0_Q8_1_MMVQ; ++l) {
11021103 const uint32_t aux = q4[2 *l] | (q4[2 *l+1 ] << 16 );
11031104 get_int_from_table_16 (aux, values, v1, v2);
1104- sumi1 = dpct ::dp4a (v1, q8[l + 0 ], sumi1);
1105- sumi2 = dpct ::dp4a (v2, q8[l + 4 ], sumi2);
1105+ sumi1 = syclcompat ::dp4a (v1, q8[l + 0 ], sumi1);
1106+ sumi2 = syclcompat ::dp4a (v2, q8[l + 4 ], sumi2);
11061107 }
11071108
11081109 const float d = (float )bq->d * bq8_1->ds [0 ];
@@ -1128,8 +1129,8 @@ vec_dot_iq4_xs_q8_1(const void *__restrict__ vbq,
11281129 int sumi1 = 0 , sumi2 = 0 ;
11291130 for (int j = 0 ; j < 4 ; ++j) {
11301131 get_int_from_table_16 (q4[j], values, v1, v2);
1131- sumi1 = dpct ::dp4a (v1, q8[j + 0 ], sumi1);
1132- sumi2 = dpct ::dp4a (v2, q8[j + 4 ], sumi2);
1132+ sumi1 = syclcompat ::dp4a (v1, q8[j + 0 ], sumi1);
1133+ sumi2 = syclcompat ::dp4a (v2, q8[j + 4 ], sumi2);
11331134 }
11341135 return d * (sumi1 + sumi2);
11351136#else
0 commit comments