1- /* auto-generated on 2025-01-27 20:34:35 -0500. Do not edit! */
1+ /* auto-generated on 2025-02-14 16:11:36 -0500. Do not edit! */
22/* including simdjson.h: */
33/* begin file simdjson.h */
44#ifndef SIMDJSON_H
@@ -2437,7 +2437,7 @@ namespace std {
24372437#define SIMDJSON_SIMDJSON_VERSION_H
24382438
24392439/** The version of simdjson being used (major.minor.revision) */
2440- #define SIMDJSON_VERSION "3.12.0 "
2440+ #define SIMDJSON_VERSION "3.12.2 "
24412441
24422442namespace simdjson {
24432443enum {
@@ -2452,7 +2452,7 @@ enum {
24522452 /**
24532453 * The revision (major.minor.REVISION) of simdjson being used.
24542454 */
2455- SIMDJSON_VERSION_REVISION = 0
2455+ SIMDJSON_VERSION_REVISION = 2
24562456};
24572457} // namespace simdjson
24582458
@@ -17948,14 +17948,18 @@ namespace simd {
1794817948
1794917949 // Copies to 'output' all bytes corresponding to a 0 in the mask (interpreted as a bitset).
1795017950 // Passing a 0 value for mask would be equivalent to writing out every byte to output.
17951- // Only the first 32 - count_ones(mask) bytes of the result are significant but 32 bytes
17951+ // Only the first 64 - count_ones(mask) bytes of the result are significant but 64 bytes
1795217952 // get written.
1795317953 // Design consideration: it seems like a function with the
1795417954 // signature simd8<L> compress(uint64_t mask) would be
1795517955 // sensible, but the AVX ISA makes this kind of approach difficult.
1795617956 template<typename L>
1795717957 simdjson_inline void compress(uint64_t mask, L * output) const {
17958- _mm512_mask_compressstoreu_epi8 (output,~mask,*this);
17958+ // We deliberately avoid _mm512_mask_compressstoreu_epi8 for performance portability
17959+ // (AMD Zen4 has terrible performance with it, it is effectively broken)
17960+ // _mm512_mask_compressstoreu_epi8 (output,~mask,*this);
17961+ __m512i compressed = _mm512_maskz_compress_epi8(~mask, *this); // ~mask: keep the bytes whose mask bit is 0
17962+ _mm512_storeu_si512(output, compressed); // unmasked store of the whole vector; a masked store could limit the write to the significant bytes
1795917963 }
1796017964
1796117965 template<typename L>
@@ -65401,14 +65405,18 @@ namespace simd {
6540165405
6540265406 // Copies to 'output' all bytes corresponding to a 0 in the mask (interpreted as a bitset).
6540365407 // Passing a 0 value for mask would be equivalent to writing out every byte to output.
65404- // Only the first 32 - count_ones(mask) bytes of the result are significant but 32 bytes
65408+ // Only the first 64 - count_ones(mask) bytes of the result are significant but 64 bytes
6540565409 // get written.
6540665410 // Design consideration: it seems like a function with the
6540765411 // signature simd8<L> compress(uint64_t mask) would be
6540865412 // sensible, but the AVX ISA makes this kind of approach difficult.
6540965413 template<typename L>
6541065414 simdjson_inline void compress(uint64_t mask, L * output) const {
65411- _mm512_mask_compressstoreu_epi8 (output,~mask,*this);
65415+ // We deliberately avoid _mm512_mask_compressstoreu_epi8 for performance portability
65416+ // (AMD Zen4 has terrible performance with it, it is effectively broken)
65417+ // _mm512_mask_compressstoreu_epi8 (output,~mask,*this);
65418+ __m512i compressed = _mm512_maskz_compress_epi8(~mask, *this); // ~mask: keep the bytes whose mask bit is 0
65419+ _mm512_storeu_si512(output, compressed); // unmasked store of the whole vector; a masked store could limit the write to the significant bytes
6541265420 }
6541365421
6541465422 template<typename L>
0 commit comments