8 changes: 6 additions & 2 deletions src/util/simd/Local.mk
@@ -1,12 +1,16 @@
$(call add-hdrs,fd_sse.h fd_sse_vc.h fd_sse_vi.h fd_sse_vu.h fd_sse_vf.h fd_sse_vl.h fd_sse_vv.h fd_sse_vd.h fd_sse_vl.h)
$(call add-hdrs,fd_sse.h fd_sse_vc.h fd_sse_vi.h fd_sse_vu.h fd_sse_vf.h fd_sse_vl.h fd_sse_vv.h fd_sse_vd.h fd_sse_vl.h fd_sse_vb.h)
$(call make-unit-test,test_sse_4x32,test_sse_4x32 test_sse_common,fd_util)
$(call make-unit-test,test_sse_2x64,test_sse_2x64 test_sse_common,fd_util)
$(call make-unit-test,test_sse_16x8,test_sse_16x8 test_sse_common,fd_util)
$(call run-unit-test,test_sse_4x32,)
$(call run-unit-test,test_sse_2x64,)
$(call run-unit-test,test_sse_16x8,)

$(call add-hdrs,fd_avx.h fd_avx_wc.h fd_avx_wi.h fd_avx_wu.h fd_avx_wf.h fd_avx_wl.h fd_avx_wv.h fd_avx_wd.h fd_avx_wl.h)
$(call add-hdrs,fd_avx.h fd_avx_wc.h fd_avx_wi.h fd_avx_wu.h fd_avx_wf.h fd_avx_wl.h fd_avx_wv.h fd_avx_wd.h fd_avx_wl.h fd_avx_wb.h)
$(call make-unit-test,test_avx_8x32,test_avx_8x32 test_avx_common,fd_util)
$(call make-unit-test,test_avx_4x64,test_avx_4x64 test_avx_common,fd_util)
$(call make-unit-test,test_avx_32x8,test_avx_32x8 test_avx_common,fd_util)
$(call run-unit-test,test_avx_8x32,)
$(call run-unit-test,test_avx_4x64,)
$(call run-unit-test,test_avx_32x8,)

1 change: 1 addition & 0 deletions src/util/simd/fd_avx.h
@@ -62,6 +62,7 @@
#include "fd_avx_wd.h" /* Vector double support */
#include "fd_avx_wl.h" /* Vector long support */
#include "fd_avx_wv.h" /* Vector ulong support */
#include "fd_avx_wb.h" /* Vector uchar (byte) support */

#else
#error "Build target does not support AVX wrappers"
317 changes: 317 additions & 0 deletions src/util/simd/fd_avx_wb.h

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions src/util/simd/fd_sse.h
@@ -62,6 +62,7 @@
#include "fd_sse_vd.h" /* Vector double support */
#include "fd_sse_vl.h" /* Vector long support */
#include "fd_sse_vv.h" /* Vector ulong support */
#include "fd_sse_vb.h" /* Vector uchar (byte) support */

#else
#error "Build target does not support SSE wrappers"
274 changes: 274 additions & 0 deletions src/util/simd/fd_sse_vb.h
@@ -0,0 +1,274 @@
#ifndef HEADER_fd_src_util_simd_fd_sse_h
#error "Do not include this directly; use fd_sse.h"
#endif

/* Vector byte API *****************************************************/

/* A vb_t is a vector where each 8-bit wide lane holds an unsigned
char (a "uchar").

These mirror the other APIs as much as possible. Macros are
preferred over static inlines when it is possible to do it robustly
to reduce the risk of the compiler mucking it up. */

#define vb_t __m128i

/* Constructors */

/* Given the uchar values, return ... */

#define vb(b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, b10,b11,b12,b13,b14,b15 ) /* [ b0 b1 b2 b3 b4 ... b15 ] */ \
_mm_setr_epi8( (char)( b0), (char)( b1), (char)( b2), (char)( b3), (char)( b4), (char)( b5), (char)( b6), (char)( b7), \
(char)( b8), (char)( b9), (char)(b10), (char)(b11), (char)(b12), (char)(b13), (char)(b14), (char)(b15) )

#define vb_bcast(b0) _mm_set1_epi8( (char)(b0) ) /* [ b0 b0 b0 b0 ... b0 ] */

static inline vb_t /* [ b0 b1 b0 b1 b0 b1 ... b0 b1 ] */
vb_bcast_pair( uchar b0, uchar b1 ) {
return _mm_setr_epi8( (char)(b0), (char)(b1), (char)(b0), (char)(b1), (char)(b0), (char)(b1), (char)(b0), (char)(b1),
(char)(b0), (char)(b1), (char)(b0), (char)(b1), (char)(b0), (char)(b1), (char)(b0), (char)(b1) );
}

static inline vb_t /* [ b0 b0 b0 ... b0 b1 b1 b1 ... b1 ] */
vb_bcast_lohi( uchar b0, uchar b1 ) {
return _mm_setr_epi8( (char)(b0), (char)(b0), (char)(b0), (char)(b0), (char)(b0), (char)(b0), (char)(b0), (char)(b0),
(char)(b1), (char)(b1), (char)(b1), (char)(b1), (char)(b1), (char)(b1), (char)(b1), (char)(b1) );
}

static inline vb_t /* [ b0 b1 b2 b3 b0 b1 b2 b3 ... b0 b1 b2 b3 ] */
vb_bcast_quad( uchar b0, uchar b1, uchar b2, uchar b3 ) {
return _mm_setr_epi8( (char)(b0), (char)(b1), (char)(b2), (char)(b3), (char)(b0), (char)(b1), (char)(b2), (char)(b3),
(char)(b0), (char)(b1), (char)(b2), (char)(b3), (char)(b0), (char)(b1), (char)(b2), (char)(b3) );
}

static inline vb_t /* [ b0 b0 b1 b1 ... b7 b7 ] */
vb_bcast_wide( uchar b0, uchar b1, uchar b2, uchar b3, uchar b4, uchar b5, uchar b6, uchar b7 ) {
return _mm_setr_epi8( (char)( b0), (char)( b0), (char)( b1), (char)( b1), (char)( b2), (char)( b2), (char)( b3), (char)( b3),
(char)( b4), (char)( b4), (char)( b5), (char)( b5), (char)( b6), (char)( b6), (char)( b7), (char)( b7) );
}
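
/* For example (a sketch): vb_bcast_pair( 1, 2 ) is
   [ 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 ] and vb_bcast_lohi( 0, 9 ) is
   [ 0 0 0 0 0 0 0 0 9 9 9 9 9 9 9 9 ]. */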

/* vb_permute returns [ x[i0] x[i1] ... x[i15] ]; each index should be
in 0:15.  Other useful special cases are provided below (with a usage
sketch after them).  Given [ b0 b1 b2 b3 b4 ... b15 ], return ... */
#define vb_permute(x,i0,i1,i2,i3,i4,i5,i6,i7,i8,i9,i10,i11,i12,i13,i14,i15) \
_mm_shuffle_epi8( (x), vb( (i0), (i1), (i2), (i3), (i4), (i5), (i6), (i7), (i8), (i9), (i10), (i11), (i12), (i13), (i14), (i15) ) )

#define vb_bcast_even(x) /* [ b0 b0 b2 b2 b4 b4 .. b12 b12 b14 b14 ] */ \
_mm_shuffle_epi8( (x), vb_bcast_wide( 0, 2, 4, 6, 8, 10, 12, 14 ) )

#define vb_bcast_odd(x) /* [ b1 b1 b3 b3 b5 b5 .. b13 b13 b15 b15 ] */ \
_mm_shuffle_epi8( (x), vb_bcast_wide( 1, 3, 5, 7, 9, 11, 13, 15 ) )

#define vb_exch_adj(x) /* [ b1 b0 b3 b2 ... b15 b14 ] */ \
_mm_shuffle_epi8( (x), vb( 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14 ) )

#define vb_exch_adj_pair(x) /* [ b2 b3 b0 b1 .. b14 b15 b12 b13 ] */ \
_mm_shuffle_epi8( (x), vb( 2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13 ) )

#define vb_exch_adj_quad(x) /* [ b4 b5 b6 b7 b0 b1 b2 b3 .. b8 b9 b10 b11 ] */ \
_mm_shuffle_epi8( (x), vb( 4, 5, 6, 7, 0, 1, 2, 3, 12, 13, 14, 15, 8, 9, 10, 11 ) )
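
/* For example, a sketch of a full byte reversal via vb_permute:

     vb_t rev = vb_permute( x, 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0 );

   gives rev = [ b15 b14 ... b1 b0 ]. */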

/* Predefined constants */

#define vb_zero() _mm_setzero_si128() /* Return [ 0 0 0 0 0 ... 0 0 ] */
#define vb_one() _mm_set1_epi8( 1 ) /* Return [ 1 1 1 1 1 ... 1 1 ] */

/* Memory operations */

/* vb_ld returns the 16 uchars at the 16-byte aligned / 16-byte sized
location p as a vector uchar.  vb_ldu is the same but p does not have
to be aligned.  vb_st writes the vector uchar to the 16-byte aligned /
16-byte sized location p as 16 uchars.  vb_stu is the same but p does
not have to be aligned.  In all these, lane l will be at p[l].

Note: gcc knows a __m128i may alias. */

/* FIXME: USE ATTRIBUTES ON P PASSED TO THESE? */

static inline vb_t vb_ld( uchar const * p ) { return _mm_load_si128( (__m128i const *)p ); }
static inline vb_t vb_ldu( uchar const * p ) { return _mm_loadu_si128( (__m128i const *)p ); }
static inline void vb_st( uchar * p, vb_t i ) { _mm_store_si128( (__m128i *)p, i ); }
static inline void vb_stu( uchar * p, vb_t i ) { _mm_storeu_si128( (__m128i *)p, i ); }

/* Sadly, no maskload_epi8, so we can't provide a vb_ldif or vb_stif */
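
/* For example (a sketch; buf is illustrative):

     uchar buf[16] __attribute__((aligned(16)));
     vb_t v = vb_ld( buf );                // aligned load of buf[0:15]
     vb_st( buf, vb_or( v, vb_one() ) );   // set the low bit of every byte

   Use vb_ldu / vb_stu when the alignment of buf is not known. */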


/* Element operations */

/* vb_extract extracts the uchar in lane imm from the vector uchar as a
uchar.  vb_insert returns the vector uchar formed by replacing the
value in lane imm of a with the provided uchar.  imm should be a
compile time constant in 0:15.  vb_extract_variable and
vb_insert_variable are slower but the lane n does not have to be known
at compile time (should still be in 0:15).

Note: C99 TC3 allows type punning through a union. */

#define vb_extract(a,imm) ((uchar)_mm_extract_epi8( (a), (imm) ))
#define vb_insert(a,imm,v) _mm_insert_epi8( (a), (v), (imm) )

static inline uchar
vb_extract_variable( vb_t a, int n ) {
union { __m128i m[1]; uchar i[16]; } t[1];
_mm_store_si128( t->m, a );
return t->i[n];
}

static inline vb_t
vb_insert_variable( vb_t a, int n, uchar v ) {
union { __m128i m[1]; uchar i[16]; } t[1];
_mm_store_si128( t->m, a );
t->i[n] = v;
return _mm_load_si128( t->m );
}
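
/* For example (a sketch):

     vb_t x = vb_zero();
     x = vb_insert( x, 3, (uchar)7 );   // [ 0 0 0 7 0 ... 0 ]
     uchar u = vb_extract( x, 3 );      // u==7

   where the lane index 3 is a compile time constant, as required. */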

/* Given [a0 a1 a2 a3 a4 ... a14 a15] and/or [b0 b1 b2 b3 b4 ... b14 b15],
return ... */

/* Arithmetic operations */

#define vb_neg(a) _mm_sub_epi8( _mm_setzero_si128(), (a) ) /* [ -a0 -a1 ... -a15 ] (twos complement handling) */
#define vb_abs(a) (a) /* [ |a0| |a1| ... |a15| ] (unsigned type, so identity) */

#define vb_min(a,b) _mm_min_epu8( (a), (b) ) /* [ min(a0,b0) min(a1,b1) ... min(a15,b15) ] */
#define vb_max(a,b) _mm_max_epu8( (a), (b) ) /* [ max(a0,b0) max(a1,b1) ... max(a15,b15) ] */
#define vb_add(a,b) _mm_add_epi8( (a), (b) ) /* [ a0 +b0 a1 +b1 ... a15 +b15 ] */
#define vb_sub(a,b) _mm_sub_epi8( (a), (b) ) /* [ a0 -b0 a1 -b1 ... a15 -b15 ] */
/* No vb_mul because there is no instruction for multiplying uchars.
One can be built from two invocations of _mm_mullo_epi16 (see the
sketch below), but it won't be particularly fast; depending on the
multiplier, shift-and-add sequences might well be faster. */
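
/* A minimal sketch of such a multiply (a hypothetical helper, not part
   of the API): each 16-bit multiply leaves the byte product mod 256 in
   the low byte of its 16-bit lane, once for the even lanes and once for
   the odd lanes, and the results are then recombined. */

static inline vb_t
vb_mul_sketch( vb_t a, vb_t b ) {
  __m128i even = _mm_mullo_epi16( a, b );                     /* even lane products in the low byte of each 16-bit lane */
  __m128i odd  = _mm_mullo_epi16( _mm_srli_epi16( a, 8 ),
                                  _mm_srli_epi16( b, 8 ) );   /* odd lane products likewise */
  return _mm_or_si128( _mm_and_si128( even, _mm_set1_epi16( 0x00FF ) ),
                       _mm_slli_epi16( odd, 8 ) );            /* keep the low bytes and interleave */
}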

/* Binary operations */

/* Note: vb_shl/vb_shru is a left/unsigned right shift by imm bits; imm
must be a compile time constant in 0:7.  The variable variants are
slower but do not require the shift amount to be known at compile time
(should still be in 0:7). */

#define vb_not(a) _mm_xor_si128( _mm_set1_epi32( -1 ), (a) ) /* [ ~a0 ~a1 ... ~a15 ] */


#define vb_shl(a,imm)  vb_and( _mm_slli_epi16( (a), (imm) ), vb_bcast( (uchar)(0xFFUL << (imm)) ) ) /* [ a0<<imm a1<<imm ... a15<<imm ] */
#define vb_shru(a,imm) vb_and( _mm_srli_epi16( (a), (imm) ), vb_bcast( (uchar)(0xFFUL >> (imm)) ) ) /* [ a0>>imm a1>>imm ... a15>>imm ] (treat a as unsigned) */

/* There is no 8-bit shift instruction, so the above shift 16-bit lanes
and mask off the bits that bleed across byte boundaries.  A signed
right shift (vb_shr) would require sign extending to 16-bit lanes,
shifting and repacking, so it is omitted. */

#define vb_shl_variable(a,n) vb_and( _mm_sll_epi16( (a), _mm_insert_epi64( _mm_setzero_si128(), (n), 0 ) ), \
vb_bcast( (uchar)(0xFFUL << (n)) ) )
#define vb_shru_variable(a,n) vb_and( _mm_srl_epi16( (a), _mm_insert_epi64( _mm_setzero_si128(), (n), 0 ) ), \
vb_bcast( (uchar)(0xFFUL >> (n)) ) )
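
/* For example (a sketch): vb_shru( vb_bcast( 0xA5 ), 4 ) is
   vb_bcast( 0x0A ). */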


#define vb_and(a,b) _mm_and_si128( (a), (b) ) /* [ a0 &b0 a1& b1 ... a15& b15 ] */
#define vb_andnot(a,b) _mm_andnot_si128( (a), (b) ) /* [ (~a0)&b0 (~a1)&b1 ... (~a15)&b15 ] */
#define vb_or(a,b) _mm_or_si128( (a), (b) ) /* [ a0 |b0 a1 |b1 ... a15 |b15 ] */
#define vb_xor(a,b) _mm_xor_si128( (a), (b) ) /* [ a0 ^b0 a1 ^b1 ... a15 ^b15 ] */

/* Logical operations */

/* Note: these don't return proper vc_t values (lanes are 8-bit, not
32-bit), but they are usable with vb_czero, vb_notczero, vb_if,
vb_any_fast and vb_all_fast. */

#define vb_lnot(a) _mm_cmpeq_epi8( (a), _mm_setzero_si128() ) /* [ !a0 !a1 ... !a15 ] */
#define vb_lnotnot(a) /* [ !!a0 !!a1 ... !!a15 ] */ \
_mm_xor_si128( _mm_set1_epi32( -1 ), vb_lnot( (a) ) )


#define vb_eq(a,b) _mm_cmpeq_epi8( (a), (b) ) /* [ a0==b0 a1==b1 ... a15==b15 ] */
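
/* Note: there is no unsigned byte compare instruction, so vb_gt below
   biases both operands by 0x80 and uses the signed compare; subtracting
   0x80 maps unsigned lane order onto signed lane order. */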
#define vb_gt(a,b) /* [ a0> b0 a1> b1 ... a15> b15 ] */\
_mm_cmpgt_epi8( _mm_sub_epi8( (a), _mm_set1_epi8( (char)(1U<<7) ) ), \
_mm_sub_epi8( (b), _mm_set1_epi8( (char)(1U<<7) ) ) )
#define vb_lt(a,b) vb_gt( (b), (a) ) /* [ a0< b0 a1< b1 ... a15< b15 ] */
#define vb_ne(a,b) _mm_xor_si128( _mm_set1_epi32( -1 ), _mm_cmpeq_epi8( (a), (b) ) ) /* [ a0!=b0 a1!=b1 ... a15!=b15 ] */
#define vb_ge(a,b) _mm_xor_si128( _mm_set1_epi32( -1 ), vb_gt( (b), (a) ) ) /* [ a0>=b0 a1>=b1 ... a15>=b15 ] */
#define vb_le(a,b) _mm_xor_si128( _mm_set1_epi32( -1 ), vb_gt( (a), (b) ) ) /* [ a0<=b0 a1<=b1 ... a15<=b15 ] */

/* Conditional operations */

#define vb_czero(c,f) _mm_andnot_si128( (c), (f) ) /* [ c0? 0:f0 c1? 0:f1 ... c15? 0:f15 ] */
#define vb_notczero(c,f) _mm_and_si128( (c), (f) ) /* [ c0?f0: 0 c1?f1: 0 ... c15?f15: 0 ] */

#define vb_if(c,t,f) _mm_blendv_epi8( (f), (t), (c) ) /* [ c0?t0:f0 c1?t1:f1 ... c15?t15:f15 ] */
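
/* For example (a sketch), clamping every lane to at most 10 with a
   compare and a select (equivalent to vb_min( x, vb_bcast( 10 ) )):

     vb_t c = vb_gt( x, vb_bcast( 10 ) );   // all ones in lanes where x>10
     x      = vb_if( c, vb_bcast( 10 ), x );

*/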

/* Conversion operations */

/* Summarizing:

vb_to_vc(a, 0) returns [ !!a0 !!a1 !!a2 !!a3 ]
vb_to_vc(a, 1) returns [ !!a4 !!a5 !!a6 !!a7 ]
vb_to_vc(a, 2) returns [ !!a8 !!a9 !!a10 !!a11 ]
vb_to_vc(a, 3) returns [ !!a12 !!a13 !!a14 !!a15 ]

vb_to_vf(a, 0) returns [ (float)a0 (float)a1 (float)a2 (float)a3 ]
vb_to_vf(a, 1) returns [ (float)a4 (float)a5 (float)a6 (float)a7 ]
vb_to_vf(a, 2) returns [ (float)a8 (float)a9 (float)a10 (float)a11 ]
vb_to_vf(a, 3) returns [ (float)a12 (float)a13 (float)a14 (float)a15 ]

vb_to_vi(a, 0) returns [ (int)a0 (int)a1 (int)a2 (int)a3 ]
vb_to_vi(a, 1) returns [ (int)a4 (int)a5 (int)a6 (int)a7 ]
vb_to_vi(a, 2) returns [ (int)a8 (int)a9 (int)a10 (int)a11 ]
vb_to_vi(a, 3) returns [ (int)a12 (int)a13 (int)a14 (int)a15 ]

vb_to_vu(a, imm_hi) is the same as vb_to_vi(a, imm_hi) but returns a
vector uint.

vb_to_vd(a,0) returns [ (double)a0 (double)a1 ]
vb_to_vd(a,1) returns [ (double)a2 (double)a3 ]
...
vb_to_vd(a,7) returns [ (double)a14 (double)a15 ]

vb_to_vl(a,0) returns [ (long)a0 (long)a1 ]
vb_to_vl(a,1) returns [ (long)a2 (long)a3 ]
...
vb_to_vl(a,7) returns [ (long)a14 (long)a15 ]

vb_to_vv(a, imm_hi) is the same as vb_to_vl(a, imm_hi) but returns a
vector ulong.

where imm_hi should be a compile time constant. */


#define vb_to_vc(a, imm_hi) _mm_xor_si128( _mm_set1_epi32( -1 ), _mm_cmpeq_epi32( _mm_cvtepu8_epi32( _mm_bsrli_si128( (a), 4*(imm_hi) ) ) , _mm_setzero_si128() ) )
#define vb_to_vi(a, imm_hi) _mm_cvtepu8_epi32( _mm_bsrli_si128( (a), 4*(imm_hi) ) )
#define vb_to_vu(a, imm_hi) _mm_cvtepu8_epi32( _mm_bsrli_si128( (a), 4*(imm_hi) ) )
#define vb_to_vf(a, imm_hi) _mm_cvtepi32_ps( _mm_cvtepu8_epi32( _mm_bsrli_si128( (a), 4*(imm_hi) ) ) )
#define vb_to_vd(a, imm_hi) _mm_cvtepi32_pd( _mm_cvtepu8_epi32( _mm_bsrli_si128( (a), 2*(imm_hi) ) ) )
#define vb_to_vl(a, imm_hi) _mm_cvtepu8_epi64( _mm_bsrli_si128( (a), 2*(imm_hi) ) )
#define vb_to_vv(a, imm_hi) _mm_cvtepu8_epi64( _mm_bsrli_si128( (a), 2*(imm_hi) ) )

#define vb_to_vc_raw(a) (a)
#define vb_to_vf_raw(a) _mm_castsi128_ps( (a) )
#define vb_to_vd_raw(a) _mm_castsi128_pd( (a) )
#define vb_to_vl_raw(a) (a)
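
/* For example (a sketch): if a is vb( 0, 1, 2, ..., 15 ) then
   vb_to_vi( a, 1 ) is [ 4 5 6 7 ].  The _raw variants reinterpret the
   bits in place with no conversion. */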

/* Reduction operations */

static inline vb_t
vb_sum_all( vb_t x ) { /* Returns vb_bcast( sum( x ) ), reduced mod 256 since lanes are uchars */
  x = _mm_sad_epu8( x, _mm_setzero_si128() ); /* sum( x[0:7] ) in bits 0:63, sum( x[8:15] ) in bits 64:127 */
  return _mm_add_epi8( _mm_shuffle_epi8( x, vb_bcast( 0 ) ), _mm_shuffle_epi8( x, vb_bcast( 8 ) ) ); /* broadcast the low byte of each half sum, then add */
}

static inline vb_t
vb_min_all( vb_t x ) { /* Returns vb_bcast( min( x ) ) */
x = _mm_min_epu8( x, _mm_shuffle_epi8( x, vb( 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7 ) ) ); /* x0,8 x1,9 .. x7,15 (repeats 1 more time) */
x = _mm_min_epu8( x, _mm_shuffle_epi8( x, vb( 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3 ) ) ); /* x0,4,8,12 .. x3,7,11,15 (repeats 3 more times)*/
x = _mm_min_epu8( x, _mm_shuffle_epi8( x, vb_bcast_quad( 2, 3, 0, 1 ) ) ); /* x_even x_odd (repeats 7 more times) */
x = _mm_min_epu8( x, _mm_shuffle_epi8( x, vb_bcast_pair( 1, 0 ) ) ); /* x_all (repeats 15 more times) */
return x;
}

static inline vb_t
vb_max_all( vb_t x ) { /* Returns vb_bcast( max( x ) ) */
x = _mm_max_epu8( x, _mm_shuffle_epi8( x, vb( 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7 ) ) ); /* x0,8 x1,9 .. x7,15 (repeats 1 more time) */
x = _mm_max_epu8( x, _mm_shuffle_epi8( x, vb( 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3 ) ) ); /* x0,4,8,12 .. x3,7,11,15 (repeats 3 more times)*/
x = _mm_max_epu8( x, _mm_shuffle_epi8( x, vb_bcast_quad( 2, 3, 0, 1 ) ) ); /* x_even x_odd (repeats 7 more times) */
x = _mm_max_epu8( x, _mm_shuffle_epi8( x, vb_bcast_pair( 1, 0 ) ) ); /* x_all (repeats 15 more times) */
return x;
}
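
/* For example (a sketch): vb_sum_all( vb_bcast( 255 ) ) is
   vb_bcast( 240 ), as 16*255==4080 and 4080 mod 256==240. */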

/* Misc operations */

/* vb_{any,all} return 1 if any/all of the elements are non-zero and 0
otherwise.  The _fast variants assume each lane is either 0 or all
ones, as returned by the vb comparison functions (e.g. vb_gt). */
#define vb_any_fast( x ) ( 0 != _mm_movemask_epi8( x ) )
#define vb_any( x ) vb_any_fast( vb_ne( (x), vb_zero( ) ) )
#define vb_all_fast( x ) ( 0xFFFF == _mm_movemask_epi8( x ) )
#define vb_all( x ) vb_all_fast( vb_ne( (x), vb_zero( ) ) )
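
/* For example (a sketch):

     vb_any( vb_zero() )            // 0
     vb_all( vb_one()  )            // 1
     vb_all_fast( vb_eq( x, x ) )   // 1 for any x
*/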