Skip to content

Commit 18bee53

Browse files
committed
Take #593 into account.
1 parent 098833b commit 18bee53

File tree

2 files changed

+37
-25
lines changed

2 files changed

+37
-25
lines changed

libc-top-half/musl/src/string/memchr.c

Lines changed: 22 additions & 10 deletions
Original file line number | Diff line number | Diff line change
@@ -15,24 +15,34 @@
1515
void *memchr(const void *src, int c, size_t n)
1616
{
1717
#if defined(__wasm_simd128__) && defined(__wasilibc_simd_string)
18+
// Skip Clang 19 and Clang 20 which have a bug (llvm/llvm-project#146574)
19+
// which results in an ICE when inline assembly is used with a vector result.
20+
#if __clang_major__ != 19 && __clang_major__ != 20
1821
// When n is zero, a function that locates a character finds no occurrence.
1922
// Otherwise, decrement n to ensure sub_overflow overflows
2023
// when n would go equal-to-or-below zero.
2124
if (!n--) {
2225
return NULL;
2326
}
2427

25-
// memchr must behave as if it reads characters sequentially
26-
// and stops as soon as a match is found.
27-
// Aligning ensures loads beyond the first match are safe.
28-
// Casting through uintptr_t makes this implementation-defined,
29-
// rather than undefined behavior.
28+
// Note that reading before/after the allocation of a pointer is UB in
29+
// C, so inline assembly is used to generate the exact machine
30+
// instruction we want with opaque semantics to the compiler to avoid
31+
// the UB.
3032
uintptr_t align = (uintptr_t)src % sizeof(v128_t);
31-
const v128_t *v = (v128_t *)((uintptr_t)src - align);
32-
const v128_t vc = wasm_i8x16_splat(c);
33+
uintptr_t addr = (uintptr_t)src - align;
34+
v128_t vc = wasm_i8x16_splat(c);
3335

3436
for (;;) {
35-
const v128_t cmp = wasm_i8x16_eq(*v, vc);
37+
v128_t v;
38+
__asm__ (
39+
"local.get %1\n"
40+
"v128.load 0\n"
41+
"local.set %0\n"
42+
: "=r"(v)
43+
: "r"(addr)
44+
: "memory");
45+
v128_t cmp = wasm_i8x16_eq(v, vc);
3646
// Bitmask is slow on AArch64, any_true is much faster.
3747
if (wasm_v128_any_true(cmp)) {
3848
// Clear the bits corresponding to align (little-endian)
@@ -48,16 +58,18 @@ void *memchr(const void *src, int c, size_t n)
4858
// That's a match, unless it is beyond the end of the object.
4959
// Recall that we decremented n, so less-than-or-equal-to is correct.
5060
size_t ctz = __builtin_ctz(mask);
51-
return ctz - align <= n ? (char *)v + ctz : NULL;
61+
return ctz - align <= n ? (char *)src + (addr + ctz - (uintptr_t)src)
62+
: NULL;
5263
}
5364
}
5465
// Decrement n; if it overflows we're done.
5566
if (__builtin_sub_overflow(n, sizeof(v128_t) - align, &n)) {
5667
return NULL;
5768
}
5869
align = 0;
59-
v++;
70+
addr += sizeof(v128_t);
6071
}
72+
#endif
6173
#endif
6274

6375
const unsigned char *s = src;

libc-top-half/musl/src/string/strlen.c

Lines changed: 15 additions & 15 deletions
Original file line number | Diff line number | Diff line change
@@ -14,28 +14,28 @@
1414
size_t strlen(const char *s)
1515
{
1616
#if defined(__wasm_simd128__) && defined(__wasilibc_simd_string)
17-
// Skip Clang 19 and Clang 20 which have a bug (llvm/llvm-project#146574) which
18-
// results in an ICE when inline assembly is used with a vector result.
17+
// Skip Clang 19 and Clang 20 which have a bug (llvm/llvm-project#146574)
18+
// which results in an ICE when inline assembly is used with a vector result.
1919
#if __clang_major__ != 19 && __clang_major__ != 20
20-
// Note that reading before/after the allocation of a pointer is UB in
21-
// C, so inline assembly is used to generate the exact machine
22-
// instruction we want with opaque semantics to the compiler to avoid
23-
// the UB.
20+
// Note that reading before/after the allocation of a pointer is UB in
21+
// C, so inline assembly is used to generate the exact machine
22+
// instruction we want with opaque semantics to the compiler to avoid
23+
// the UB.
2424
uintptr_t align = (uintptr_t)s % sizeof(v128_t);
25-
uintptr_t v = (uintptr_t)s - align;
25+
uintptr_t addr = (uintptr_t)s - align;
2626

2727
for (;;) {
28-
v128_t chunk;
28+
v128_t v;
2929
__asm__ (
3030
"local.get %1\n"
3131
"v128.load 0\n"
3232
"local.set %0\n"
33-
: "=r"(chunk)
34-
: "r"(v)
35-
: "memory");
33+
: "=r"(v)
34+
: "r"(addr)
35+
: "memory");
3636
// Bitmask is slow on AArch64, all_true is much faster.
37-
if (!wasm_i8x16_all_true(chunk)) {
38-
const v128_t cmp = wasm_i8x16_eq(chunk, (v128_t){});
37+
if (!wasm_i8x16_all_true(v)) {
38+
const v128_t cmp = wasm_i8x16_eq(v, (v128_t){});
3939
// Clear the bits corresponding to align (little-endian)
4040
// so we can count trailing zeros.
4141
int mask = wasm_i8x16_bitmask(cmp) >> align << align;
@@ -46,11 +46,11 @@ size_t strlen(const char *s)
4646
// it's as if we didn't find anything.
4747
if (mask) {
4848
// Find the offset of the first one bit (little-endian).
49-
return v - (uintptr_t)s + __builtin_ctz(mask);
49+
return addr - (uintptr_t)s + __builtin_ctz(mask);
5050
}
5151
}
5252
align = 0;
53-
v += sizeof(v128_t);
53+
addr += sizeof(v128_t);
5454
}
5555
#endif
5656
#endif

0 commit comments

Comments
 (0)