Skip to content

Commit 79fb927

Browse files
Merge pull request #1398 from SixLabors/js/SimdUtils
Add SimdUtils.HwIntrinsics
2 parents 181e957 + 3383351 commit 79fb927

File tree

7 files changed

+374
-119
lines changed

7 files changed

+374
-119
lines changed

src/ImageSharp/Common/Helpers/SimdUtils.Avx2Intrinsics.cs

Lines changed: 0 additions & 103 deletions
This file was deleted.
Lines changed: 276 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,276 @@
1+
// Copyright (c) Six Labors.
2+
// Licensed under the Apache License, Version 2.0.
3+
4+
#if SUPPORTS_RUNTIME_INTRINSICS
5+
using System;
6+
using System.Runtime.CompilerServices;
7+
using System.Runtime.InteropServices;
8+
using System.Runtime.Intrinsics;
9+
using System.Runtime.Intrinsics.X86;
10+
11+
namespace SixLabors.ImageSharp
12+
{
13+
internal static partial class SimdUtils
14+
{
15+
public static class HwIntrinsics
16+
{
17+
// Permutation control data for Avx2.PermuteVar8x32. Interpreted as eight little-endian
// 32-bit integers the bytes spell { 0, 4, 1, 5, 2, 6, 3, 7 }; NormalizedFloatToByteSaturate
// reinterprets the span as a Vector256<int> and uses it to reorder the 32-bit elements
// of its packed byte vector.
// NOTE(review): the `ReadOnlySpan<byte> => new byte[] { constants }` shape is presumably kept
// so the compiler can serve the bytes from static data instead of allocating an array on
// each access - confirm before restructuring this member.
private static ReadOnlySpan<byte> PermuteMaskDeinterleave8x32 => new byte[] { 0, 0, 0, 0, 4, 0, 0, 0, 1, 0, 0, 0, 5, 0, 0, 0, 2, 0, 0, 0, 6, 0, 0, 0, 3, 0, 0, 0, 7, 0, 0, 0 };
18+
19+
/// <summary>
/// Runs <see cref="ByteToNormalizedFloat"/> over the longest leading part of the input that
/// consists of whole vector batches, then advances both spans past the converted part so that
/// only the unconverted remainder is left in them.
/// </summary>
/// <param name="source">The source bytes. On return it is sliced down to the elements that were not converted.</param>
/// <param name="dest">The destination floats. On return it is sliced down by the same amount as <paramref name="source"/>.</param>
/// <remarks>
/// When neither AVX2 nor SSE2 is available, or when the input is shorter than one batch,
/// both spans are left untouched.
/// </remarks>
[MethodImpl(InliningOptions.ShortMethod)]
internal static void ByteToNormalizedFloatReduce(
    ref ReadOnlySpan<byte> source,
    ref Span<float> dest)
{
    DebugGuard.IsTrue(source.Length == dest.Length, nameof(source), "Input spans must be of same length!");

    if (!Avx2.IsSupported && !Sse2.IsSupported)
    {
        return;
    }

    // One batch is a full byte vector: 256-bit when AVX2 is available, 128-bit otherwise.
    int batchSize = Avx2.IsSupported ? Vector256<byte>.Count : Vector128<byte>.Count;

    int tail = ImageMaths.ModuloP2(source.Length, batchSize);
    int convertible = source.Length - tail;

    if (convertible <= 0)
    {
        return;
    }

    ByteToNormalizedFloat(source.Slice(0, convertible), dest.Slice(0, convertible));

    // Hand the leftover elements back to the caller.
    source = source.Slice(convertible);
    dest = dest.Slice(convertible);
}
52+
53+
/// <summary>
54+
/// Implementation <see cref="SimdUtils.ByteToNormalizedFloat"/>, which is faster on new RyuJIT runtime.
55+
/// </summary>
56+
/// <remarks>
57+
/// Implementation is based on MagicScaler code:
58+
/// https://github.com/saucecontrol/PhotoSauce/blob/b5811908041200488aa18fdfd17df5fc457415dc/src/MagicScaler/Magic/Processors/ConvertersFloat.cs#L80-L182
59+
/// </remarks>
60+
internal static unsafe void ByteToNormalizedFloat(
61+
ReadOnlySpan<byte> source,
62+
Span<float> dest)
63+
{
64+
if (Avx2.IsSupported)
65+
{
66+
VerifySpanInput(source, dest, Vector256<byte>.Count);
67+
68+
int n = dest.Length / Vector256<byte>.Count;
69+
70+
byte* sourceBase = (byte*)Unsafe.AsPointer(ref MemoryMarshal.GetReference(source));
71+
72+
ref Vector256<float> destBase =
73+
ref Unsafe.As<float, Vector256<float>>(ref MemoryMarshal.GetReference(dest));
74+
75+
var scale = Vector256.Create(1 / (float)byte.MaxValue);
76+
77+
for (int i = 0; i < n; i++)
78+
{
79+
int si = Vector256<byte>.Count * i;
80+
Vector256<int> i0 = Avx2.ConvertToVector256Int32(sourceBase + si);
81+
Vector256<int> i1 = Avx2.ConvertToVector256Int32(sourceBase + si + Vector256<int>.Count);
82+
Vector256<int> i2 = Avx2.ConvertToVector256Int32(sourceBase + si + (Vector256<int>.Count * 2));
83+
Vector256<int> i3 = Avx2.ConvertToVector256Int32(sourceBase + si + (Vector256<int>.Count * 3));
84+
85+
Vector256<float> f0 = Avx.Multiply(scale, Avx.ConvertToVector256Single(i0));
86+
Vector256<float> f1 = Avx.Multiply(scale, Avx.ConvertToVector256Single(i1));
87+
Vector256<float> f2 = Avx.Multiply(scale, Avx.ConvertToVector256Single(i2));
88+
Vector256<float> f3 = Avx.Multiply(scale, Avx.ConvertToVector256Single(i3));
89+
90+
ref Vector256<float> d = ref Unsafe.Add(ref destBase, i * 4);
91+
92+
d = f0;
93+
Unsafe.Add(ref d, 1) = f1;
94+
Unsafe.Add(ref d, 2) = f2;
95+
Unsafe.Add(ref d, 3) = f3;
96+
}
97+
}
98+
else
99+
{
100+
// Sse
101+
VerifySpanInput(source, dest, Vector128<byte>.Count);
102+
103+
int n = dest.Length / Vector128<byte>.Count;
104+
105+
byte* sourceBase = (byte*)Unsafe.AsPointer(ref MemoryMarshal.GetReference(source));
106+
107+
ref Vector128<float> destBase =
108+
ref Unsafe.As<float, Vector128<float>>(ref MemoryMarshal.GetReference(dest));
109+
110+
var scale = Vector128.Create(1 / (float)byte.MaxValue);
111+
Vector128<byte> zero = Vector128<byte>.Zero;
112+
113+
for (int i = 0; i < n; i++)
114+
{
115+
int si = Vector128<byte>.Count * i;
116+
117+
Vector128<int> i0, i1, i2, i3;
118+
if (Sse41.IsSupported)
119+
{
120+
i0 = Sse41.ConvertToVector128Int32(sourceBase + si);
121+
i1 = Sse41.ConvertToVector128Int32(sourceBase + si + Vector128<int>.Count);
122+
i2 = Sse41.ConvertToVector128Int32(sourceBase + si + (Vector128<int>.Count * 2));
123+
i3 = Sse41.ConvertToVector128Int32(sourceBase + si + (Vector128<int>.Count * 3));
124+
}
125+
else
126+
{
127+
Vector128<byte> b = Sse2.LoadVector128(sourceBase + si);
128+
Vector128<short> s0 = Sse2.UnpackLow(b, zero).AsInt16();
129+
Vector128<short> s1 = Sse2.UnpackHigh(b, zero).AsInt16();
130+
131+
i0 = Sse2.UnpackLow(s0, zero.AsInt16()).AsInt32();
132+
i1 = Sse2.UnpackHigh(s0, zero.AsInt16()).AsInt32();
133+
i2 = Sse2.UnpackLow(s1, zero.AsInt16()).AsInt32();
134+
i3 = Sse2.UnpackHigh(s1, zero.AsInt16()).AsInt32();
135+
}
136+
137+
Vector128<float> f0 = Sse.Multiply(scale, Sse2.ConvertToVector128Single(i0));
138+
Vector128<float> f1 = Sse.Multiply(scale, Sse2.ConvertToVector128Single(i1));
139+
Vector128<float> f2 = Sse.Multiply(scale, Sse2.ConvertToVector128Single(i2));
140+
Vector128<float> f3 = Sse.Multiply(scale, Sse2.ConvertToVector128Single(i3));
141+
142+
ref Vector128<float> d = ref Unsafe.Add(ref destBase, i * 4);
143+
144+
d = f0;
145+
Unsafe.Add(ref d, 1) = f1;
146+
Unsafe.Add(ref d, 2) = f2;
147+
Unsafe.Add(ref d, 3) = f3;
148+
}
149+
}
150+
}
151+
152+
/// <summary>
153+
/// <see cref="NormalizedFloatToByteSaturate"/> as many elements as possible, slicing them down (keeping the remainder).
154+
/// </summary>
155+
[MethodImpl(InliningOptions.ShortMethod)]
156+
internal static void NormalizedFloatToByteSaturateReduce(
157+
ref ReadOnlySpan<float> source,
158+
ref Span<byte> dest)
159+
{
160+
DebugGuard.IsTrue(source.Length == dest.Length, nameof(source), "Input spans must be of same length!");
161+
162+
if (Avx2.IsSupported || Sse2.IsSupported)
163+
{
164+
int remainder;
165+
if (Avx2.IsSupported)
166+
{
167+
remainder = ImageMaths.ModuloP2(source.Length, Vector256<byte>.Count);
168+
}
169+
else
170+
{
171+
remainder = ImageMaths.ModuloP2(source.Length, Vector128<byte>.Count);
172+
}
173+
174+
int adjustedCount = source.Length - remainder;
175+
176+
if (adjustedCount > 0)
177+
{
178+
NormalizedFloatToByteSaturate(
179+
source.Slice(0, adjustedCount),
180+
dest.Slice(0, adjustedCount));
181+
182+
source = source.Slice(adjustedCount);
183+
dest = dest.Slice(adjustedCount);
184+
}
185+
}
186+
}
187+
188+
/// <summary>
189+
/// Implementation of <see cref="SimdUtils.NormalizedFloatToByteSaturate"/>, which is faster on new .NET runtime.
190+
/// </summary>
191+
/// <remarks>
192+
/// Implementation is based on MagicScaler code:
193+
/// https://github.com/saucecontrol/PhotoSauce/blob/b5811908041200488aa18fdfd17df5fc457415dc/src/MagicScaler/Magic/Processors/ConvertersFloat.cs#L541-L622
194+
/// </remarks>
195+
internal static void NormalizedFloatToByteSaturate(
196+
ReadOnlySpan<float> source,
197+
Span<byte> dest)
198+
{
199+
if (Avx2.IsSupported)
200+
{
201+
VerifySpanInput(source, dest, Vector256<byte>.Count);
202+
203+
int n = dest.Length / Vector256<byte>.Count;
204+
205+
ref Vector256<float> sourceBase =
206+
ref Unsafe.As<float, Vector256<float>>(ref MemoryMarshal.GetReference(source));
207+
208+
ref Vector256<byte> destBase =
209+
ref Unsafe.As<byte, Vector256<byte>>(ref MemoryMarshal.GetReference(dest));
210+
211+
var scale = Vector256.Create((float)byte.MaxValue);
212+
ref byte maskBase = ref MemoryMarshal.GetReference(PermuteMaskDeinterleave8x32);
213+
Vector256<int> mask = Unsafe.As<byte, Vector256<int>>(ref maskBase);
214+
215+
for (int i = 0; i < n; i++)
216+
{
217+
ref Vector256<float> s = ref Unsafe.Add(ref sourceBase, i * 4);
218+
219+
Vector256<float> f0 = Avx.Multiply(scale, s);
220+
Vector256<float> f1 = Avx.Multiply(scale, Unsafe.Add(ref s, 1));
221+
Vector256<float> f2 = Avx.Multiply(scale, Unsafe.Add(ref s, 2));
222+
Vector256<float> f3 = Avx.Multiply(scale, Unsafe.Add(ref s, 3));
223+
224+
Vector256<int> w0 = Avx.ConvertToVector256Int32(f0);
225+
Vector256<int> w1 = Avx.ConvertToVector256Int32(f1);
226+
Vector256<int> w2 = Avx.ConvertToVector256Int32(f2);
227+
Vector256<int> w3 = Avx.ConvertToVector256Int32(f3);
228+
229+
Vector256<short> u0 = Avx2.PackSignedSaturate(w0, w1);
230+
Vector256<short> u1 = Avx2.PackSignedSaturate(w2, w3);
231+
Vector256<byte> b = Avx2.PackUnsignedSaturate(u0, u1);
232+
b = Avx2.PermuteVar8x32(b.AsInt32(), mask).AsByte();
233+
234+
Unsafe.Add(ref destBase, i) = b;
235+
}
236+
}
237+
else
238+
{
239+
// Sse
240+
VerifySpanInput(source, dest, Vector128<byte>.Count);
241+
242+
int n = dest.Length / Vector128<byte>.Count;
243+
244+
ref Vector128<float> sourceBase =
245+
ref Unsafe.As<float, Vector128<float>>(ref MemoryMarshal.GetReference(source));
246+
247+
ref Vector128<byte> destBase =
248+
ref Unsafe.As<byte, Vector128<byte>>(ref MemoryMarshal.GetReference(dest));
249+
250+
var scale = Vector128.Create((float)byte.MaxValue);
251+
252+
for (int i = 0; i < n; i++)
253+
{
254+
ref Vector128<float> s = ref Unsafe.Add(ref sourceBase, i * 4);
255+
256+
Vector128<float> f0 = Sse.Multiply(scale, s);
257+
Vector128<float> f1 = Sse.Multiply(scale, Unsafe.Add(ref s, 1));
258+
Vector128<float> f2 = Sse.Multiply(scale, Unsafe.Add(ref s, 2));
259+
Vector128<float> f3 = Sse.Multiply(scale, Unsafe.Add(ref s, 3));
260+
261+
Vector128<int> w0 = Sse2.ConvertToVector128Int32(f0);
262+
Vector128<int> w1 = Sse2.ConvertToVector128Int32(f1);
263+
Vector128<int> w2 = Sse2.ConvertToVector128Int32(f2);
264+
Vector128<int> w3 = Sse2.ConvertToVector128Int32(f3);
265+
266+
Vector128<short> u0 = Sse2.PackSignedSaturate(w0, w1);
267+
Vector128<short> u1 = Sse2.PackSignedSaturate(w2, w3);
268+
269+
Unsafe.Add(ref destBase, i) = Sse2.PackUnsignedSaturate(u0, u1);
270+
}
271+
}
272+
}
273+
}
274+
}
275+
}
276+
#endif

0 commit comments

Comments
 (0)