| 
 | 1 | +// Copyright (c) Six Labors.  | 
 | 2 | +// Licensed under the Apache License, Version 2.0.  | 
 | 3 | + | 
 | 4 | +#if SUPPORTS_RUNTIME_INTRINSICS  | 
 | 5 | +using System;  | 
 | 6 | +using System.Runtime.CompilerServices;  | 
 | 7 | +using System.Runtime.InteropServices;  | 
 | 8 | +using System.Runtime.Intrinsics;  | 
 | 9 | +using System.Runtime.Intrinsics.X86;  | 
 | 10 | + | 
 | 11 | +namespace SixLabors.ImageSharp  | 
 | 12 | +{  | 
 | 13 | +    internal static partial class SimdUtils  | 
 | 14 | +    {  | 
 | 15 | +        public static class HwIntrinsics  | 
 | 16 | +        {  | 
 | 17 | +            private static ReadOnlySpan<byte> PermuteMaskDeinterleave8x32 => new byte[] { 0, 0, 0, 0, 4, 0, 0, 0, 1, 0, 0, 0, 5, 0, 0, 0, 2, 0, 0, 0, 6, 0, 0, 0, 3, 0, 0, 0, 7, 0, 0, 0 };  | 
 | 18 | + | 
 | 19 | +            /// <summary>  | 
 | 20 | +            /// <see cref="ByteToNormalizedFloat"/> as many elements as possible, slicing them down (keeping the remainder).  | 
 | 21 | +            /// </summary>  | 
 | 22 | +            [MethodImpl(InliningOptions.ShortMethod)]  | 
 | 23 | +            internal static void ByteToNormalizedFloatReduce(  | 
 | 24 | +                ref ReadOnlySpan<byte> source,  | 
 | 25 | +                ref Span<float> dest)  | 
 | 26 | +            {  | 
 | 27 | +                DebugGuard.IsTrue(source.Length == dest.Length, nameof(source), "Input spans must be of same length!");  | 
 | 28 | + | 
 | 29 | +                if (Avx2.IsSupported || Sse2.IsSupported)  | 
 | 30 | +                {  | 
 | 31 | +                    int remainder;  | 
 | 32 | +                    if (Avx2.IsSupported)  | 
 | 33 | +                    {  | 
 | 34 | +                        remainder = ImageMaths.ModuloP2(source.Length, Vector256<byte>.Count);  | 
 | 35 | +                    }  | 
 | 36 | +                    else  | 
 | 37 | +                    {  | 
 | 38 | +                        remainder = ImageMaths.ModuloP2(source.Length, Vector128<byte>.Count);  | 
 | 39 | +                    }  | 
 | 40 | + | 
 | 41 | +                    int adjustedCount = source.Length - remainder;  | 
 | 42 | + | 
 | 43 | +                    if (adjustedCount > 0)  | 
 | 44 | +                    {  | 
 | 45 | +                        ByteToNormalizedFloat(source.Slice(0, adjustedCount), dest.Slice(0, adjustedCount));  | 
 | 46 | + | 
 | 47 | +                        source = source.Slice(adjustedCount);  | 
 | 48 | +                        dest = dest.Slice(adjustedCount);  | 
 | 49 | +                    }  | 
 | 50 | +                }  | 
 | 51 | +            }  | 
 | 52 | + | 
 | 53 | +            /// <summary>  | 
 | 54 | +            /// Implementation <see cref="SimdUtils.ByteToNormalizedFloat"/>, which is faster on new RyuJIT runtime.  | 
 | 55 | +            /// </summary>  | 
 | 56 | +            /// <remarks>  | 
 | 57 | +            /// Implementation is based on MagicScaler code:  | 
 | 58 | +            /// https://github.com/saucecontrol/PhotoSauce/blob/b5811908041200488aa18fdfd17df5fc457415dc/src/MagicScaler/Magic/Processors/ConvertersFloat.cs#L80-L182  | 
 | 59 | +            /// </remarks>  | 
 | 60 | +            internal static unsafe void ByteToNormalizedFloat(  | 
 | 61 | +                ReadOnlySpan<byte> source,  | 
 | 62 | +                Span<float> dest)  | 
 | 63 | +            {  | 
 | 64 | +                if (Avx2.IsSupported)  | 
 | 65 | +                {  | 
 | 66 | +                    VerifySpanInput(source, dest, Vector256<byte>.Count);  | 
 | 67 | + | 
 | 68 | +                    int n = dest.Length / Vector256<byte>.Count;  | 
 | 69 | + | 
 | 70 | +                    byte* sourceBase = (byte*)Unsafe.AsPointer(ref MemoryMarshal.GetReference(source));  | 
 | 71 | + | 
 | 72 | +                    ref Vector256<float> destBase =  | 
 | 73 | +                        ref Unsafe.As<float, Vector256<float>>(ref MemoryMarshal.GetReference(dest));  | 
 | 74 | + | 
 | 75 | +                    var scale = Vector256.Create(1 / (float)byte.MaxValue);  | 
 | 76 | + | 
 | 77 | +                    for (int i = 0; i < n; i++)  | 
 | 78 | +                    {  | 
 | 79 | +                        int si = Vector256<byte>.Count * i;  | 
 | 80 | +                        Vector256<int> i0 = Avx2.ConvertToVector256Int32(sourceBase + si);  | 
 | 81 | +                        Vector256<int> i1 = Avx2.ConvertToVector256Int32(sourceBase + si + Vector256<int>.Count);  | 
 | 82 | +                        Vector256<int> i2 = Avx2.ConvertToVector256Int32(sourceBase + si + (Vector256<int>.Count * 2));  | 
 | 83 | +                        Vector256<int> i3 = Avx2.ConvertToVector256Int32(sourceBase + si + (Vector256<int>.Count * 3));  | 
 | 84 | + | 
 | 85 | +                        Vector256<float> f0 = Avx.Multiply(scale, Avx.ConvertToVector256Single(i0));  | 
 | 86 | +                        Vector256<float> f1 = Avx.Multiply(scale, Avx.ConvertToVector256Single(i1));  | 
 | 87 | +                        Vector256<float> f2 = Avx.Multiply(scale, Avx.ConvertToVector256Single(i2));  | 
 | 88 | +                        Vector256<float> f3 = Avx.Multiply(scale, Avx.ConvertToVector256Single(i3));  | 
 | 89 | + | 
 | 90 | +                        ref Vector256<float> d = ref Unsafe.Add(ref destBase, i * 4);  | 
 | 91 | + | 
 | 92 | +                        d = f0;  | 
 | 93 | +                        Unsafe.Add(ref d, 1) = f1;  | 
 | 94 | +                        Unsafe.Add(ref d, 2) = f2;  | 
 | 95 | +                        Unsafe.Add(ref d, 3) = f3;  | 
 | 96 | +                    }  | 
 | 97 | +                }  | 
 | 98 | +                else  | 
 | 99 | +                {  | 
 | 100 | +                    // Sse  | 
 | 101 | +                    VerifySpanInput(source, dest, Vector128<byte>.Count);  | 
 | 102 | + | 
 | 103 | +                    int n = dest.Length / Vector128<byte>.Count;  | 
 | 104 | + | 
 | 105 | +                    byte* sourceBase = (byte*)Unsafe.AsPointer(ref MemoryMarshal.GetReference(source));  | 
 | 106 | + | 
 | 107 | +                    ref Vector128<float> destBase =  | 
 | 108 | +                        ref Unsafe.As<float, Vector128<float>>(ref MemoryMarshal.GetReference(dest));  | 
 | 109 | + | 
 | 110 | +                    var scale = Vector128.Create(1 / (float)byte.MaxValue);  | 
 | 111 | +                    Vector128<byte> zero = Vector128<byte>.Zero;  | 
 | 112 | + | 
 | 113 | +                    for (int i = 0; i < n; i++)  | 
 | 114 | +                    {  | 
 | 115 | +                        int si = Vector128<byte>.Count * i;  | 
 | 116 | + | 
 | 117 | +                        Vector128<int> i0, i1, i2, i3;  | 
 | 118 | +                        if (Sse41.IsSupported)  | 
 | 119 | +                        {  | 
 | 120 | +                            i0 = Sse41.ConvertToVector128Int32(sourceBase + si);  | 
 | 121 | +                            i1 = Sse41.ConvertToVector128Int32(sourceBase + si + Vector128<int>.Count);  | 
 | 122 | +                            i2 = Sse41.ConvertToVector128Int32(sourceBase + si + (Vector128<int>.Count * 2));  | 
 | 123 | +                            i3 = Sse41.ConvertToVector128Int32(sourceBase + si + (Vector128<int>.Count * 3));  | 
 | 124 | +                        }  | 
 | 125 | +                        else  | 
 | 126 | +                        {  | 
 | 127 | +                            Vector128<byte> b = Sse2.LoadVector128(sourceBase + si);  | 
 | 128 | +                            Vector128<short> s0 = Sse2.UnpackLow(b, zero).AsInt16();  | 
 | 129 | +                            Vector128<short> s1 = Sse2.UnpackHigh(b, zero).AsInt16();  | 
 | 130 | + | 
 | 131 | +                            i0 = Sse2.UnpackLow(s0, zero.AsInt16()).AsInt32();  | 
 | 132 | +                            i1 = Sse2.UnpackHigh(s0, zero.AsInt16()).AsInt32();  | 
 | 133 | +                            i2 = Sse2.UnpackLow(s1, zero.AsInt16()).AsInt32();  | 
 | 134 | +                            i3 = Sse2.UnpackHigh(s1, zero.AsInt16()).AsInt32();  | 
 | 135 | +                        }  | 
 | 136 | + | 
 | 137 | +                        Vector128<float> f0 = Sse.Multiply(scale, Sse2.ConvertToVector128Single(i0));  | 
 | 138 | +                        Vector128<float> f1 = Sse.Multiply(scale, Sse2.ConvertToVector128Single(i1));  | 
 | 139 | +                        Vector128<float> f2 = Sse.Multiply(scale, Sse2.ConvertToVector128Single(i2));  | 
 | 140 | +                        Vector128<float> f3 = Sse.Multiply(scale, Sse2.ConvertToVector128Single(i3));  | 
 | 141 | + | 
 | 142 | +                        ref Vector128<float> d = ref Unsafe.Add(ref destBase, i * 4);  | 
 | 143 | + | 
 | 144 | +                        d = f0;  | 
 | 145 | +                        Unsafe.Add(ref d, 1) = f1;  | 
 | 146 | +                        Unsafe.Add(ref d, 2) = f2;  | 
 | 147 | +                        Unsafe.Add(ref d, 3) = f3;  | 
 | 148 | +                    }  | 
 | 149 | +                }  | 
 | 150 | +            }  | 
 | 151 | + | 
 | 152 | +            /// <summary>  | 
 | 153 | +            /// <see cref="NormalizedFloatToByteSaturate"/> as many elements as possible, slicing them down (keeping the remainder).  | 
 | 154 | +            /// </summary>  | 
 | 155 | +            [MethodImpl(InliningOptions.ShortMethod)]  | 
 | 156 | +            internal static void NormalizedFloatToByteSaturateReduce(  | 
 | 157 | +                ref ReadOnlySpan<float> source,  | 
 | 158 | +                ref Span<byte> dest)  | 
 | 159 | +            {  | 
 | 160 | +                DebugGuard.IsTrue(source.Length == dest.Length, nameof(source), "Input spans must be of same length!");  | 
 | 161 | + | 
 | 162 | +                if (Avx2.IsSupported || Sse2.IsSupported)  | 
 | 163 | +                {  | 
 | 164 | +                    int remainder;  | 
 | 165 | +                    if (Avx2.IsSupported)  | 
 | 166 | +                    {  | 
 | 167 | +                        remainder = ImageMaths.ModuloP2(source.Length, Vector256<byte>.Count);  | 
 | 168 | +                    }  | 
 | 169 | +                    else  | 
 | 170 | +                    {  | 
 | 171 | +                        remainder = ImageMaths.ModuloP2(source.Length, Vector128<byte>.Count);  | 
 | 172 | +                    }  | 
 | 173 | + | 
 | 174 | +                    int adjustedCount = source.Length - remainder;  | 
 | 175 | + | 
 | 176 | +                    if (adjustedCount > 0)  | 
 | 177 | +                    {  | 
 | 178 | +                        NormalizedFloatToByteSaturate(  | 
 | 179 | +                            source.Slice(0, adjustedCount),  | 
 | 180 | +                            dest.Slice(0, adjustedCount));  | 
 | 181 | + | 
 | 182 | +                        source = source.Slice(adjustedCount);  | 
 | 183 | +                        dest = dest.Slice(adjustedCount);  | 
 | 184 | +                    }  | 
 | 185 | +                }  | 
 | 186 | +            }  | 
 | 187 | + | 
 | 188 | +            /// <summary>  | 
 | 189 | +            /// Implementation of <see cref="SimdUtils.NormalizedFloatToByteSaturate"/>, which is faster on new .NET runtime.  | 
 | 190 | +            /// </summary>  | 
 | 191 | +            /// <remarks>  | 
 | 192 | +            /// Implementation is based on MagicScaler code:  | 
 | 193 | +            /// https://github.com/saucecontrol/PhotoSauce/blob/b5811908041200488aa18fdfd17df5fc457415dc/src/MagicScaler/Magic/Processors/ConvertersFloat.cs#L541-L622  | 
 | 194 | +            /// </remarks>  | 
 | 195 | +            internal static void NormalizedFloatToByteSaturate(  | 
 | 196 | +                ReadOnlySpan<float> source,  | 
 | 197 | +                Span<byte> dest)  | 
 | 198 | +            {  | 
 | 199 | +                if (Avx2.IsSupported)  | 
 | 200 | +                {  | 
 | 201 | +                    VerifySpanInput(source, dest, Vector256<byte>.Count);  | 
 | 202 | + | 
 | 203 | +                    int n = dest.Length / Vector256<byte>.Count;  | 
 | 204 | + | 
 | 205 | +                    ref Vector256<float> sourceBase =  | 
 | 206 | +                        ref Unsafe.As<float, Vector256<float>>(ref MemoryMarshal.GetReference(source));  | 
 | 207 | + | 
 | 208 | +                    ref Vector256<byte> destBase =  | 
 | 209 | +                        ref Unsafe.As<byte, Vector256<byte>>(ref MemoryMarshal.GetReference(dest));  | 
 | 210 | + | 
 | 211 | +                    var scale = Vector256.Create((float)byte.MaxValue);  | 
 | 212 | +                    ref byte maskBase = ref MemoryMarshal.GetReference(PermuteMaskDeinterleave8x32);  | 
 | 213 | +                    Vector256<int> mask = Unsafe.As<byte, Vector256<int>>(ref maskBase);  | 
 | 214 | + | 
 | 215 | +                    for (int i = 0; i < n; i++)  | 
 | 216 | +                    {  | 
 | 217 | +                        ref Vector256<float> s = ref Unsafe.Add(ref sourceBase, i * 4);  | 
 | 218 | + | 
 | 219 | +                        Vector256<float> f0 = Avx.Multiply(scale, s);  | 
 | 220 | +                        Vector256<float> f1 = Avx.Multiply(scale, Unsafe.Add(ref s, 1));  | 
 | 221 | +                        Vector256<float> f2 = Avx.Multiply(scale, Unsafe.Add(ref s, 2));  | 
 | 222 | +                        Vector256<float> f3 = Avx.Multiply(scale, Unsafe.Add(ref s, 3));  | 
 | 223 | + | 
 | 224 | +                        Vector256<int> w0 = Avx.ConvertToVector256Int32(f0);  | 
 | 225 | +                        Vector256<int> w1 = Avx.ConvertToVector256Int32(f1);  | 
 | 226 | +                        Vector256<int> w2 = Avx.ConvertToVector256Int32(f2);  | 
 | 227 | +                        Vector256<int> w3 = Avx.ConvertToVector256Int32(f3);  | 
 | 228 | + | 
 | 229 | +                        Vector256<short> u0 = Avx2.PackSignedSaturate(w0, w1);  | 
 | 230 | +                        Vector256<short> u1 = Avx2.PackSignedSaturate(w2, w3);  | 
 | 231 | +                        Vector256<byte> b = Avx2.PackUnsignedSaturate(u0, u1);  | 
 | 232 | +                        b = Avx2.PermuteVar8x32(b.AsInt32(), mask).AsByte();  | 
 | 233 | + | 
 | 234 | +                        Unsafe.Add(ref destBase, i) = b;  | 
 | 235 | +                    }  | 
 | 236 | +                }  | 
 | 237 | +                else  | 
 | 238 | +                {  | 
 | 239 | +                    // Sse  | 
 | 240 | +                    VerifySpanInput(source, dest, Vector128<byte>.Count);  | 
 | 241 | + | 
 | 242 | +                    int n = dest.Length / Vector128<byte>.Count;  | 
 | 243 | + | 
 | 244 | +                    ref Vector128<float> sourceBase =  | 
 | 245 | +                        ref Unsafe.As<float, Vector128<float>>(ref MemoryMarshal.GetReference(source));  | 
 | 246 | + | 
 | 247 | +                    ref Vector128<byte> destBase =  | 
 | 248 | +                        ref Unsafe.As<byte, Vector128<byte>>(ref MemoryMarshal.GetReference(dest));  | 
 | 249 | + | 
 | 250 | +                    var scale = Vector128.Create((float)byte.MaxValue);  | 
 | 251 | + | 
 | 252 | +                    for (int i = 0; i < n; i++)  | 
 | 253 | +                    {  | 
 | 254 | +                        ref Vector128<float> s = ref Unsafe.Add(ref sourceBase, i * 4);  | 
 | 255 | + | 
 | 256 | +                        Vector128<float> f0 = Sse.Multiply(scale, s);  | 
 | 257 | +                        Vector128<float> f1 = Sse.Multiply(scale, Unsafe.Add(ref s, 1));  | 
 | 258 | +                        Vector128<float> f2 = Sse.Multiply(scale, Unsafe.Add(ref s, 2));  | 
 | 259 | +                        Vector128<float> f3 = Sse.Multiply(scale, Unsafe.Add(ref s, 3));  | 
 | 260 | + | 
 | 261 | +                        Vector128<int> w0 = Sse2.ConvertToVector128Int32(f0);  | 
 | 262 | +                        Vector128<int> w1 = Sse2.ConvertToVector128Int32(f1);  | 
 | 263 | +                        Vector128<int> w2 = Sse2.ConvertToVector128Int32(f2);  | 
 | 264 | +                        Vector128<int> w3 = Sse2.ConvertToVector128Int32(f3);  | 
 | 265 | + | 
 | 266 | +                        Vector128<short> u0 = Sse2.PackSignedSaturate(w0, w1);  | 
 | 267 | +                        Vector128<short> u1 = Sse2.PackSignedSaturate(w2, w3);  | 
 | 268 | + | 
 | 269 | +                        Unsafe.Add(ref destBase, i) = Sse2.PackUnsignedSaturate(u0, u1);  | 
 | 270 | +                    }  | 
 | 271 | +                }  | 
 | 272 | +            }  | 
 | 273 | +        }  | 
 | 274 | +    }  | 
 | 275 | +}  | 
 | 276 | +#endif  | 
0 commit comments