Skip to content

[API Proposal]: AVX-512 VPOPCNTDQ and BITALG Intrinsics #96162

@MineCake147E

Description

@MineCake147E

Background and motivation

Both VPOPCNTDQ and BITALG are supported by Intel in the Ice Lake and newer architectures, and by AMD in Zen 4.
VPOPCNTDQ allows for parallel popcnt in either Vector128, Vector256, or Vector512 for ulong and uint.
BITALG extends parallel popcnt to ushort and byte, and it also adds the VPSHUFBITQMB instruction, which performs a bit gather select.
VPOPCNTW is highly beneficial for my current project, as it allows me to count filled blocks row-by-row in a bit-board for block games.

API Proposal

namespace System.Runtime.Intrinsics.X86;

/// <summary>
/// Proposed intrinsics class exposing the AVX-512 VPOPCNTDQ instructions, which perform a
/// per-element population count (number of set bits) on 32-bit and 64-bit integer elements.
/// </summary>
/// <remarks>
/// Declarations are written in API-proposal (reference-assembly) style, so methods have no bodies.
/// NOTE(review): the base class is kept as proposed (<c>Avx512DQ</c>); confirm that the
/// VPOPCNTDQ CPUID feature really implies AVX512DQ before approving this hierarchy.
/// </remarks>
[Intrinsic]
[CLSCompliant(false)]
public abstract class Avx512VPopcntDQ : Avx512DQ
{
    // Same non-instantiable pattern the nested X64 class uses below.
    internal Avx512VPopcntDQ() { }

    /// <summary>Gets whether the VPOPCNTDQ intrinsics are supported.</summary>
    public static new bool IsSupported { get; }

    /// <summary>Container for VPOPCNTDQ members that are only available in 64-bit processes.</summary>
    [Intrinsic]
    public new abstract class X64 : Avx512DQ.X64
    {
        internal X64() { }

        /// <summary>Gets whether the 64-bit-only VPOPCNTDQ intrinsics are supported.</summary>
        public static new bool IsSupported { get; }
    }

    /// <summary>Counts the set bits in each 32-bit element of <paramref name="value"/>.</summary>
    public static Vector512<int> PopCount(Vector512<int> value);

    /// <summary>Counts the set bits in each 32-bit element of <paramref name="value"/>.</summary>
    public static Vector512<uint> PopCount(Vector512<uint> value);

    /// <summary>Counts the set bits in each 64-bit element of <paramref name="value"/>.</summary>
    public static Vector512<long> PopCount(Vector512<long> value);

    /// <summary>Counts the set bits in each 64-bit element of <paramref name="value"/>.</summary>
    public static Vector512<ulong> PopCount(Vector512<ulong> value);

    /// <summary>128-bit and 256-bit forms of the VPOPCNTDQ population-count intrinsics.</summary>
    // `new` is required because this type hides the VL class inherited through Avx512DQ;
    // [Intrinsic] and the internal constructor mirror the nested X64 class above.
    [Intrinsic]
    public new abstract class VL : Avx512DQ.VL
    {
        internal VL() { }

        /// <summary>Gets whether the 128-bit and 256-bit VPOPCNTDQ intrinsics are supported.</summary>
        public static new bool IsSupported { get; }

        /// <summary>Counts the set bits in each 32-bit element of <paramref name="value"/>.</summary>
        public static Vector256<int> PopCount(Vector256<int> value);

        /// <summary>Counts the set bits in each 32-bit element of <paramref name="value"/>.</summary>
        public static Vector256<uint> PopCount(Vector256<uint> value);

        /// <summary>Counts the set bits in each 64-bit element of <paramref name="value"/>.</summary>
        public static Vector256<long> PopCount(Vector256<long> value);

        /// <summary>Counts the set bits in each 64-bit element of <paramref name="value"/>.</summary>
        public static Vector256<ulong> PopCount(Vector256<ulong> value);

        /// <summary>Counts the set bits in each 32-bit element of <paramref name="value"/>.</summary>
        public static Vector128<int> PopCount(Vector128<int> value);

        /// <summary>Counts the set bits in each 32-bit element of <paramref name="value"/>.</summary>
        public static Vector128<uint> PopCount(Vector128<uint> value);

        /// <summary>Counts the set bits in each 64-bit element of <paramref name="value"/>.</summary>
        public static Vector128<long> PopCount(Vector128<long> value);

        /// <summary>Counts the set bits in each 64-bit element of <paramref name="value"/>.</summary>
        public static Vector128<ulong> PopCount(Vector128<ulong> value);
    }
}

/// <summary>
/// Proposed intrinsics class exposing the AVX-512 BITALG instructions: per-element population
/// count on 8-bit and 16-bit integer elements, plus VPSHUFBITQMB (a bit gather select).
/// </summary>
/// <remarks>
/// Declarations are written in API-proposal (reference-assembly) style, so methods have no bodies.
/// The bit-shuffle results are exposed as vectors here; the proposal's "Alternative Designs"
/// section discusses returning a scalar <c>ulong</c> mask instead.
/// </remarks>
[Intrinsic]
[CLSCompliant(false)]
public abstract class Avx512BitAlg : Avx512BW
{
    // Same non-instantiable pattern the nested X64 class uses below.
    internal Avx512BitAlg() { }

    /// <summary>Gets whether the BITALG intrinsics are supported.</summary>
    public static new bool IsSupported { get; }

    /// <summary>Container for BITALG members that are only available in 64-bit processes.</summary>
    [Intrinsic]
    public new abstract class X64 : Avx512BW.X64
    {
        internal X64() { }

        /// <summary>Gets whether the 64-bit-only BITALG intrinsics are supported.</summary>
        public static new bool IsSupported { get; }
    }

    /// <summary>Counts the set bits in each 16-bit element of <paramref name="value"/>.</summary>
    public static Vector512<short> PopCount(Vector512<short> value);

    /// <summary>Counts the set bits in each 16-bit element of <paramref name="value"/>.</summary>
    public static Vector512<ushort> PopCount(Vector512<ushort> value);

    /// <summary>Counts the set bits in each 8-bit element of <paramref name="value"/>.</summary>
    public static Vector512<byte> PopCount(Vector512<byte> value);

    /// <summary>Counts the set bits in each 8-bit element of <paramref name="value"/>.</summary>
    public static Vector512<sbyte> PopCount(Vector512<sbyte> value);

    /// <summary>
    /// Bit gather select (VPSHUFBITQMB): gathers bits from <paramref name="value"/> as selected
    /// by <paramref name="control"/>.
    /// </summary>
    // NOTE(review): presumably each control byte indexes a bit within the corresponding
    // 64-bit element of value - confirm against the Intel instruction reference.
    public static Vector512<byte> ShuffleBits(Vector512<ulong> value, Vector512<byte> control);

    /// <summary>Signed-element overload of the bit gather select.</summary>
    public static Vector512<sbyte> ShuffleBits(Vector512<long> value, Vector512<sbyte> control);

    /// <summary>Masked form of the bit gather select.</summary>
    // NOTE(review): presumably result positions not selected by mask are cleared, matching the
    // write-masked encoding of VPSHUFBITQMB - confirm before finalizing the signature.
    public static Vector512<byte> MaskShuffleBits(Vector512<byte> mask, Vector512<ulong> value, Vector512<byte> control);

    /// <summary>Signed-element overload of the masked bit gather select.</summary>
    public static Vector512<sbyte> MaskShuffleBits(Vector512<sbyte> mask, Vector512<long> value, Vector512<sbyte> control);

    /// <summary>128-bit and 256-bit forms of the BITALG intrinsics.</summary>
    // `new` is required because this type hides the VL class inherited through Avx512BW;
    // [Intrinsic] and the internal constructor mirror the nested X64 class above.
    [Intrinsic]
    public new abstract class VL : Avx512BW.VL
    {
        internal VL() { }

        /// <summary>Gets whether the 128-bit and 256-bit BITALG intrinsics are supported.</summary>
        public static new bool IsSupported { get; }

        /// <summary>Counts the set bits in each 16-bit element of <paramref name="value"/>.</summary>
        public static Vector256<short> PopCount(Vector256<short> value);

        /// <summary>Counts the set bits in each 16-bit element of <paramref name="value"/>.</summary>
        public static Vector256<ushort> PopCount(Vector256<ushort> value);

        /// <summary>Counts the set bits in each 8-bit element of <paramref name="value"/>.</summary>
        public static Vector256<byte> PopCount(Vector256<byte> value);

        /// <summary>Counts the set bits in each 8-bit element of <paramref name="value"/>.</summary>
        public static Vector256<sbyte> PopCount(Vector256<sbyte> value);

        /// <summary>Counts the set bits in each 16-bit element of <paramref name="value"/>.</summary>
        public static Vector128<short> PopCount(Vector128<short> value);

        /// <summary>Counts the set bits in each 16-bit element of <paramref name="value"/>.</summary>
        public static Vector128<ushort> PopCount(Vector128<ushort> value);

        /// <summary>Counts the set bits in each 8-bit element of <paramref name="value"/>.</summary>
        public static Vector128<byte> PopCount(Vector128<byte> value);

        /// <summary>Counts the set bits in each 8-bit element of <paramref name="value"/>.</summary>
        public static Vector128<sbyte> PopCount(Vector128<sbyte> value);

        /// <summary>256-bit form of the bit gather select (VPSHUFBITQMB).</summary>
        public static Vector256<byte> ShuffleBits(Vector256<ulong> value, Vector256<byte> control);

        /// <summary>256-bit, signed-element overload of the bit gather select.</summary>
        public static Vector256<sbyte> ShuffleBits(Vector256<long> value, Vector256<sbyte> control);

        /// <summary>256-bit masked form of the bit gather select.</summary>
        public static Vector256<byte> MaskShuffleBits(Vector256<byte> mask, Vector256<ulong> value, Vector256<byte> control);

        /// <summary>256-bit, signed-element overload of the masked bit gather select.</summary>
        public static Vector256<sbyte> MaskShuffleBits(Vector256<sbyte> mask, Vector256<long> value, Vector256<sbyte> control);

        /// <summary>128-bit form of the bit gather select (VPSHUFBITQMB).</summary>
        public static Vector128<byte> ShuffleBits(Vector128<ulong> value, Vector128<byte> control);

        /// <summary>128-bit, signed-element overload of the bit gather select.</summary>
        public static Vector128<sbyte> ShuffleBits(Vector128<long> value, Vector128<sbyte> control);

        /// <summary>128-bit masked form of the bit gather select.</summary>
        public static Vector128<byte> MaskShuffleBits(Vector128<byte> mask, Vector128<ulong> value, Vector128<byte> control);

        /// <summary>128-bit, signed-element overload of the masked bit gather select.</summary>
        public static Vector128<sbyte> MaskShuffleBits(Vector128<sbyte> mask, Vector128<long> value, Vector128<sbyte> control);
    }
}

API Usage

// Per-element population count over the bit-board: each result element holds the number of
// set bits (filled blocks) in the matching element of `board`.
// NOTE(review): `board` is not declared in this snippet - presumably a Vector512 with one
// 8-bit or 16-bit element per row; callers should also check Avx512BitAlg.IsSupported first.
var blocksPerRows = Avx512BitAlg.PopCount(board);

Alternative Designs

  • Avx512VPopcntDQ could have a different name.
  • MaskShuffleBits could have a different name and/or different parameter/return types (e.g. the mask parameter and the return type could be ulong instead of Vector512<byte>).

Risks

None

Metadata

Metadata

Assignees

No one assigned

    Labels

    Type

    No type

    Projects

    No projects

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions