diff --git a/src/coreclr/jit/fgdiagnostic.cpp b/src/coreclr/jit/fgdiagnostic.cpp index 824c974067184a..d5b12b7d8c752b 100644 --- a/src/coreclr/jit/fgdiagnostic.cpp +++ b/src/coreclr/jit/fgdiagnostic.cpp @@ -3432,6 +3432,12 @@ void Compiler::fgDebugCheckFlags(GenTree* tree, BasicBlock* block) expectedFlags |= GTF_GLOB_REF; break; } + + case NI_Vector128_op_Division: + case NI_Vector256_op_Division: + { + break; + } #endif // TARGET_XARCH #if defined(TARGET_ARM64) diff --git a/src/coreclr/jit/gentree.cpp b/src/coreclr/jit/gentree.cpp index 46c57705853a1f..20defd04dbd56c 100644 --- a/src/coreclr/jit/gentree.cpp +++ b/src/coreclr/jit/gentree.cpp @@ -7269,6 +7269,16 @@ bool GenTree::OperMayThrow(Compiler* comp) { return true; } + +#ifdef TARGET_XARCH + NamedIntrinsic intrinsicId = this->AsHWIntrinsic()->GetHWIntrinsicId(); + if (intrinsicId == NI_Vector128_op_Division || intrinsicId == NI_Vector256_op_Division || + intrinsicId == NI_Vector512_op_Division) + { + assert(varTypeIsInt(AsHWIntrinsic()->GetSimdBaseType())); + return true; + } +#endif // TARGET_XARCH } #endif // FEATURE_HW_INTRINSICS @@ -21234,6 +21244,26 @@ GenTree* Compiler::gtNewSimdBinOpNode( } } #endif // TARGET_XARCH +#if defined(TARGET_XARCH) && defined(FEATURE_HW_INTRINSICS) + case GT_DIV: + { + if (simdBaseType == TYP_INT) + { + assert(compOpportunisticallyDependsOn(InstructionSet_AVX) || + compOpportunisticallyDependsOn(InstructionSet_AVX512F)); + + assert(simdSize == 16 || simdSize == 32); + + NamedIntrinsic divIntrinsic = simdSize == 16 ? NI_Vector128_op_Division : NI_Vector256_op_Division; + unsigned int divideOpSimdSize = simdSize * 2; + + GenTree* divOp = + gtNewSimdHWIntrinsicNode(op1->TypeGet(), op1, op2, divIntrinsic, simdBaseJitType, divideOpSimdSize); + return divOp; + } + unreached(); + } +#endif // defined(TARGET_XARCH) && defined(FEATURE_HW_INTRINSICS) case GT_MUL: { @@ -28156,6 +28186,13 @@ void GenTreeHWIntrinsic::Initialize(NamedIntrinsic intrinsicId) gtFlags |= (GTF_CALL | GTF_GLOB_REF); break; } + + case NI_Vector128_op_Division: + case NI_Vector256_op_Division: + { + gtFlags |= GTF_EXCEPT; + break; + } #endif // TARGET_XARCH #if defined(TARGET_ARM64) @@ -29073,26 +29110,33 @@ NamedIntrinsic GenTreeHWIntrinsic::GetHWIntrinsicIdForBinOp(Compiler* comp, case GT_DIV: { +#if defined(TARGET_XARCH) + assert(varTypeIsFloating(simdBaseType) || varTypeIsInt(simdBaseType)); +#else assert(varTypeIsFloating(simdBaseType)); +#endif assert(op2->TypeIs(simdType)); #if defined(TARGET_XARCH) - if (simdSize == 64) - { - id = NI_AVX512F_Divide; - } - else if (simdSize == 32) - { - id = NI_AVX_Divide; - } - else if (simdBaseType == TYP_FLOAT) - { - id = isScalar ? NI_SSE_DivideScalar : NI_SSE_Divide; - } - else + if (varTypeIsFloating(simdBaseType)) { - assert(comp->compIsaSupportedDebugOnly(InstructionSet_SSE2)); - id = isScalar ? NI_SSE2_DivideScalar : NI_SSE2_Divide; + if (simdSize == 64) + { + id = NI_AVX512F_Divide; + } + else if (simdSize == 32) + { + id = NI_AVX_Divide; + } + else if (simdBaseType == TYP_FLOAT) + { + id = isScalar ? NI_SSE_DivideScalar : NI_SSE_Divide; + } + else + { + assert(comp->compIsaSupportedDebugOnly(InstructionSet_SSE2)); + id = isScalar ? NI_SSE2_DivideScalar : NI_SSE2_Divide; + } } #elif defined(TARGET_ARM64) if ((simdSize == 8) && (isScalar || (simdBaseType == TYP_DOUBLE))) diff --git a/src/coreclr/jit/hwintrinsiccodegenxarch.cpp b/src/coreclr/jit/hwintrinsiccodegenxarch.cpp index 96d40a9e43b555..4a73297d3d1e18 100644 --- a/src/coreclr/jit/hwintrinsiccodegenxarch.cpp +++ b/src/coreclr/jit/hwintrinsiccodegenxarch.cpp @@ -1871,6 +1871,72 @@ void CodeGen::genBaseIntrinsic(GenTreeHWIntrinsic* node, insOpts instOptions) break; } + case NI_Vector128_op_Division: + case NI_Vector256_op_Division: + { + // We can emulate SIMD integer division by converting the 32-bit integer -> 64-bit double, + // perform a 64-bit double divide, then convert back to a 32-bit integer. This is generating + // something similar to the following managed code: + // if (Vector128.EqualsAny(op2, Vector128.Zero)) + // { + // throw new DivideByZeroException(); + // } + // + // Vector128 overflowMask = + // Vector128.Equals(op1, Vector128.Create(int.MinValue) + // & Vector128.Equals(op2, Vector128.Create(-1)); + // if (!Vector128.EqualsAll(overflowMask, Vector128.Zero)) + // { + // throw new OverflowException(); + // } + // + // Vector256 op1_f64 = + // Vector256.ConvertToDouble(Vector256.WidenLower(Vector128.ToVector256Unsafe(op1)))); + // Vector256 op2_f64 = + // Vector256.ConvertToDouble(Vector256.WidenLower(Vector128.ToVector256Unsafe(op2)))); + // Vector256 div_f64 = op1_f64 / op2_f64; + // Vector256 div_i64 = Vector256.ConvertToInt64(div_f64); + // Vector128 div_i32 = Vector256.Narrow(div_i64.GetLower(), div_i64.GetUpper()); + // return div_i32; + regNumber op2Reg = op2->GetRegNum(); + regNumber tmpReg1 = internalRegisters.Extract(node, RBM_ALLFLOAT); + regNumber tmpReg2 = internalRegisters.Extract(node, RBM_ALLFLOAT); + emitAttr typeSize = emitTypeSize(node->TypeGet()); + noway_assert(typeSize == EA_16BYTE || typeSize == EA_32BYTE); + emitAttr divTypeSize = typeSize == EA_16BYTE ? EA_32BYTE : EA_64BYTE; + + simd_t negOneIntVec = simd_t::AllBitsSet(); + simd_t minValueInt{}; + int numElements = genTypeSize(node->TypeGet()) / 4; + for (int i = 0; i < numElements; i++) + { + minValueInt.i32[i] = INT_MIN; + } + CORINFO_FIELD_HANDLE minValueFld = typeSize == EA_16BYTE ? emit->emitSimd16Const(minValueInt.v128[0]) + : emit->emitSimd32Const(minValueInt.v256[0]); + CORINFO_FIELD_HANDLE negOneFld = typeSize == EA_16BYTE ? emit->emitSimd16Const(negOneIntVec.v128[0]) + : emit->emitSimd32Const(negOneIntVec.v256[0]); + + // div-by-zero check + emit->emitIns_SIMD_R_R_R(INS_xorpd, typeSize, tmpReg1, tmpReg1, tmpReg1, instOptions); + emit->emitIns_SIMD_R_R_R(INS_pcmpeqd, typeSize, tmpReg1, tmpReg1, op2Reg, instOptions); + emit->emitIns_R_R(INS_ptest, typeSize, tmpReg1, tmpReg1, instOptions); + genJumpToThrowHlpBlk(EJ_jne, SCK_DIV_BY_ZERO); + + // overflow check + emit->emitIns_SIMD_R_R_C(INS_pcmpeqd, typeSize, tmpReg1, op1Reg, minValueFld, 0, instOptions); + emit->emitIns_SIMD_R_R_C(INS_pcmpeqd, typeSize, tmpReg2, op2Reg, negOneFld, 0, instOptions); + emit->emitIns_SIMD_R_R_R(INS_pand, typeSize, tmpReg1, tmpReg1, tmpReg2, instOptions); + emit->emitIns_R_R(INS_ptest, typeSize, tmpReg1, tmpReg1, instOptions); + genJumpToThrowHlpBlk(EJ_jne, SCK_OVERFLOW); + + emit->emitIns_R_R(INS_cvtdq2pd, divTypeSize, tmpReg1, op1Reg, instOptions); + emit->emitIns_R_R(INS_cvtdq2pd, divTypeSize, tmpReg2, op2Reg, instOptions); + emit->emitIns_SIMD_R_R_R(INS_divpd, divTypeSize, targetReg, tmpReg1, tmpReg2, instOptions); + emit->emitIns_R_R(INS_cvttpd2dq, divTypeSize, targetReg, targetReg, instOptions); + break; + } + default: { unreached(); diff --git a/src/coreclr/jit/hwintrinsiclistxarch.h b/src/coreclr/jit/hwintrinsiclistxarch.h index 5d27ce50f49d9d..339a3d971ad6c4 100644 --- a/src/coreclr/jit/hwintrinsiclistxarch.h +++ b/src/coreclr/jit/hwintrinsiclistxarch.h @@ -129,7 +129,7 @@ HARDWARE_INTRINSIC(Vector128, get_Zero, HARDWARE_INTRINSIC(Vector128, op_Addition, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_InvalidNodeId) HARDWARE_INTRINSIC(Vector128, op_BitwiseAnd, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_InvalidNodeId) HARDWARE_INTRINSIC(Vector128, op_BitwiseOr, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_InvalidNodeId) -HARDWARE_INTRINSIC(Vector128, op_Division, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_InvalidNodeId) +HARDWARE_INTRINSIC(Vector128, op_Division, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialSideEffect_Other|HW_Flag_SpecialImport) HARDWARE_INTRINSIC(Vector128, op_Equality, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen|HW_Flag_Commutative|HW_Flag_CanBenefitFromConstantProp) HARDWARE_INTRINSIC(Vector128, op_ExclusiveOr, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_InvalidNodeId) HARDWARE_INTRINSIC(Vector128, op_Inequality, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen|HW_Flag_Commutative|HW_Flag_CanBenefitFromConstantProp) @@ -249,7 +249,7 @@ HARDWARE_INTRINSIC(Vector256, get_Zero, HARDWARE_INTRINSIC(Vector256, op_Addition, 32, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_InvalidNodeId) HARDWARE_INTRINSIC(Vector256, op_BitwiseAnd, 32, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_InvalidNodeId|HW_Flag_AvxOnlyCompatible) HARDWARE_INTRINSIC(Vector256, op_BitwiseOr, 32, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_InvalidNodeId|HW_Flag_AvxOnlyCompatible) -HARDWARE_INTRINSIC(Vector256, op_Division, 32, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_InvalidNodeId) +HARDWARE_INTRINSIC(Vector256, op_Division, 32, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialSideEffect_Other|HW_Flag_SpecialImport) HARDWARE_INTRINSIC(Vector256, op_Equality, 32, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen|HW_Flag_Commutative|HW_Flag_CanBenefitFromConstantProp) HARDWARE_INTRINSIC(Vector256, op_ExclusiveOr, 32, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_InvalidNodeId|HW_Flag_AvxOnlyCompatible) HARDWARE_INTRINSIC(Vector256, op_Inequality, 32, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen|HW_Flag_Commutative|HW_Flag_CanBenefitFromConstantProp) diff --git a/src/coreclr/jit/hwintrinsicxarch.cpp b/src/coreclr/jit/hwintrinsicxarch.cpp index 838aafa85ae539..ca49ff606cbea1 100644 --- a/src/coreclr/jit/hwintrinsicxarch.cpp +++ b/src/coreclr/jit/hwintrinsicxarch.cpp @@ -2417,8 +2417,19 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic, if (!varTypeIsFloating(simdBaseType)) { - // We can't trivially handle division for integral types using SIMD +#if defined(TARGET_XARCH) && defined(FEATURE_HW_INTRINSICS) + // Check to see if it is possible to emulate the integer division + if (!(simdBaseType == TYP_INT && + ((simdSize == 16 && compOpportunisticallyDependsOn(InstructionSet_AVX)) || + (simdSize == 32 && compOpportunisticallyDependsOn(InstructionSet_AVX512F))))) + { + break; + } + impSpillSideEffect(true, stackState.esStackDepth - + 2 DEBUGARG("Spilling op1 side effects for vector integer division")); +#else break; +#endif // defined(TARGET_XARCH) && defined(FEATURE_HW_INTRINSICS) } CORINFO_ARG_LIST_HANDLE arg1 = sig->args; @@ -2433,6 +2444,7 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic, op1 = getArgForHWIntrinsic(argType, argClass); retNode = gtNewSimdBinOpNode(GT_DIV, retType, op1, op2, simdBaseJitType, simdSize); + break; } diff --git a/src/coreclr/jit/lowerxarch.cpp b/src/coreclr/jit/lowerxarch.cpp index f3d6ad039bcf17..9b5a2af2c3cce1 100644 --- a/src/coreclr/jit/lowerxarch.cpp +++ b/src/coreclr/jit/lowerxarch.cpp @@ -10422,6 +10422,12 @@ void Lowering::ContainCheckHWIntrinsic(GenTreeHWIntrinsic* node) break; } + case NI_Vector128_op_Division: + case NI_Vector256_op_Division: + { + break; + } + default: { assert(!"Unhandled containment for helper binary hardware intrinsic"); diff --git a/src/coreclr/jit/lsraxarch.cpp b/src/coreclr/jit/lsraxarch.cpp index 294f67e612fbb3..0277b660b27134 100644 --- a/src/coreclr/jit/lsraxarch.cpp +++ b/src/coreclr/jit/lsraxarch.cpp @@ -2786,6 +2786,23 @@ int LinearScan::BuildHWIntrinsic(GenTreeHWIntrinsic* intrinsicTree, int* pDstCou break; } + case NI_Vector128_op_Division: + case NI_Vector256_op_Division: + { + srcCount = BuildOperandUses(op1, lowSIMDRegs()); + srcCount += BuildOperandUses(op2, lowSIMDRegs()); + + // get a tmp register for div-by-zero check + buildInternalFloatRegisterDefForNode(intrinsicTree, lowSIMDRegs()); + + // get a tmp register for overflow check + buildInternalFloatRegisterDefForNode(intrinsicTree, lowSIMDRegs()); + setInternalRegsDelayFree = true; + + buildUses = false; + break; + } + default: { assert((intrinsicId > NI_HW_INTRINSIC_START) && (intrinsicId < NI_HW_INTRINSIC_END)); diff --git a/src/coreclr/jit/morph.cpp b/src/coreclr/jit/morph.cpp index 43b2d527c14795..1b91e2b08e432e 100644 --- a/src/coreclr/jit/morph.cpp +++ b/src/coreclr/jit/morph.cpp @@ -11383,6 +11383,14 @@ GenTree* Compiler::fgMorphHWIntrinsic(GenTreeHWIntrinsic* tree) tree->AddAllEffectsFlags(operand); } +#ifdef TARGET_XARCH + if (intrinsicId == NI_Vector128_op_Division || intrinsicId == NI_Vector256_op_Division) + { + fgAddCodeRef(compCurBB, SCK_DIV_BY_ZERO); + fgAddCodeRef(compCurBB, SCK_OVERFLOW); + } +#endif // TARGET_XARCH + if (opts.OptimizationEnabled()) { var_types retType = tree->TypeGet(); diff --git a/src/coreclr/jit/stacklevelsetter.cpp b/src/coreclr/jit/stacklevelsetter.cpp index 0a4b0a19443163..69eea04cc1c967 100644 --- a/src/coreclr/jit/stacklevelsetter.cpp +++ b/src/coreclr/jit/stacklevelsetter.cpp @@ -206,6 +206,20 @@ void StackLevelSetter::SetThrowHelperBlocks(GenTree* node, BasicBlock* block) } break; +#if defined(FEATURE_HW_INTRINSICS) && defined(TARGET_XARCH) + case GT_HWINTRINSIC: + { + + NamedIntrinsic intrinsicId = node->AsHWIntrinsic()->GetHWIntrinsicId(); + if (intrinsicId == NI_Vector128_op_Division || intrinsicId == NI_Vector256_op_Division) + { + SetThrowHelperBlock(SCK_DIV_BY_ZERO, block); + SetThrowHelperBlock(SCK_OVERFLOW, block); + } + } + break; +#endif // defined(FEATURE_HW_INTRINSICS) && defined(TARGET_XARCH) + case GT_INDEX_ADDR: case GT_ARR_ELEM: SetThrowHelperBlock(SCK_RNGCHK_FAIL, block); diff --git a/src/libraries/System.Numerics.Tensors/src/System/Numerics/Tensors/netcore/TensorPrimitives.Divide.cs b/src/libraries/System.Numerics.Tensors/src/System/Numerics/Tensors/netcore/TensorPrimitives.Divide.cs index 3ab2b1b89239d6..cffc38f016db4a 100644 --- a/src/libraries/System.Numerics.Tensors/src/System/Numerics/Tensors/netcore/TensorPrimitives.Divide.cs +++ b/src/libraries/System.Numerics.Tensors/src/System/Numerics/Tensors/netcore/TensorPrimitives.Divide.cs @@ -1,6 +1,5 @@ // Licensed to the .NET Foundation under one or more agreements. // The .NET Foundation licenses this file to you under the MIT license. - using System.Runtime.Intrinsics; namespace System.Numerics.Tensors @@ -70,7 +69,8 @@ public static void Divide(T x, ReadOnlySpan y, Span destination) internal readonly struct DivideOperator : IBinaryOperator where T : IDivisionOperators { public static bool Vectorizable => typeof(T) == typeof(float) - || typeof(T) == typeof(double); + || typeof(T) == typeof(double) + || (Vector256.IsHardwareAccelerated && typeof(T) == typeof(int)); public static T Invoke(T x, T y) => x / y; public static Vector128 Invoke(Vector128 x, Vector128 y) => x / y; public static Vector256 Invoke(Vector256 x, Vector256 y) => x / y;