diff --git a/src/coreclr/jit/codegen.h b/src/coreclr/jit/codegen.h index 74946282005559..7f328f4acb5b7c 100644 --- a/src/coreclr/jit/codegen.h +++ b/src/coreclr/jit/codegen.h @@ -1536,7 +1536,7 @@ class CodeGen final : public CodeGenInterface } }; - OperandDesc genOperandDesc(GenTree* op); + OperandDesc genOperandDesc(instruction ins, GenTree* op); void inst_TT(instruction ins, emitAttr size, GenTree* op1); void inst_RV_TT(instruction ins, emitAttr size, regNumber op1Reg, GenTree* op2); diff --git a/src/coreclr/jit/codegenxarch.cpp b/src/coreclr/jit/codegenxarch.cpp index 18c12ee3d8770a..68c43e8183713f 100644 --- a/src/coreclr/jit/codegenxarch.cpp +++ b/src/coreclr/jit/codegenxarch.cpp @@ -437,8 +437,7 @@ void CodeGen::genSetRegToConst(regNumber targetReg, var_types targetType, simd_t } else { - CORINFO_FIELD_HANDLE hnd = emit->emitSimd8Const(val8); - emit->emitIns_R_C(ins_Load(targetType), attr, targetReg, hnd, 0); + emit->emitSimdConstCompressedLoad(val, attr, targetReg); } break; } @@ -465,10 +464,9 @@ void CodeGen::genSetRegToConst(regNumber targetReg, var_types targetType, simd_t } else { - simd16_t val16 = {}; + simd_t val16 = {}; memcpy(&val16, &val12, sizeof(val12)); - CORINFO_FIELD_HANDLE hnd = emit->emitSimd16Const(val16); - emit->emitIns_R_C(ins_Load(targetType), attr, targetReg, hnd, 0); + emit->emitSimdConstCompressedLoad(val, EA_16BYTE, targetReg); } break; } @@ -495,8 +493,7 @@ void CodeGen::genSetRegToConst(regNumber targetReg, var_types targetType, simd_t } else { - CORINFO_FIELD_HANDLE hnd = emit->emitSimd16Const(val16); - emit->emitIns_R_C(ins_Load(targetType), attr, targetReg, hnd, 0); + emit->emitSimdConstCompressedLoad(val, attr, targetReg); } break; } @@ -523,8 +520,7 @@ void CodeGen::genSetRegToConst(regNumber targetReg, var_types targetType, simd_t } else { - CORINFO_FIELD_HANDLE hnd = emit->emitSimd32Const(val32); - emit->emitIns_R_C(ins_Load(targetType), attr, targetReg, hnd, 0); + emit->emitSimdConstCompressedLoad(val, attr, targetReg); } break; } @@ -549,8 +545,7 @@ void CodeGen::genSetRegToConst(regNumber targetReg, var_types targetType, simd_t } else { - CORINFO_FIELD_HANDLE hnd = emit->emitSimd64Const(val64); - emit->emitIns_R_C(ins_Load(targetType), attr, targetReg, hnd, 0); + emit->emitSimdConstCompressedLoad(val, attr, targetReg); } break; } diff --git a/src/coreclr/jit/emit.cpp b/src/coreclr/jit/emit.cpp index e4776520ef56e1..2c7d605cbb767a 100644 --- a/src/coreclr/jit/emit.cpp +++ b/src/coreclr/jit/emit.cpp @@ -8189,35 +8189,142 @@ CORINFO_FIELD_HANDLE emitter::emitSimd16Const(simd16_t constValue) return emitComp->eeFindJitDataOffs(cnum); } -#if defined(TARGET_XARCH) -CORINFO_FIELD_HANDLE emitter::emitSimd32Const(simd32_t constValue) +#ifdef TARGET_XARCH +//------------------------------------------------------------------------ +// emitSimdConst: Create a simd data section constant. +// +// Arguments: +// constValue - constant value +// attr - The EA_SIZE for the constant type +// +// Return Value: +// A field handle representing the data offset to access the constant. +// +// Note: +// Access to inline data is 'abstracted' by a special type of static member +// (produced by eeFindJitDataOffs) which the emitter recognizes as being a reference +// to constant data, not a real static field. +// +CORINFO_FIELD_HANDLE emitter::emitSimdConst(simd_t* constValue, emitAttr attr) { - unsigned cnsSize = 32; - unsigned cnsAlign = cnsSize; + unsigned cnsSize = EA_SIZE(attr); + unsigned cnsAlign = cnsSize; + var_types dataType = (cnsSize >= 8) ? 
emitComp->getSIMDTypeForSize(cnsSize) : TYP_FLOAT; +#ifdef TARGET_XARCH if (emitComp->compCodeOpt() == Compiler::SMALL_CODE) { cnsAlign = dataSection::MIN_DATA_ALIGN; } +#endif // TARGET_XARCH - UNATIVE_OFFSET cnum = emitDataConst(&constValue, cnsSize, cnsAlign, TYP_SIMD32); + UNATIVE_OFFSET cnum = emitDataConst(constValue, cnsSize, cnsAlign, dataType); return emitComp->eeFindJitDataOffs(cnum); } -CORINFO_FIELD_HANDLE emitter::emitSimd64Const(simd64_t constValue) +//------------------------------------------------------------------------ +// emitSimdConstCompressedLoad: Create a simd data section constant, +// compressing it if possible, and emit an appropriate instruction +// to load or broadcast the constant to a register. +// +// Arguments: +// constValue - constant value +// attr - The EA_SIZE for the constant type +// targetReg - The target register +// +void emitter::emitSimdConstCompressedLoad(simd_t* constValue, emitAttr attr, regNumber targetReg) { - unsigned cnsSize = 64; - unsigned cnsAlign = cnsSize; + assert(EA_SIZE(attr) >= 8 && EA_SIZE(attr) <= 64); - if (emitComp->compCodeOpt() == Compiler::SMALL_CODE) + unsigned cnsSize = EA_SIZE(attr); + unsigned dataSize = cnsSize; + instruction ins = (cnsSize == 8) ? INS_movsd_simd : INS_movups; + + // Most constant vectors tend to have repeated values, so we will first check to see if + // we can replace a full vector load with a smaller broadcast. + + if ((dataSize == 64) && (constValue->v256[1] == constValue->v256[0])) { - cnsAlign = dataSection::MIN_DATA_ALIGN; + assert(emitComp->IsBaselineVector512IsaSupportedDebugOnly()); + dataSize = 32; + ins = INS_vbroadcastf32x8; } - UNATIVE_OFFSET cnum = emitDataConst(&constValue, cnsSize, cnsAlign, TYP_SIMD64); - return emitComp->eeFindJitDataOffs(cnum); -} + if ((dataSize == 32) && (constValue->v128[1] == constValue->v128[0])) + { + assert(emitComp->IsBaselineVector256IsaSupportedDebugOnly()); + dataSize = 16; + ins = INS_vbroadcastf128; + } + if ((dataSize == 16) && (constValue->u64[1] == constValue->u64[0])) + { + if (((cnsSize == 16) && emitComp->compOpportunisticallyDependsOn(InstructionSet_SSE3)) || + emitComp->compOpportunisticallyDependsOn(InstructionSet_AVX)) + { + dataSize = 8; + ins = (cnsSize == 16) ? INS_movddup : INS_vbroadcastsd; + } + } + + // `vbroadcastss` fills the full SIMD register, so we can't do this last step if the + // original constant was smaller than a full reg (e.g. TYP_SIMD8) + + if ((dataSize == 8) && (cnsSize >= 16) && (constValue->u32[1] == constValue->u32[0])) + { + if (emitComp->compOpportunisticallyDependsOn(InstructionSet_AVX)) + { + dataSize = 4; + ins = INS_vbroadcastss; + } + } + + if (dataSize < cnsSize) + { + // We found a broadcast match, so emit the broadcast instruction and return. + // Here we use the original emitAttr for the instruction, because we need to + // produce a register of the original constant's size, filled with the pattern. + + CORINFO_FIELD_HANDLE hnd = emitSimdConst(constValue, EA_ATTR(dataSize)); + emitIns_R_C(ins, attr, targetReg, hnd, 0); + return; + } + + // Otherwise, if the upper lanes and/or elements of the constant are zero, we can use a + // smaller load, because all scalar and vector memory load instructions zero the uppers.
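(The zero-upper narrowing described by the comment above is implemented just below.) As an illustration of the repeated-pattern ladder above, here is a minimal standalone C++ sketch; it works on raw bytes rather than the emitter's simd_t, and it deliberately omits the ISA checks (IsBaselineVector512IsaSupportedDebugOnly, compOpportunisticallyDependsOn) and the cnsSize >= 16 restriction on the final vbroadcastss step, so it is a sketch of the idea, not the emitter's exact behavior.

#include <cstdint>
#include <cstdio>
#include <cstring>

// Returns the smallest leading chunk (in bytes) that, when broadcast, reproduces
// the whole 'size'-byte constant: each step halves the chunk if its two halves match.
static unsigned BroadcastableChunkSize(const uint8_t* val, unsigned size)
{
    unsigned chunk = size;
    while (chunk > 4)
    {
        unsigned half = chunk / 2;
        if (std::memcmp(val, val + half, half) != 0)
        {
            break; // halves differ; no smaller repeating pattern
        }
        chunk = half;
    }
    return chunk;
}

int main()
{
    uint8_t cns[64];
    for (unsigned i = 0; i < 64; i++)
    {
        cns[i] = static_cast<uint8_t>(i % 8); // a repeated 8-byte pattern
    }
    // Prints 8: a 64-byte constant built from a repeated 8-byte pattern would be
    // stored as 8 bytes and loaded with a broadcast (vbroadcastsd in the real code).
    std::printf("%u\n", BroadcastableChunkSize(cns, 64));
    return 0;
}

In the real helper, the chunk size found this way also selects the broadcast instruction (vbroadcastf32x8, vbroadcastf128, movddup or vbroadcastsd, vbroadcastss), and the original emitAttr is kept on the instruction so the destination register is still filled to the constant's full width.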
+ + simd32_t zeroValue = {}; + + if ((dataSize == 64) && (constValue->v256[1] == zeroValue)) + { + dataSize = 32; + } + + if ((dataSize == 32) && (constValue->v128[1] == zeroValue.v128[0])) + { + dataSize = 16; + } + + if ((dataSize == 16) && (constValue->u64[1] == 0)) + { + dataSize = 8; + ins = INS_movsd_simd; + } + + if ((dataSize == 8) && (constValue->u32[1] == 0)) + { + dataSize = 4; + ins = INS_movss; + } + + // Here we set the emitAttr to the size of the actual load. It will zero extend + // up to the native SIMD register size. + + attr = EA_ATTR(dataSize); + + CORINFO_FIELD_HANDLE hnd = emitSimdConst(constValue, attr); + emitIns_R_C(ins, attr, targetReg, hnd, 0); +} #endif // TARGET_XARCH #if defined(FEATURE_MASKED_HW_INTRINSICS) diff --git a/src/coreclr/jit/emit.h b/src/coreclr/jit/emit.h index 80ac961a3a6d58..2ce4ff6aff3b1f 100644 --- a/src/coreclr/jit/emit.h +++ b/src/coreclr/jit/emit.h @@ -2639,10 +2639,9 @@ class emitter CORINFO_FIELD_HANDLE emitSimd8Const(simd8_t constValue); CORINFO_FIELD_HANDLE emitSimd16Const(simd16_t constValue); #if defined(TARGET_XARCH) - CORINFO_FIELD_HANDLE emitSimd32Const(simd32_t constValue); - CORINFO_FIELD_HANDLE emitSimd64Const(simd64_t constValue); + CORINFO_FIELD_HANDLE emitSimdConst(simd_t* constValue, emitAttr attr); + void emitSimdConstCompressedLoad(simd_t* constValue, emitAttr attr, regNumber targetReg); #endif // TARGET_XARCH - #if defined(FEATURE_MASKED_HW_INTRINSICS) CORINFO_FIELD_HANDLE emitSimdMaskConst(simdmask_t constValue); #endif // FEATURE_MASKED_HW_INTRINSICS diff --git a/src/coreclr/jit/emitxarch.cpp b/src/coreclr/jit/emitxarch.cpp index ef2594be83d116..0514ef0578db43 100644 --- a/src/coreclr/jit/emitxarch.cpp +++ b/src/coreclr/jit/emitxarch.cpp @@ -7331,6 +7331,7 @@ bool emitter::IsMovInstruction(instruction ins) case INS_vmovdqu8: case INS_vmovdqu16: case INS_vmovdqu64: + case INS_movq: case INS_movsd_simd: case INS_movss: case INS_movsx: @@ -7350,7 +7351,6 @@ bool emitter::IsMovInstruction(instruction ins) } #if defined(TARGET_AMD64) - case INS_movq: case INS_movsxd: { return true; @@ -7501,7 +7501,6 @@ bool emitter::HasSideEffect(instruction ins, emitAttr size) break; } -#if defined(TARGET_AMD64) case INS_movq: { // Clears the upper bits @@ -7509,6 +7508,7 @@ bool emitter::HasSideEffect(instruction ins, emitAttr size) break; } +#if defined(TARGET_AMD64) case INS_movsxd: { // Sign-extends the source @@ -7781,13 +7781,13 @@ void emitter::emitIns_Mov(instruction ins, emitAttr attr, regNumber dstReg, regN break; } -#if defined(TARGET_AMD64) case INS_movq: { assert(isFloatReg(dstReg) && isFloatReg(srcReg)); break; } +#if defined(TARGET_AMD64) case INS_movsxd: { assert(isGeneralRegister(dstReg) && isGeneralRegister(srcReg)); diff --git a/src/coreclr/jit/gentree.cpp b/src/coreclr/jit/gentree.cpp index 67d22e9ac47164..0471383840626b 100644 --- a/src/coreclr/jit/gentree.cpp +++ b/src/coreclr/jit/gentree.cpp @@ -28277,9 +28277,6 @@ bool GenTreeHWIntrinsic::OperIsMemoryLoad(GenTree** pAddr) const case NI_AVX2_ConvertToVector256Int16: case NI_AVX2_ConvertToVector256Int32: case NI_AVX2_ConvertToVector256Int64: - case NI_AVX2_BroadcastVector128ToVector256: - case NI_AVX512F_BroadcastVector128ToVector512: - case NI_AVX512F_BroadcastVector256ToVector512: if (GetAuxiliaryJitType() == CORINFO_TYPE_PTR) { addr = Op(1); diff --git a/src/coreclr/jit/hwintrinsic.cpp b/src/coreclr/jit/hwintrinsic.cpp index a00d57962d757b..f7eddafc80f6be 100644 --- a/src/coreclr/jit/hwintrinsic.cpp +++ b/src/coreclr/jit/hwintrinsic.cpp @@ -2078,9 +2078,6 @@ 
GenTree* Compiler::impHWIntrinsic(NamedIntrinsic intrinsic, case NI_AVX2_ConvertToVector256Int16: case NI_AVX2_ConvertToVector256Int32: case NI_AVX2_ConvertToVector256Int64: - case NI_AVX2_BroadcastVector128ToVector256: - case NI_AVX512F_BroadcastVector128ToVector512: - case NI_AVX512F_BroadcastVector256ToVector512: { // These intrinsics have both pointer and vector overloads // We want to be able to differentiate between them so lets diff --git a/src/coreclr/jit/hwintrinsiccodegenxarch.cpp b/src/coreclr/jit/hwintrinsiccodegenxarch.cpp index 19792a61c4083e..ea114113412d81 100644 --- a/src/coreclr/jit/hwintrinsiccodegenxarch.cpp +++ b/src/coreclr/jit/hwintrinsiccodegenxarch.cpp @@ -1076,7 +1076,7 @@ void CodeGen::genHWIntrinsic_R_RM( instOptions = AddEmbBroadcastMode(instOptions); } - OperandDesc rmOpDesc = genOperandDesc(rmOp); + OperandDesc rmOpDesc = genOperandDesc(ins, rmOp); if (((instOptions & INS_OPTS_EVEX_b_MASK) != 0) && (rmOpDesc.GetKind() == OperandKind::Reg)) { @@ -1361,7 +1361,7 @@ void CodeGen::genHWIntrinsic_R_R_RM_R(GenTreeHWIntrinsic* node, instruction ins, instOptions = AddEmbBroadcastMode(instOptions); } - OperandDesc op2Desc = genOperandDesc(op2); + OperandDesc op2Desc = genOperandDesc(ins, op2); if (op2Desc.IsContained()) { @@ -1431,7 +1431,7 @@ void CodeGen::genHWIntrinsic_R_R_R_RM(instruction ins, instOptions = AddEmbBroadcastMode(instOptions); } - OperandDesc op3Desc = genOperandDesc(op3); + OperandDesc op3Desc = genOperandDesc(ins, op3); if (((instOptions & INS_OPTS_EVEX_b_MASK) != 0) && (op3Desc.GetKind() == OperandKind::Reg)) { @@ -1547,7 +1547,7 @@ void CodeGen::genHWIntrinsic_R_R_R_RM_I( instOptions = AddEmbBroadcastMode(instOptions); } - OperandDesc op3Desc = genOperandDesc(op3); + OperandDesc op3Desc = genOperandDesc(ins, op3); switch (op3Desc.GetKind()) { @@ -1898,11 +1898,15 @@ void CodeGen::genBaseIntrinsic(GenTreeHWIntrinsic* node, insOpts instOptions) op1 = loPart; } - ins = INS_movq; baseAttr = EA_8BYTE; } #endif // TARGET_X86 + if (op1->isUsedFromMemory() && (baseAttr == EA_8BYTE)) + { + ins = INS_movq; + } + genHWIntrinsic_R_RM(node, ins, baseAttr, targetReg, op1, instOptions); } else @@ -1952,7 +1956,7 @@ void CodeGen::genBaseIntrinsic(GenTreeHWIntrinsic* node, insOpts instOptions) else { // `movq xmm xmm` zeroes the upper 64 bits. - genHWIntrinsic_R_RM(node, INS_movq, attr, targetReg, op1, instOptions); + emit->emitIns_Mov(INS_movq, attr, targetReg, op1Reg, /* canSkip */ false); } break; } @@ -2281,10 +2285,8 @@ void CodeGen::genBaseIntrinsic(GenTreeHWIntrinsic* node, insOpts instOptions) { minValueInt.i32[i] = INT_MIN; } - CORINFO_FIELD_HANDLE minValueFld = typeSize == EA_16BYTE ? emit->emitSimd16Const(minValueInt.v128[0]) - : emit->emitSimd32Const(minValueInt.v256[0]); - CORINFO_FIELD_HANDLE negOneFld = typeSize == EA_16BYTE ? 
emit->emitSimd16Const(negOneIntVec.v128[0]) - : emit->emitSimd32Const(negOneIntVec.v256[0]); + CORINFO_FIELD_HANDLE minValueFld = emit->emitSimdConst(&minValueInt, typeSize); + CORINFO_FIELD_HANDLE negOneFld = emit->emitSimdConst(&negOneIntVec, typeSize); // div-by-zero check emit->emitIns_SIMD_R_R_R(INS_xorpd, typeSize, tmpReg1, tmpReg1, tmpReg1, instOptions); diff --git a/src/coreclr/jit/hwintrinsiclistxarch.h b/src/coreclr/jit/hwintrinsiclistxarch.h index a49d4b4bdc66bf..1750fed537f1b1 100644 --- a/src/coreclr/jit/hwintrinsiclistxarch.h +++ b/src/coreclr/jit/hwintrinsiclistxarch.h @@ -843,7 +843,7 @@ HARDWARE_INTRINSIC(AVX2, Blend, HARDWARE_INTRINSIC(AVX2, BlendVariable, 32, 3, {INS_vpblendvb, INS_vpblendvb, INS_vpblendvb, INS_vpblendvb, INS_vpblendvb, INS_vpblendvb, INS_vpblendvb, INS_vpblendvb, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoEvexSemantics) HARDWARE_INTRINSIC(AVX2, BroadcastScalarToVector128, 16, 1, {INS_vpbroadcastb, INS_vpbroadcastb, INS_vpbroadcastw, INS_vpbroadcastw, INS_vpbroadcastd, INS_vpbroadcastd, INS_vpbroadcastq, INS_vpbroadcastq, INS_vbroadcastss, INS_movddup}, HW_Category_SIMDScalar, HW_Flag_MaybeMemoryLoad) HARDWARE_INTRINSIC(AVX2, BroadcastScalarToVector256, 32, 1, {INS_vpbroadcastb, INS_vpbroadcastb, INS_vpbroadcastw, INS_vpbroadcastw, INS_vpbroadcastd, INS_vpbroadcastd, INS_vpbroadcastq, INS_vpbroadcastq, INS_vbroadcastss, INS_vbroadcastsd}, HW_Category_SIMDScalar, HW_Flag_MaybeMemoryLoad) -HARDWARE_INTRINSIC(AVX2, BroadcastVector128ToVector256, 32, 1, {INS_vbroadcasti128, INS_vbroadcasti128, INS_vbroadcasti128, INS_vbroadcasti128, INS_vbroadcasti128, INS_vbroadcasti128, INS_vbroadcasti128, INS_vbroadcasti128, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_MaybeMemoryLoad) +HARDWARE_INTRINSIC(AVX2, BroadcastVector128ToVector256, 32, 1, {INS_vbroadcasti128, INS_vbroadcasti128, INS_vbroadcasti128, INS_vbroadcasti128, INS_vbroadcasti128, INS_vbroadcasti128, INS_vbroadcasti128, INS_vbroadcasti128, INS_invalid, INS_invalid}, HW_Category_MemoryLoad, HW_Flag_NoFlag) HARDWARE_INTRINSIC(AVX2, CompareEqual, 32, 2, {INS_pcmpeqb, INS_pcmpeqb, INS_pcmpeqw, INS_pcmpeqw, INS_pcmpeqd, INS_pcmpeqd, INS_pcmpeqq, INS_pcmpeqq, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_Commutative|HW_Flag_ReturnsPerElementMask|HW_Flag_NoEvexSemantics) HARDWARE_INTRINSIC(AVX2, CompareGreaterThan, 32, 2, {INS_pcmpgtb, INS_invalid, INS_pcmpgtw, INS_invalid, INS_pcmpgtd, INS_invalid, INS_pcmpgtq, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_ReturnsPerElementMask|HW_Flag_NoEvexSemantics) HARDWARE_INTRINSIC(AVX2, CompareLessThan, 32, 2, {INS_pcmpgtb, INS_invalid, INS_pcmpgtw, INS_invalid, INS_pcmpgtd, INS_invalid, INS_pcmpgtq, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_ReturnsPerElementMask|HW_Flag_NoEvexSemantics) @@ -915,8 +915,8 @@ HARDWARE_INTRINSIC(AVX512F, And, HARDWARE_INTRINSIC(AVX512F, AndNot, 64, 2, {INS_pandn, INS_pandn, INS_pandn, INS_pandn, INS_pandn, INS_pandn, INS_vpandnq, INS_vpandnq, INS_andnps, INS_andnpd}, HW_Category_SimpleSIMD, HW_Flag_SpecialImport|HW_Flag_EmbBroadcastCompatible|HW_Flag_EmbMaskingCompatible|HW_Flag_NormalizeSmallTypeToInt) HARDWARE_INTRINSIC(AVX512F, BlendVariable, 64, 3, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_InvalidNodeId) HARDWARE_INTRINSIC(AVX512F, BroadcastScalarToVector512, 64, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, 
INS_vpbroadcastd, INS_vpbroadcastd, INS_vpbroadcastq, INS_vpbroadcastq, INS_vbroadcastss, INS_vbroadcastsd}, HW_Category_SIMDScalar, HW_Flag_NoFlag) -HARDWARE_INTRINSIC(AVX512F, BroadcastVector128ToVector512, 64, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vbroadcasti128, INS_vbroadcasti128, INS_invalid, INS_invalid, INS_vbroadcastf128, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_MaybeMemoryLoad) -HARDWARE_INTRINSIC(AVX512F, BroadcastVector256ToVector512, 64, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vbroadcasti64x4, INS_vbroadcasti64x4, INS_invalid, INS_vbroadcastf64x4}, HW_Category_SimpleSIMD, HW_Flag_MaybeMemoryLoad) +HARDWARE_INTRINSIC(AVX512F, BroadcastVector128ToVector512, 64, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vbroadcasti128, INS_vbroadcasti128, INS_invalid, INS_invalid, INS_vbroadcastf128, INS_invalid}, HW_Category_MemoryLoad, HW_Flag_NoFlag) +HARDWARE_INTRINSIC(AVX512F, BroadcastVector256ToVector512, 64, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vbroadcasti64x4, INS_vbroadcasti64x4, INS_invalid, INS_vbroadcastf64x4}, HW_Category_MemoryLoad, HW_Flag_NoFlag) HARDWARE_INTRINSIC(AVX512F, Compare, 64, 3, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_IMM, HW_Flag_InvalidNodeId) HARDWARE_INTRINSIC(AVX512F, CompareEqual, 64, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_InvalidNodeId) HARDWARE_INTRINSIC(AVX512F, CompareGreaterThan, 64, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_InvalidNodeId) diff --git a/src/coreclr/jit/instr.cpp b/src/coreclr/jit/instr.cpp index a0dbcaab36f34a..8610effb416cd6 100644 --- a/src/coreclr/jit/instr.cpp +++ b/src/coreclr/jit/instr.cpp @@ -809,6 +809,7 @@ void CodeGen::inst_RV_SH( // logic for determining what "kind" of operand "op" is. // // Arguments: +// ins - The instruction that will consume the operand. // op - The operand node for which to obtain the descriptor. // // Return Value: @@ -818,7 +819,7 @@ void CodeGen::inst_RV_SH( // This method is not idempotent - it can only be called once for a // given node. // -CodeGen::OperandDesc CodeGen::genOperandDesc(GenTree* op) +CodeGen::OperandDesc CodeGen::genOperandDesc(instruction ins, GenTree* op) { if (!op->isContained() && !op->isUsedFromSpillTemp()) { @@ -915,7 +916,7 @@ CodeGen::OperandDesc CodeGen::genOperandDesc(GenTree* op) { // If the operand of broadcast is not a constant integer, // we handle all the other cases recursively. 
- return genOperandDesc(hwintrinsicChild); + return genOperandDesc(ins, hwintrinsicChild); } break; } @@ -935,7 +936,7 @@ CodeGen::OperandDesc CodeGen::genOperandDesc(GenTree* op) assert(hwintrinsic->isContained()); op = hwintrinsic->Op(1); - return genOperandDesc(op); + return genOperandDesc(ins, op); } default: @@ -989,59 +990,26 @@ CodeGen::OperandDesc CodeGen::genOperandDesc(GenTree* op) #if defined(FEATURE_SIMD) case GT_CNS_VEC: { - switch (op->TypeGet()) - { - case TYP_SIMD8: - { - simd8_t constValue; - memcpy(&constValue, &op->AsVecCon()->gtSimdVal, sizeof(simd8_t)); - return OperandDesc(emit->emitSimd8Const(constValue)); - } + insTupleType tupleType = emit->insTupleTypeInfo(ins); + unsigned cnsSize = genTypeSize(op); - case TYP_SIMD12: - { - simd16_t constValue = {}; - memcpy(&constValue, &op->AsVecCon()->gtSimdVal, sizeof(simd12_t)); - return OperandDesc(emit->emitSimd16Const(constValue)); - } - case TYP_SIMD16: - { - simd16_t constValue; - memcpy(&constValue, &op->AsVecCon()->gtSimdVal, sizeof(simd16_t)); - return OperandDesc(emit->emitSimd16Const(constValue)); - } - -#if defined(TARGET_XARCH) - case TYP_SIMD32: - { - simd32_t constValue; - memcpy(&constValue, &op->AsVecCon()->gtSimdVal, sizeof(simd32_t)); - return OperandDesc(emit->emitSimd32Const(constValue)); - } - - case TYP_SIMD64: - { - simd64_t constValue; - memcpy(&constValue, &op->AsVecCon()->gtSimdVal, sizeof(simd64_t)); - return OperandDesc(emit->emitSimd64Const(constValue)); - } - -#endif // TARGET_XARCH + if ((tupleType == INS_TT_TUPLE1_SCALAR) || (tupleType == INS_TT_TUPLE1_FIXED)) + { + // We have a vector const, but the instruction will only read a scalar from it, + // so don't waste space putting the entire vector to the data section. - default: - { - unreached(); - } + cnsSize = max(CodeGenInterface::instInputSize(ins), 4U); + assert(cnsSize <= genTypeSize(op)); } + + return OperandDesc(emit->emitSimdConst(&op->AsVecCon()->gtSimdVal, EA_TYPE(cnsSize))); } #endif // FEATURE_SIMD #if defined(FEATURE_MASKED_HW_INTRINSICS) case GT_CNS_MSK: { - simdmask_t constValue; - memcpy(&constValue, &op->AsMskCon()->gtSimdMaskVal, sizeof(simdmask_t)); - return OperandDesc(emit->emitSimdMaskConst(constValue)); + return OperandDesc(emit->emitSimdMaskConst(op->AsMskCon()->gtSimdMaskVal)); } #endif // FEATURE_MASKED_HW_INTRINSICS @@ -1071,7 +1039,7 @@ CodeGen::OperandDesc CodeGen::genOperandDesc(GenTree* op) void CodeGen::inst_TT(instruction ins, emitAttr size, GenTree* op1) { emitter* emit = GetEmitter(); - OperandDesc op1Desc = genOperandDesc(op1); + OperandDesc op1Desc = genOperandDesc(ins, op1); switch (op1Desc.GetKind()) { @@ -1120,7 +1088,7 @@ void CodeGen::inst_TT(instruction ins, emitAttr size, GenTree* op1) void CodeGen::inst_RV_TT(instruction ins, emitAttr size, regNumber op1Reg, GenTree* op2) { emitter* emit = GetEmitter(); - OperandDesc op2Desc = genOperandDesc(op2); + OperandDesc op2Desc = genOperandDesc(ins, op2); switch (op2Desc.GetKind()) { @@ -1202,7 +1170,7 @@ void CodeGen::inst_RV_TT_IV( } #endif // TARGET_XARCH && FEATURE_HW_INTRINSICS - OperandDesc rmOpDesc = genOperandDesc(rmOp); + OperandDesc rmOpDesc = genOperandDesc(ins, rmOp); switch (rmOpDesc.GetKind()) { @@ -1339,7 +1307,7 @@ void CodeGen::inst_RV_RV_TT(instruction ins, } #endif // TARGET_XARCH && FEATURE_HW_INTRINSICS - OperandDesc op2Desc = genOperandDesc(op2); + OperandDesc op2Desc = genOperandDesc(ins, op2); switch (op2Desc.GetKind()) { @@ -1426,7 +1394,7 @@ void CodeGen::inst_RV_RV_TT_IV(instruction ins, } #endif // TARGET_XARCH && 
FEATURE_HW_INTRINSICS - OperandDesc op2Desc = genOperandDesc(op2); + OperandDesc op2Desc = genOperandDesc(ins, op2); switch (op2Desc.GetKind()) { diff --git a/src/coreclr/jit/lower.cpp b/src/coreclr/jit/lower.cpp index ace86eb26177d3..1dc31d9fd57eee 100644 --- a/src/coreclr/jit/lower.cpp +++ b/src/coreclr/jit/lower.cpp @@ -9610,7 +9610,7 @@ bool Lowering::GetLoadStoreCoalescingData(GenTreeIndir* ind, LoadStoreCoalescing // void Lowering::LowerStoreIndirCoalescing(GenTreeIndir* ind) { -// LA, RISC-V and ARM32 more likely to recieve a terrible performance hit from +// LA, RISC-V and ARM32 more likely to receive a terrible performance hit from // unaligned accesses making this optimization questionable. #if defined(TARGET_XARCH) || defined(TARGET_ARM64) if (!comp->opts.OptimizationEnabled()) @@ -10078,7 +10078,7 @@ GenTree* Lowering::LowerIndir(GenTreeIndir* ind) #endif // TODO-Cleanup: We're passing isContainable = true but ContainCheckIndir rejects - // address containment in some cases so we end up creating trivial (reg + offfset) + // address containment in some cases so we end up creating trivial (reg + offset) // or (reg + reg) LEAs that are not necessary. #if defined(TARGET_ARM64) diff --git a/src/coreclr/jit/lower.h b/src/coreclr/jit/lower.h index a9db26b3ee4312..e94b4aba59a494 100644 --- a/src/coreclr/jit/lower.h +++ b/src/coreclr/jit/lower.h @@ -136,7 +136,6 @@ class Lowering final : public Phase void ContainCheckHWIntrinsic(GenTreeHWIntrinsic* node); #ifdef TARGET_XARCH void TryFoldCnsVecForEmbeddedBroadcast(GenTreeHWIntrinsic* parentNode, GenTreeVecCon* childNode); - void TryCompressConstVecData(GenTreeStoreInd* node); #endif // TARGET_XARCH #endif // FEATURE_HW_INTRINSICS diff --git a/src/coreclr/jit/lowerxarch.cpp b/src/coreclr/jit/lowerxarch.cpp index 41f05cafb6d481..2d2de92c801e2b 100644 --- a/src/coreclr/jit/lowerxarch.cpp +++ b/src/coreclr/jit/lowerxarch.cpp @@ -107,30 +107,6 @@ GenTree* Lowering::LowerStoreIndir(GenTreeStoreInd* node) } ContainCheckStoreIndir(node); -#if defined(FEATURE_HW_INTRINSICS) - if (comp->IsBaselineVector512IsaSupportedOpportunistically() || - comp->compOpportunisticallyDependsOn(InstructionSet_AVX2)) - { - if (!node->Data()->IsCnsVec()) - { - return node->gtNext; - } - - if (!node->Data()->AsVecCon()->TypeIs(TYP_SIMD32, TYP_SIMD64)) - { - return node->gtNext; - } - - if (node->Data()->IsVectorAllBitsSet() || node->Data()->IsVectorZero()) - { - // To avoid some unexpected regression, this optimization only applies to non-all 1/0 constant vectors. - return node->gtNext; - } - - TryCompressConstVecData(node); - } -#endif - return node->gtNext; } @@ -4145,7 +4121,7 @@ GenTree* Lowering::LowerHWIntrinsicCreate(GenTreeHWIntrinsic* node) BlockRange().Remove(node); - return LowerNode(vecCon); + return vecCon->gtNext; } else if (argCnt == 1) { @@ -8921,9 +8897,6 @@ bool Lowering::IsContainableHWIntrinsicOp(GenTreeHWIntrinsic* parentNode, GenTre case NI_AVX2_ConvertToVector256Int16: case NI_AVX2_ConvertToVector256Int32: case NI_AVX2_ConvertToVector256Int64: - case NI_AVX2_BroadcastVector128ToVector256: - case NI_AVX512F_BroadcastVector128ToVector512: - case NI_AVX512F_BroadcastVector256ToVector512: { // These can have either pointer or vector operands. For the pointer case, we can't check // size, so just assume it matches. Otherwise, do normal size check based on tuple type. 
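The tuple-type check mentioned in this comment is the same notion the updated genOperandDesc (earlier in this diff) uses to shrink contained vector constants: when insTupleTypeInfo reports a TUPLE1_SCALAR or TUPLE1_FIXED operand, the instruction reads only instInputSize(ins) bytes, so only that much of the constant needs to reach the data section. A rough sketch of that sizing rule, using illustrative names (TupleKind, instInputBytes) rather than the JIT's enums:

#include <algorithm>

enum class TupleKind { Full, Tuple1Scalar, Tuple1Fixed };

// Bytes of a contained vector constant that actually need to be emitted to the
// data section for an instruction that consumes it from memory.
static unsigned ConstPoolBytes(TupleKind tuple, unsigned instInputBytes, unsigned vectorBytes)
{
    if ((tuple == TupleKind::Tuple1Scalar) || (tuple == TupleKind::Tuple1Fixed))
    {
        // Mirror of max(instInputSize(ins), 4U): never emit less than 4 bytes,
        // and never more than the instruction actually reads.
        return std::max(instInputBytes, 4u);
    }
    return vectorBytes; // a full-width read keeps the whole constant
}

The effect is that a scalar-consuming instruction with a contained 16- or 64-byte constant operand pays only for the element it actually reads, while full-width reads still emit the whole vector.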
@@ -9484,83 +9457,6 @@ void Lowering::TryFoldCnsVecForEmbeddedBroadcast(GenTreeHWIntrinsic* parentNode, MakeSrcContained(parentNode, childNode); } -//---------------------------------------------------------------------------------------------- -// TryCompressConstVecData: -// Try to compress the constant vector input if it has duplicated parts and can be optimized by -// broadcast -// -// Arguments: -// node - the storeind node. -// -// Return: -// return true if compress success. -void Lowering::TryCompressConstVecData(GenTreeStoreInd* node) -{ - assert(node->Data()->IsCnsVec()); - assert(node->Data()->AsVecCon()->TypeIs(TYP_SIMD32, TYP_SIMD64)); - - GenTreeVecCon* vecCon = node->Data()->AsVecCon(); - GenTreeHWIntrinsic* broadcast = nullptr; - - if (vecCon->TypeIs(TYP_SIMD32)) - { - assert(comp->compOpportunisticallyDependsOn(InstructionSet_AVX2)); - if (vecCon->gtSimd32Val.v128[0] == vecCon->gtSimdVal.v128[1]) - { - simd16_t simd16Val = {}; - simd16Val.f64[0] = vecCon->gtSimd32Val.f64[0]; - simd16Val.f64[1] = vecCon->gtSimd32Val.f64[1]; - GenTreeVecCon* compressedVecCon = comp->gtNewVconNode(TYP_SIMD16); - memcpy(&compressedVecCon->gtSimdVal, &simd16Val, sizeof(simd16_t)); - BlockRange().InsertBefore(node->Data(), compressedVecCon); - BlockRange().Remove(vecCon); - broadcast = comp->gtNewSimdHWIntrinsicNode(TYP_SIMD32, compressedVecCon, - NI_AVX2_BroadcastVector128ToVector256, CORINFO_TYPE_UINT, 32); - } - } - else - { - assert(vecCon->TypeIs(TYP_SIMD64)); - assert(comp->IsBaselineVector512IsaSupportedOpportunistically()); - if (vecCon->gtSimd64Val.v128[0] == vecCon->gtSimd64Val.v128[1] && - vecCon->gtSimd64Val.v128[0] == vecCon->gtSimd64Val.v128[2] && - vecCon->gtSimd64Val.v128[0] == vecCon->gtSimd64Val.v128[3]) - { - simd16_t simd16Val = {}; - simd16Val.f64[0] = vecCon->gtSimd64Val.f64[0]; - simd16Val.f64[1] = vecCon->gtSimd64Val.f64[1]; - GenTreeVecCon* compressedVecCon = comp->gtNewVconNode(TYP_SIMD16); - memcpy(&compressedVecCon->gtSimdVal, &simd16Val, sizeof(simd16_t)); - BlockRange().InsertBefore(node->Data(), compressedVecCon); - BlockRange().Remove(vecCon); - broadcast = comp->gtNewSimdHWIntrinsicNode(TYP_SIMD64, compressedVecCon, - NI_AVX512F_BroadcastVector128ToVector512, CORINFO_TYPE_UINT, 64); - } - else if (vecCon->gtSimd64Val.v256[0] == vecCon->gtSimd64Val.v256[1]) - { - simd32_t simd32Val = {}; - simd32Val.v128[0] = vecCon->gtSimd32Val.v128[0]; - simd32Val.v128[1] = vecCon->gtSimd32Val.v128[1]; - GenTreeVecCon* compressedVecCon = comp->gtNewVconNode(TYP_SIMD32); - memcpy(&compressedVecCon->gtSimdVal, &simd32Val, sizeof(simd32_t)); - BlockRange().InsertBefore(node->Data(), compressedVecCon); - BlockRange().Remove(vecCon); - broadcast = - comp->gtNewSimdHWIntrinsicNode(TYP_SIMD64, compressedVecCon, NI_AVX512F_BroadcastVector256ToVector512, - CORINFO_TYPE_ULONG, 64); - } - } - - if (broadcast == nullptr) - { - return; - } - - BlockRange().InsertBefore(node, broadcast); - node->Data() = broadcast; - LowerNode(broadcast); -} - //------------------------------------------------------------------------ // TryMakeSrcContainedOrRegOptional: Tries to make "childNode" a contained or regOptional node // @@ -9814,20 +9710,6 @@ void Lowering::ContainCheckHWIntrinsic(GenTreeHWIntrinsic* node) break; } - case NI_AVX2_BroadcastVector128ToVector256: - case NI_AVX512F_BroadcastVector128ToVector512: - case NI_AVX512F_BroadcastVector256ToVector512: - { - if (node->OperIsMemoryLoad()) - { - ContainCheckHWIntrinsicAddr(node, op1, /* conservative maximum */ 32); - return; - } - - 
assert(op1->IsCnsVec()); - break; - } - case NI_AVX512F_ConvertToVector256Int32: case NI_AVX512F_ConvertToVector256UInt32: case NI_AVX512F_VL_ConvertToVector128UInt32: diff --git a/src/coreclr/jit/simd.h b/src/coreclr/jit/simd.h index 2f7610b7e6147c..b67b72e83fc88d 100644 --- a/src/coreclr/jit/simd.h +++ b/src/coreclr/jit/simd.h @@ -1412,7 +1412,7 @@ void EvaluateWithElementFloating(var_types simdBaseType, TSimd* result, const TS case TYP_DOUBLE: { - result->f64[arg1] = static_cast<double>(arg2); + result->f64[arg1] = arg2; break; } diff --git a/src/coreclr/jit/valuenum.cpp b/src/coreclr/jit/valuenum.cpp index 30a66726279ca2..096a9c3a5d6805 100644 --- a/src/coreclr/jit/valuenum.cpp +++ b/src/coreclr/jit/valuenum.cpp @@ -8040,12 +8040,6 @@ ValueNum ValueNumStore::EvalHWIntrinsicFunUnary(GenTreeHWIntrinsic* tree, return VNForLongCon(static_cast<int64_t>(result)); } - case NI_Vector128_AsVector2: - { - simd8_t result = GetConstantSimd16(arg0VN).v64[0]; - return VNForSimd8Con(result); - } - case NI_Vector128_ToVector256: case NI_Vector128_ToVector256Unsafe: { @@ -8100,6 +8094,12 @@ ValueNum ValueNumStore::EvalHWIntrinsicFunUnary(GenTreeHWIntrinsic* tree, } #endif // TARGET_XARCH + case NI_Vector128_AsVector2: + { + simd8_t result = GetConstantSimd16(arg0VN).v64[0]; + return VNForSimd8Con(result); + } + case NI_Vector128_AsVector3: { simd12_t result = {};
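Circling back to the emitter change at the top of this diff: when no repeating pattern is found, emitSimdConstCompressedLoad instead narrows the load while the upper half of the remaining data is all zero, relying on the fact that scalar and vector memory loads zero the upper lanes. A standalone sketch of that narrowing, again with plain bytes standing in for the real simd_t:

#include <cstdint>

// True if all 'len' bytes at 'p' are zero.
static bool AllZero(const uint8_t* p, unsigned len)
{
    for (unsigned i = 0; i < len; i++)
    {
        if (p[i] != 0)
        {
            return false;
        }
    }
    return true;
}

// Smallest load size (down to 4 bytes) whose zero-extension reproduces the
// full 'size'-byte constant; each step drops an all-zero upper half.
// e.g. a 32-byte constant with only its low 8 bytes nonzero narrows to 8.
static unsigned NarrowedLoadSize(const uint8_t* val, unsigned size)
{
    unsigned loadSize = size;
    while ((loadSize > 4) && AllZero(val + loadSize / 2, loadSize / 2))
    {
        loadSize /= 2;
    }
    return loadSize;
}

The real code then picks the matching instruction for the narrowed size (movups, movsd_simd, or movss) and emits it with the reduced emitAttr, so the data-section entry shrinks together with the load.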